Commit f49e6275 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added GatkPipeline, finaly replacement for "Gatk" pipeline

parent ae952d4c
/target/
\ No newline at end of file
GATK variantcalling pipeline
=======================
License
===
A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL license; For commercial users or users who do not want to follow the AGPL license, please contact sasc@lumc.nl to purchase a separate license.
{
"referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"reference" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"samtools": { "exe": "test"},
"fastqc": { "exe": "/home/pjvan_thof/Downloads/FastQC/fastqc" },
"star" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
"bwa" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
"flexiprep": {
"fastqc": {"exe":"/home/pjvan_thof/pipelines/test/test"},
"sickle": {"exe":"/home/pjvan_thof/pipelines/test/test"}
},
"mapping": {
},
"gatk": {
"mapping": {
"flexiprep": {
}
},
"referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"dbsnp": "/home/pjvan_thof/pipelines/test/test",
"hapmap": "/home/pjvan_thof/pipelines/test/test",
"omni": "/home/pjvan_thof/pipelines/test/test",
"1000G": "/home/pjvan_thof/pipelines/test/test",
"mills": "/home/pjvan_thof/pipelines/test/test",
"haplotypecaller": {
"stand_call_conf": 20,
"stand_emit_conf": 20
}
},
"cutadapt": {"exe":"/home/pjvan_thof/pipelines/test/test"},
"samples": {
"test": {
"ID": "test",
"runs": {
"1" : {
"ID": "1",
"R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
"R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
},
"2" : {
"ID": "1",
"R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
"R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
}
}
}
}
}
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkPipeline</artifactId>
<version>0.1.3</version>
<packaging>jar</packaging>
<name>GatkPipeline</name>
<url>http://maven.apache.org</url>
<parent>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet</artifactId>
<version>0.1.3</version>
<relativePath>../</relativePath>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<sting.unpack.phase>prepare-package</sting.unpack.phase>
<sting.shade.phase>package</sting.shade.phase>
<app.main.class>nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline</app.main.class>
</properties>
<dependencies>
<!-- <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>2.11.0</version>
</dependency>
<!-- <dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.9.2</artifactId>
<version>2.0.M4</version>
<scope>test</scope>
</dependency>-->
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet-Framework</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Bam-Metrics</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Mapping</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkVariantcalling</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkGenotyping</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>queue-package</artifactId>
<version>3.2-2</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>scripts</directory>
<includes>
<include>**/*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<!-- <plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.7.2</version>
<executions>
<execution>
<id>default-test</id>
Disable the default-test by putting it in phase none
<phase>none</phase>
</execution>
</executions>
</plugin>-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<finalName>${project.artifactId}-${project.version}</finalName>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class>${app.main.class}</Main-Class>
<X-Compile-Source-JDK>${maven.compile.source}</X-Compile-Source-JDK>
<X-Compile-Target-JDK>${maven.compile.target}</X-Compile-Target-JDK>
</manifestEntries>
</transformer>
</transformers>
<artifactSet>
<excludes>
<exclude>org.broadinstitute.gatk:queue-package</exclude>
<exclude>junit:junit</exclude>
<exclude>org.scala-lang:scala-compiler</exclude>
<exclude>org.scala-lang:scala-library</exclude>
<exclude>org.scala-lang:scala-reflect</exclude>
<exclude>org.scala-lang.modules:*</exclude>
</excludes>
</artifactSet>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.function._
import nl.lumc.sasc.biopet.function.aligners._
import java.io.File
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.config._
import nl.lumc.sasc.biopet.pipelines.mapping._
import nl.lumc.sasc.biopet.function.picard.MarkDuplicates
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.flexiprep._
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.picard._
import org.broadinstitute.gatk.queue.function._
import org.broadinstitute.gatk.utils.commandline.{Argument}
import scala.util.parsing.json._
class GatkPipeline(val root:Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
@Argument(doc="Only Sample",shortName="sample", required=false)
val onlySample: String = ""
@Argument(doc="Skip Genotyping step",shortName="skipgenotyping", required=false)
var skipGenotyping: Boolean = false
@Argument(doc="Merge gvcfs",shortName="mergegvcfs", required=false)
var mergeGvcfs: Boolean = false
var referenceFile: File = _
var dbsnp: File = _
var gvcfFiles: List[File] = Nil
var finalBamFiles: List[File] = Nil
def init() {
for (file <- configfiles) globalConfig.loadConfigFile(file)
referenceFile = config("referenceFile")
if (configContains("dbsnp")) dbsnp = config("dbsnp")
for (file <- config("gvcfFiles", Nil).getList) gvcfFiles :+= file.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
if (onlySample.isEmpty) {
runSamplesJobs
//SampleWide jobs
if (mergeGvcfs && gvcfFiles.size > 0) {
val newFile = outputDir + "merged.gvcf.vcf"
addCombineGVCFs(gvcfFiles, newFile)
gvcfFiles = List(newFile)
}
if (!skipGenotyping && gvcfFiles.size > 0) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = gvcfFiles
gatkGenotyping.outputDir = outputDir + "genotyping/"
gatkGenotyping.init
gatkGenotyping.biopetScript
addAll(gatkGenotyping.functions)
var vcfFile = gatkGenotyping.outputFile
if (config("inputtype", "dna").getString != "rna") {
vcfFile = addVariantAnnotator(vcfFile, finalBamFiles, outputDir + "recalibration/")
vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir + "recalibration/")
vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir + "recalibration/")
}
} else logger.warn("No gVCFs to genotype")
} else runSingleSampleJobs(onlySample)
}
// Called for each sample
def runSingleSampleJobs(sampleConfig:Map[String,Any]) : Map[String,List[File]] = {
var outputFiles:Map[String,List[File]] = Map()
var runBamfiles: List[File] = List()
var sampleID: String = sampleConfig("ID").toString
for ((run, runFiles) <- runRunsJobs(sampleConfig)) {
runBamfiles +:= runFiles("FinalBam")
}
outputFiles += ("FinalBams" -> runBamfiles)
// addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) // Metrics pipeline
if (runBamfiles.size > 0) {
finalBamFiles ++= runBamfiles
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = runBamfiles
gatkVariantcalling.outputDir = outputDir + sampleID + "/variantcalling/"
gatkVariantcalling.init
gatkVariantcalling.biopetScript
addAll(gatkVariantcalling.functions)
gvcfFiles :+= gatkVariantcalling.outputFile
} else logger.warn("No bamfiles for variant calling for sample: " + sampleID)
return outputFiles
}
// Called for each run from a sample
def runSingleRunJobs(runConfig:Map[String,Any], sampleConfig:Map[String,Any]) : Map[String,File] = {
var outputFiles:Map[String,File] = Map()
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runDir: String = outputDir + sampleID + "/run_" + runID + "/"
var inputType = ""
if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
else inputType = config("inputtype", "dna").toString
if (runConfig.contains("R1")) {
val mapping = Mapping.loadFromRunConfig(this, runConfig, sampleConfig, runDir)
addAll(mapping.functions) // Add functions of mapping to curent function pool
outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile"))
} else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
return outputFiles
}
def addSnpVariantRecalibrator(inputVcf:File, dir:String): File = {
val snpVariantRecalibrator = getVariantRecalibrator("snp")
snpVariantRecalibrator.input +:= inputVcf
snpVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".snp.recal")
snpVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".snp.tranches")
if (!snpVariantRecalibrator.resource.isEmpty) {
add(snpVariantRecalibrator)
val snpApplyRecalibration = getApplyRecalibration("snp")
snpApplyRecalibration.input +:= inputVcf
snpApplyRecalibration.recal_file = snpVariantRecalibrator.recal_file
snpApplyRecalibration.tranches_file = snpVariantRecalibrator.tranches_file
snpApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".snp.recal.vcf")
add(snpApplyRecalibration)
return snpApplyRecalibration.out
} else {
logger.warn("Skipped snp Recalibration, resource is missing")
return inputVcf
}
}
def addIndelVariantRecalibrator(inputVcf:File, dir:String): File = {
val indelVariantRecalibrator = getVariantRecalibrator("indel")
indelVariantRecalibrator.input +:= inputVcf
indelVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".indel.recal")
indelVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".indel.tranches")
if (!indelVariantRecalibrator.resource.isEmpty) {
add(indelVariantRecalibrator)
val indelApplyRecalibration = getApplyRecalibration("indel")
indelApplyRecalibration.input +:= inputVcf
indelApplyRecalibration.recal_file = indelVariantRecalibrator.recal_file
indelApplyRecalibration.tranches_file = indelVariantRecalibrator.tranches_file
indelApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".indel.recal.vcf")
add(indelApplyRecalibration)
return indelApplyRecalibration.out
} else {
logger.warn("Skipped indel Recalibration, resource is missing")
return inputVcf
}
}
def getVariantRecalibrator(mode_arg:String) : VariantRecalibrator = {
val variantRecalibrator = new VariantRecalibrator() with gatkArguments {
if (mode_arg == "indel") {
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
if (configContains("mills", "variantrecalibrator")) this.resource :+= new TaggedFile(config("mills", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
} else { // SNP
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
if (configContains("hapmap", "variantrecalibrator")) this.resource +:= new TaggedFile(config("hapmap", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=15.0")
if (configContains("omni", "variantrecalibrator")) this.resource +:= new TaggedFile(config("omni", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
if (configContains("1000G", "variantrecalibrator")) this.resource +:= new TaggedFile(config("1000G", "", "variantrecalibrator").getString, "known=false,training=true,truth=false,prior=10.0")
}
if (configContains("dbsnp", "variantrecalibrator")) this.resource :+= new TaggedFile(config("dbsnp", "", "variantrecalibrator").getString, "known=true,training=false,truth=false,prior=2.0")
this.nt = 4
this.memoryLimit = nt * 2
this.an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum")
if (configContains("minnumbadvariants", "variantrecalibrator")) this.minNumBadVariants = config("minnumbadvariants", "", "variantrecalibrator")
if (configContains("maxgaussians", "variantrecalibrator")) this.maxGaussians = config("maxgaussians", "", "variantrecalibrator")
}
return variantRecalibrator
}
def getApplyRecalibration(mode_arg:String) : ApplyRecalibration = {
val applyRecalibration = new ApplyRecalibration() with gatkArguments {
if (mode_arg == "indel") {
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
this.ts_filter_level = config("ts_filter_level", 99.0, "applyrecalibration")
} else { // SNP
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
this.ts_filter_level = config("ts_filter_level", 99.5, "applyrecalibration")
}
this.nt = 3
this.memoryLimit = nt * 2
if (configContains("scattercount", "applyrecalibration")) this.scatterCount = config("scattercount", 1, "applyrecalibration")
}
return applyRecalibration
}
def addVariantAnnotator(inputvcf:File, bamfiles:List[File], dir:String): File = {
val variantAnnotator = new VariantAnnotator with gatkArguments {
this.variant = inputvcf
this.input_file = bamfiles
this.dbsnp = config("dbsnp", "variantannotator")
this.out = swapExt(dir, inputvcf,".vcf",".anotated.vcf")
if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "variantannotator")
}
add(variantAnnotator)
return variantAnnotator.out
}
def addCombineGVCFs(input:List[File], output:File): File = {
val combineGVCFs = new CombineGVCFs with gatkArguments {
this.variant = input
this.o = output
if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "combinegvcfs")
}
add(combineGVCFs)
return output
}
trait gatkArguments extends CommandLineGATK {
this.reference_sequence = referenceFile
this.memoryLimit = 2
this.jobResourceRequests :+= "h_vmem=4G"
}
}
object GatkPipeline extends PipelineCommand {
override val pipeline = "/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.class"
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment