From f49e62750dd8956a87d0f6c51facc7e50d0c161a Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Fri, 18 Jul 2014 15:57:07 +0200 Subject: [PATCH] Added GatkPipeline, finaly replacement for "Gatk" pipeline --- gatk-pipeline/.gitignore | 1 + gatk-pipeline/README.md | 8 + gatk-pipeline/examples/test.json | 48 ++++ gatk-pipeline/pom.xml | 155 ++++++++++++ .../biopet/pipelines/gatk/GatkPipeline.scala | 231 ++++++++++++++++++ 5 files changed, 443 insertions(+) create mode 100644 gatk-pipeline/.gitignore create mode 100644 gatk-pipeline/README.md create mode 100644 gatk-pipeline/examples/test.json create mode 100644 gatk-pipeline/pom.xml create mode 100644 gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala diff --git a/gatk-pipeline/.gitignore b/gatk-pipeline/.gitignore new file mode 100644 index 000000000..a6f89c2da --- /dev/null +++ b/gatk-pipeline/.gitignore @@ -0,0 +1 @@ +/target/ \ No newline at end of file diff --git a/gatk-pipeline/README.md b/gatk-pipeline/README.md new file mode 100644 index 000000000..8547b8626 --- /dev/null +++ b/gatk-pipeline/README.md @@ -0,0 +1,8 @@ +GATK variantcalling pipeline +======================= + + +License +=== + +A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL license; For commercial users or users who do not want to follow the AGPL license, please contact sasc@lumc.nl to purchase a separate license. diff --git a/gatk-pipeline/examples/test.json b/gatk-pipeline/examples/test.json new file mode 100644 index 000000000..eb32e1a21 --- /dev/null +++ b/gatk-pipeline/examples/test.json @@ -0,0 +1,48 @@ +{ + "referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta", + "reference" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta", + "samtools": { "exe": "test"}, + "fastqc": { "exe": "/home/pjvan_thof/Downloads/FastQC/fastqc" }, + "star" : {"exe":"/home/pjvan_thof/pipelines/test/test"}, + "bwa" : {"exe":"/home/pjvan_thof/pipelines/test/test"}, + "flexiprep": { + "fastqc": {"exe":"/home/pjvan_thof/pipelines/test/test"}, + "sickle": {"exe":"/home/pjvan_thof/pipelines/test/test"} + }, + "mapping": { + }, + "gatk": { + "mapping": { + "flexiprep": { + } + }, + "referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta", + "dbsnp": "/home/pjvan_thof/pipelines/test/test", + "hapmap": "/home/pjvan_thof/pipelines/test/test", + "omni": "/home/pjvan_thof/pipelines/test/test", + "1000G": "/home/pjvan_thof/pipelines/test/test", + "mills": "/home/pjvan_thof/pipelines/test/test", + "haplotypecaller": { + "stand_call_conf": 20, + "stand_emit_conf": 20 + } + }, + "cutadapt": {"exe":"/home/pjvan_thof/pipelines/test/test"}, + "samples": { + "test": { + "ID": "test", + "runs": { + "1" : { + "ID": "1", + "R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz", + "R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz" + }, + "2" : { + "ID": "1", + "R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz", + "R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz" + } + } + } + } +} diff --git a/gatk-pipeline/pom.xml b/gatk-pipeline/pom.xml new file mode 100644 index 000000000..f7d6b1e23 --- /dev/null +++ b/gatk-pipeline/pom.xml @@ -0,0 +1,155 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>nl.lumc.sasc</groupId> + <artifactId>GatkPipeline</artifactId> + <version>0.1.3</version> + <packaging>jar</packaging> + + <name>GatkPipeline</name> + <url>http://maven.apache.org</url> + + <parent> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Biopet</artifactId> + <version>0.1.3</version> + <relativePath>../</relativePath> + </parent> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <sting.unpack.phase>prepare-package</sting.unpack.phase> + <sting.shade.phase>package</sting.shade.phase> + <app.main.class>nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline</app.main.class> + </properties> + + <dependencies> +<!-- <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency>--> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-compiler</artifactId> + <version>2.11.0</version> + </dependency> +<!-- <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.9.2</artifactId> + <version>2.0.M4</version> + <scope>test</scope> + </dependency>--> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Biopet-Framework</artifactId> + <version>0.1.3</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Bam-Metrics</artifactId> + <version>0.1.3</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Mapping</artifactId> + <version>0.1.3</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>GatkVariantcalling</artifactId> + <version>0.1.3</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>GatkGenotyping</artifactId> + <version>0.1.3</version> + </dependency> + <dependency> + <groupId>org.broadinstitute.gatk</groupId> + <artifactId>queue-package</artifactId> + <version>3.2-2</version> + </dependency> + </dependencies> + <build> + <resources> + <resource> + <directory>scripts</directory> + <includes> + <include>**/*</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>org.scala-tools</groupId> + <artifactId>maven-scala-plugin</artifactId> + <version>2.15.2</version> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + <goal>testCompile</goal> + </goals> + <configuration> + <args> + <arg>-dependencyfile</arg> + <arg>${project.build.directory}/.scala_dependencies</arg> + </args> + </configuration> + </execution> + </executions> + </plugin> +<!-- <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.7.2</version> + <executions> + <execution> + <id>default-test</id> + Disable the default-test by putting it in phase none + <phase>none</phase> + </execution> + </executions> + </plugin>--> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.3</version> + <configuration> + <finalName>${project.artifactId}-${project.version}</finalName> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <manifestEntries> + <Main-Class>${app.main.class}</Main-Class> + <X-Compile-Source-JDK>${maven.compile.source}</X-Compile-Source-JDK> + <X-Compile-Target-JDK>${maven.compile.target}</X-Compile-Target-JDK> + </manifestEntries> + </transformer> + </transformers> + <artifactSet> + <excludes> + <exclude>org.broadinstitute.gatk:queue-package</exclude> + <exclude>junit:junit</exclude> + <exclude>org.scala-lang:scala-compiler</exclude> + <exclude>org.scala-lang:scala-library</exclude> + <exclude>org.scala-lang:scala-reflect</exclude> + <exclude>org.scala-lang.modules:*</exclude> + </excludes> + </artifactSet> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> diff --git a/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala new file mode 100644 index 000000000..4936fd04a --- /dev/null +++ b/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -0,0 +1,231 @@ +package nl.lumc.sasc.biopet.pipelines.gatk + +import nl.lumc.sasc.biopet.function._ +import nl.lumc.sasc.biopet.function.aligners._ +import java.io.File +import nl.lumc.sasc.biopet.core._ +import nl.lumc.sasc.biopet.core.config._ +import nl.lumc.sasc.biopet.pipelines.mapping._ +import nl.lumc.sasc.biopet.function.picard.MarkDuplicates +import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics +import nl.lumc.sasc.biopet.pipelines.flexiprep._ +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.extensions.gatk._ +import org.broadinstitute.gatk.queue.extensions.picard._ +import org.broadinstitute.gatk.queue.function._ +import org.broadinstitute.gatk.utils.commandline.{Argument} +import scala.util.parsing.json._ + +class GatkPipeline(val root:Configurable) extends QScript with MultiSampleQScript { + def this() = this(null) + + @Argument(doc="Only Sample",shortName="sample", required=false) + val onlySample: String = "" + + @Argument(doc="Skip Genotyping step",shortName="skipgenotyping", required=false) + var skipGenotyping: Boolean = false + + @Argument(doc="Merge gvcfs",shortName="mergegvcfs", required=false) + var mergeGvcfs: Boolean = false + + var referenceFile: File = _ + var dbsnp: File = _ + var gvcfFiles: List[File] = Nil + var finalBamFiles: List[File] = Nil + + def init() { + for (file <- configfiles) globalConfig.loadConfigFile(file) + referenceFile = config("referenceFile") + if (configContains("dbsnp")) dbsnp = config("dbsnp") + for (file <- config("gvcfFiles", Nil).getList) gvcfFiles :+= file.toString + if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module") + else if (!outputDir.endsWith("/")) outputDir += "/" + } + + def biopetScript() { + if (onlySample.isEmpty) { + runSamplesJobs + + //SampleWide jobs + if (mergeGvcfs && gvcfFiles.size > 0) { + val newFile = outputDir + "merged.gvcf.vcf" + addCombineGVCFs(gvcfFiles, newFile) + gvcfFiles = List(newFile) + } + + if (!skipGenotyping && gvcfFiles.size > 0) { + val gatkGenotyping = new GatkGenotyping(this) + gatkGenotyping.inputGvcfs = gvcfFiles + gatkGenotyping.outputDir = outputDir + "genotyping/" + gatkGenotyping.init + gatkGenotyping.biopetScript + addAll(gatkGenotyping.functions) + var vcfFile = gatkGenotyping.outputFile + + if (config("inputtype", "dna").getString != "rna") { + vcfFile = addVariantAnnotator(vcfFile, finalBamFiles, outputDir + "recalibration/") + vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir + "recalibration/") + vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir + "recalibration/") + } + } else logger.warn("No gVCFs to genotype") + } else runSingleSampleJobs(onlySample) + } + + // Called for each sample + def runSingleSampleJobs(sampleConfig:Map[String,Any]) : Map[String,List[File]] = { + var outputFiles:Map[String,List[File]] = Map() + var runBamfiles: List[File] = List() + var sampleID: String = sampleConfig("ID").toString + for ((run, runFiles) <- runRunsJobs(sampleConfig)) { + runBamfiles +:= runFiles("FinalBam") + } + outputFiles += ("FinalBams" -> runBamfiles) + +// addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) // Metrics pipeline + + if (runBamfiles.size > 0) { + finalBamFiles ++= runBamfiles + val gatkVariantcalling = new GatkVariantcalling(this) + gatkVariantcalling.inputBams = runBamfiles + gatkVariantcalling.outputDir = outputDir + sampleID + "/variantcalling/" + gatkVariantcalling.init + gatkVariantcalling.biopetScript + addAll(gatkVariantcalling.functions) + gvcfFiles :+= gatkVariantcalling.outputFile + } else logger.warn("No bamfiles for variant calling for sample: " + sampleID) + return outputFiles + } + + // Called for each run from a sample + def runSingleRunJobs(runConfig:Map[String,Any], sampleConfig:Map[String,Any]) : Map[String,File] = { + var outputFiles:Map[String,File] = Map() + val runID: String = runConfig("ID").toString + val sampleID: String = sampleConfig("ID").toString + val runDir: String = outputDir + sampleID + "/run_" + runID + "/" + var inputType = "" + if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString + else inputType = config("inputtype", "dna").toString + if (runConfig.contains("R1")) { + val mapping = Mapping.loadFromRunConfig(this, runConfig, sampleConfig, runDir) + addAll(mapping.functions) // Add functions of mapping to curent function pool + + outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile")) + } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) + return outputFiles + } + + def addSnpVariantRecalibrator(inputVcf:File, dir:String): File = { + val snpVariantRecalibrator = getVariantRecalibrator("snp") + snpVariantRecalibrator.input +:= inputVcf + snpVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".snp.recal") + snpVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".snp.tranches") + if (!snpVariantRecalibrator.resource.isEmpty) { + add(snpVariantRecalibrator) + + val snpApplyRecalibration = getApplyRecalibration("snp") + snpApplyRecalibration.input +:= inputVcf + snpApplyRecalibration.recal_file = snpVariantRecalibrator.recal_file + snpApplyRecalibration.tranches_file = snpVariantRecalibrator.tranches_file + snpApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".snp.recal.vcf") + add(snpApplyRecalibration) + + return snpApplyRecalibration.out + } else { + logger.warn("Skipped snp Recalibration, resource is missing") + return inputVcf + } + } + + def addIndelVariantRecalibrator(inputVcf:File, dir:String): File = { + val indelVariantRecalibrator = getVariantRecalibrator("indel") + indelVariantRecalibrator.input +:= inputVcf + indelVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".indel.recal") + indelVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".indel.tranches") + if (!indelVariantRecalibrator.resource.isEmpty) { + add(indelVariantRecalibrator) + + val indelApplyRecalibration = getApplyRecalibration("indel") + indelApplyRecalibration.input +:= inputVcf + indelApplyRecalibration.recal_file = indelVariantRecalibrator.recal_file + indelApplyRecalibration.tranches_file = indelVariantRecalibrator.tranches_file + indelApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".indel.recal.vcf") + add(indelApplyRecalibration) + + return indelApplyRecalibration.out + } else { + logger.warn("Skipped indel Recalibration, resource is missing") + return inputVcf + } + } + + def getVariantRecalibrator(mode_arg:String) : VariantRecalibrator = { + val variantRecalibrator = new VariantRecalibrator() with gatkArguments { + if (mode_arg == "indel") { + this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL + if (configContains("mills", "variantrecalibrator")) this.resource :+= new TaggedFile(config("mills", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0") + } else { // SNP + this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP + if (configContains("hapmap", "variantrecalibrator")) this.resource +:= new TaggedFile(config("hapmap", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=15.0") + if (configContains("omni", "variantrecalibrator")) this.resource +:= new TaggedFile(config("omni", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0") + if (configContains("1000G", "variantrecalibrator")) this.resource +:= new TaggedFile(config("1000G", "", "variantrecalibrator").getString, "known=false,training=true,truth=false,prior=10.0") + } + if (configContains("dbsnp", "variantrecalibrator")) this.resource :+= new TaggedFile(config("dbsnp", "", "variantrecalibrator").getString, "known=true,training=false,truth=false,prior=2.0") + this.nt = 4 + this.memoryLimit = nt * 2 + this.an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum") + if (configContains("minnumbadvariants", "variantrecalibrator")) this.minNumBadVariants = config("minnumbadvariants", "", "variantrecalibrator") + if (configContains("maxgaussians", "variantrecalibrator")) this.maxGaussians = config("maxgaussians", "", "variantrecalibrator") + } + return variantRecalibrator + } + + def getApplyRecalibration(mode_arg:String) : ApplyRecalibration = { + val applyRecalibration = new ApplyRecalibration() with gatkArguments { + if (mode_arg == "indel") { + this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL + this.ts_filter_level = config("ts_filter_level", 99.0, "applyrecalibration") + } else { // SNP + this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP + this.ts_filter_level = config("ts_filter_level", 99.5, "applyrecalibration") + } + this.nt = 3 + this.memoryLimit = nt * 2 + if (configContains("scattercount", "applyrecalibration")) this.scatterCount = config("scattercount", 1, "applyrecalibration") + } + return applyRecalibration + } + + def addVariantAnnotator(inputvcf:File, bamfiles:List[File], dir:String): File = { + val variantAnnotator = new VariantAnnotator with gatkArguments { + this.variant = inputvcf + this.input_file = bamfiles + this.dbsnp = config("dbsnp", "variantannotator") + this.out = swapExt(dir, inputvcf,".vcf",".anotated.vcf") + if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "variantannotator") + } + add(variantAnnotator) + + return variantAnnotator.out + } + + def addCombineGVCFs(input:List[File], output:File): File = { + val combineGVCFs = new CombineGVCFs with gatkArguments { + this.variant = input + this.o = output + if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "combinegvcfs") + } + add(combineGVCFs) + + return output + } + + trait gatkArguments extends CommandLineGATK { + this.reference_sequence = referenceFile + this.memoryLimit = 2 + this.jobResourceRequests :+= "h_vmem=4G" + } +} + +object GatkPipeline extends PipelineCommand { + override val pipeline = "/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.class" +} -- GitLab