From f49e62750dd8956a87d0f6c51facc7e50d0c161a Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Fri, 18 Jul 2014 15:57:07 +0200
Subject: [PATCH] Added GatkPipeline, finaly replacement for "Gatk" pipeline

---
 gatk-pipeline/.gitignore                      |   1 +
 gatk-pipeline/README.md                       |   8 +
 gatk-pipeline/examples/test.json              |  48 ++++
 gatk-pipeline/pom.xml                         | 155 ++++++++++++
 .../biopet/pipelines/gatk/GatkPipeline.scala  | 231 ++++++++++++++++++
 5 files changed, 443 insertions(+)
 create mode 100644 gatk-pipeline/.gitignore
 create mode 100644 gatk-pipeline/README.md
 create mode 100644 gatk-pipeline/examples/test.json
 create mode 100644 gatk-pipeline/pom.xml
 create mode 100644 gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala

diff --git a/gatk-pipeline/.gitignore b/gatk-pipeline/.gitignore
new file mode 100644
index 000000000..a6f89c2da
--- /dev/null
+++ b/gatk-pipeline/.gitignore
@@ -0,0 +1 @@
+/target/
\ No newline at end of file
diff --git a/gatk-pipeline/README.md b/gatk-pipeline/README.md
new file mode 100644
index 000000000..8547b8626
--- /dev/null
+++ b/gatk-pipeline/README.md
@@ -0,0 +1,8 @@
+GATK variantcalling pipeline
+=======================
+
+
+License
+===
+
+A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL license; For commercial users or users who do not want to follow the AGPL license, please contact sasc@lumc.nl to purchase a separate license.
diff --git a/gatk-pipeline/examples/test.json b/gatk-pipeline/examples/test.json
new file mode 100644
index 000000000..eb32e1a21
--- /dev/null
+++ b/gatk-pipeline/examples/test.json
@@ -0,0 +1,48 @@
+{
+    "referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
+    "reference" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
+    "samtools": { "exe": "test"},
+    "fastqc": { "exe": "/home/pjvan_thof/Downloads/FastQC/fastqc" },
+    "star" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
+    "bwa" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
+    "flexiprep": {
+        "fastqc": {"exe":"/home/pjvan_thof/pipelines/test/test"},
+        "sickle": {"exe":"/home/pjvan_thof/pipelines/test/test"}
+    },
+    "mapping": {
+    },
+    "gatk": {
+        "mapping": {
+            "flexiprep": {
+            }
+        },
+        "referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
+        "dbsnp": "/home/pjvan_thof/pipelines/test/test",
+        "hapmap": "/home/pjvan_thof/pipelines/test/test",
+        "omni": "/home/pjvan_thof/pipelines/test/test",
+        "1000G": "/home/pjvan_thof/pipelines/test/test",
+        "mills": "/home/pjvan_thof/pipelines/test/test",
+        "haplotypecaller": {
+            "stand_call_conf": 20,
+            "stand_emit_conf": 20
+        }
+    },
+    "cutadapt": {"exe":"/home/pjvan_thof/pipelines/test/test"},
+    "samples": {
+        "test": {
+            "ID": "test",
+            "runs": {
+                "1" : {
+                    "ID": "1",
+                    "R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
+                    "R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
+                },
+                "2" : {
+                    "ID": "1",
+                    "R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
+                    "R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
+                }
+            }
+        }
+    }
+}
diff --git a/gatk-pipeline/pom.xml b/gatk-pipeline/pom.xml
new file mode 100644
index 000000000..f7d6b1e23
--- /dev/null
+++ b/gatk-pipeline/pom.xml
@@ -0,0 +1,155 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>nl.lumc.sasc</groupId>
+  <artifactId>GatkPipeline</artifactId>
+  <version>0.1.3</version>
+  <packaging>jar</packaging>
+
+  <name>GatkPipeline</name>
+  <url>http://maven.apache.org</url>
+  
+  <parent>
+      <groupId>nl.lumc.sasc</groupId>
+      <artifactId>Biopet</artifactId>
+      <version>0.1.3</version>
+      <relativePath>../</relativePath>
+  </parent>
+  
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <sting.unpack.phase>prepare-package</sting.unpack.phase>
+    <sting.shade.phase>package</sting.shade.phase>
+    <app.main.class>nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline</app.main.class>
+  </properties>
+  
+    <dependencies>
+<!--        <dependency>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+          <version>3.8.1</version>
+          <scope>test</scope>
+        </dependency>-->
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-compiler</artifactId>
+            <version>2.11.0</version>
+        </dependency>
+<!--        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_2.9.2</artifactId>
+            <version>2.0.M4</version>
+            <scope>test</scope>
+        </dependency>-->
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>Biopet-Framework</artifactId>
+            <version>0.1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>Bam-Metrics</artifactId>
+            <version>0.1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>Mapping</artifactId>
+            <version>0.1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>GatkVariantcalling</artifactId>
+            <version>0.1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>nl.lumc.sasc</groupId>
+            <artifactId>GatkGenotyping</artifactId>
+            <version>0.1.3</version>
+        </dependency>
+        <dependency>
+            <groupId>org.broadinstitute.gatk</groupId>
+            <artifactId>queue-package</artifactId>
+            <version>3.2-2</version>
+        </dependency>
+    </dependencies>
+    <build>
+        <resources>
+            <resource>
+                 <directory>scripts</directory>
+                 <includes>
+                     <include>**/*</include>
+                 </includes>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.scala-tools</groupId>
+                <artifactId>maven-scala-plugin</artifactId>
+                <version>2.15.2</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile</id>
+                        <goals>
+                            <goal>compile</goal>
+                            <goal>testCompile</goal>
+                        </goals>
+                        <configuration>
+                            <args>
+                                <arg>-dependencyfile</arg>
+                                <arg>${project.build.directory}/.scala_dependencies</arg>
+                            </args>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+<!--            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.7.2</version>
+                <executions>
+                    <execution>
+                        <id>default-test</id>
+                         Disable the default-test by putting it in phase none 
+                        <phase>none</phase>
+                    </execution>
+                </executions>
+            </plugin>-->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>2.3</version>
+                <configuration>
+                  <finalName>${project.artifactId}-${project.version}</finalName>
+                  <transformers>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                      <manifestEntries>
+                        <Main-Class>${app.main.class}</Main-Class>
+                        <X-Compile-Source-JDK>${maven.compile.source}</X-Compile-Source-JDK>
+                        <X-Compile-Target-JDK>${maven.compile.target}</X-Compile-Target-JDK>
+                      </manifestEntries>
+                    </transformer>
+                  </transformers>
+                  <artifactSet>
+                    <excludes>
+                      <exclude>org.broadinstitute.gatk:queue-package</exclude>
+                      <exclude>junit:junit</exclude>
+                      <exclude>org.scala-lang:scala-compiler</exclude>
+                      <exclude>org.scala-lang:scala-library</exclude>
+                      <exclude>org.scala-lang:scala-reflect</exclude>
+                      <exclude>org.scala-lang.modules:*</exclude>
+                    </excludes>
+                  </artifactSet>
+                </configuration>
+            <executions>
+              <execution>
+                <phase>package</phase>
+                <goals>
+                  <goal>shade</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala
new file mode 100644
index 000000000..4936fd04a
--- /dev/null
+++ b/gatk-pipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala
@@ -0,0 +1,231 @@
+package nl.lumc.sasc.biopet.pipelines.gatk
+
+import nl.lumc.sasc.biopet.function._
+import nl.lumc.sasc.biopet.function.aligners._
+import java.io.File
+import nl.lumc.sasc.biopet.core._
+import nl.lumc.sasc.biopet.core.config._
+import nl.lumc.sasc.biopet.pipelines.mapping._
+import nl.lumc.sasc.biopet.function.picard.MarkDuplicates
+import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
+import nl.lumc.sasc.biopet.pipelines.flexiprep._
+import org.broadinstitute.gatk.queue.QScript
+import org.broadinstitute.gatk.queue.extensions.gatk._
+import org.broadinstitute.gatk.queue.extensions.picard._
+import org.broadinstitute.gatk.queue.function._
+import org.broadinstitute.gatk.utils.commandline.{Argument}
+import scala.util.parsing.json._
+
+class GatkPipeline(val root:Configurable) extends QScript with MultiSampleQScript {
+  def this() = this(null)
+  
+  @Argument(doc="Only Sample",shortName="sample", required=false)
+  val onlySample: String = ""
+  
+  @Argument(doc="Skip Genotyping step",shortName="skipgenotyping", required=false)
+  var skipGenotyping: Boolean = false
+  
+  @Argument(doc="Merge gvcfs",shortName="mergegvcfs", required=false)
+  var mergeGvcfs: Boolean = false
+  
+  var referenceFile: File = _
+  var dbsnp: File = _
+  var gvcfFiles: List[File] = Nil
+  var finalBamFiles: List[File] = Nil
+  
+  def init() {
+    for (file <- configfiles) globalConfig.loadConfigFile(file)
+    referenceFile = config("referenceFile")
+    if (configContains("dbsnp")) dbsnp = config("dbsnp")
+    for (file <- config("gvcfFiles", Nil).getList) gvcfFiles :+= file.toString
+    if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
+    else if (!outputDir.endsWith("/")) outputDir += "/"
+  }
+  
+  def biopetScript() {
+    if (onlySample.isEmpty) {
+      runSamplesJobs
+      
+      //SampleWide jobs
+      if (mergeGvcfs && gvcfFiles.size > 0) {
+        val newFile = outputDir + "merged.gvcf.vcf"
+        addCombineGVCFs(gvcfFiles, newFile)
+        gvcfFiles = List(newFile)
+      }
+      
+      if (!skipGenotyping && gvcfFiles.size > 0) {
+        val gatkGenotyping = new GatkGenotyping(this)
+        gatkGenotyping.inputGvcfs = gvcfFiles
+        gatkGenotyping.outputDir = outputDir + "genotyping/"
+        gatkGenotyping.init
+        gatkGenotyping.biopetScript
+        addAll(gatkGenotyping.functions)
+        var vcfFile = gatkGenotyping.outputFile
+        
+        if (config("inputtype", "dna").getString != "rna") {
+          vcfFile = addVariantAnnotator(vcfFile, finalBamFiles, outputDir + "recalibration/")
+          vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir + "recalibration/")
+          vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir + "recalibration/")
+        }
+      } else logger.warn("No gVCFs to genotype")
+    } else runSingleSampleJobs(onlySample)    
+  }
+  
+  // Called for each sample
+  def runSingleSampleJobs(sampleConfig:Map[String,Any]) : Map[String,List[File]] = {
+    var outputFiles:Map[String,List[File]] = Map()
+    var runBamfiles: List[File] = List()
+    var sampleID: String = sampleConfig("ID").toString
+    for ((run, runFiles) <- runRunsJobs(sampleConfig)) {
+      runBamfiles +:= runFiles("FinalBam")
+    }
+    outputFiles += ("FinalBams" -> runBamfiles)
+    
+//    addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) // Metrics pipeline
+    
+    if (runBamfiles.size > 0) {
+      finalBamFiles ++= runBamfiles
+      val gatkVariantcalling = new GatkVariantcalling(this)
+      gatkVariantcalling.inputBams = runBamfiles
+      gatkVariantcalling.outputDir = outputDir + sampleID + "/variantcalling/"
+      gatkVariantcalling.init
+      gatkVariantcalling.biopetScript
+      addAll(gatkVariantcalling.functions)
+      gvcfFiles :+= gatkVariantcalling.outputFile
+    } else logger.warn("No bamfiles for variant calling for sample: " + sampleID)
+    return outputFiles
+  }
+  
+  // Called for each run from a sample
+  def runSingleRunJobs(runConfig:Map[String,Any], sampleConfig:Map[String,Any]) : Map[String,File] = {
+    var outputFiles:Map[String,File] = Map()
+    val runID: String = runConfig("ID").toString
+    val sampleID: String = sampleConfig("ID").toString
+    val runDir: String = outputDir + sampleID + "/run_" + runID + "/"
+    var inputType = ""
+    if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
+    else inputType = config("inputtype", "dna").toString
+    if (runConfig.contains("R1")) {
+      val mapping = Mapping.loadFromRunConfig(this, runConfig, sampleConfig, runDir)
+      addAll(mapping.functions) // Add functions of mapping to curent function pool
+      
+      outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile"))
+    } else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)    
+    return outputFiles
+  }
+  
+  def addSnpVariantRecalibrator(inputVcf:File, dir:String): File = {
+    val snpVariantRecalibrator = getVariantRecalibrator("snp")
+    snpVariantRecalibrator.input +:= inputVcf
+    snpVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".snp.recal")
+    snpVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".snp.tranches")
+    if (!snpVariantRecalibrator.resource.isEmpty) {
+      add(snpVariantRecalibrator)
+
+      val snpApplyRecalibration = getApplyRecalibration("snp")
+      snpApplyRecalibration.input +:= inputVcf
+      snpApplyRecalibration.recal_file = snpVariantRecalibrator.recal_file
+      snpApplyRecalibration.tranches_file = snpVariantRecalibrator.tranches_file
+      snpApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".snp.recal.vcf")
+      add(snpApplyRecalibration)
+
+      return snpApplyRecalibration.out
+    } else {
+      logger.warn("Skipped snp Recalibration, resource is missing")
+      return inputVcf
+    }
+  }
+  
+  def addIndelVariantRecalibrator(inputVcf:File, dir:String): File = {
+    val indelVariantRecalibrator = getVariantRecalibrator("indel")
+    indelVariantRecalibrator.input +:= inputVcf
+    indelVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".indel.recal")
+    indelVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".indel.tranches")
+    if (!indelVariantRecalibrator.resource.isEmpty) {
+      add(indelVariantRecalibrator)
+
+      val indelApplyRecalibration = getApplyRecalibration("indel")
+      indelApplyRecalibration.input +:= inputVcf
+      indelApplyRecalibration.recal_file = indelVariantRecalibrator.recal_file
+      indelApplyRecalibration.tranches_file = indelVariantRecalibrator.tranches_file
+      indelApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".indel.recal.vcf")
+      add(indelApplyRecalibration)
+
+      return indelApplyRecalibration.out
+    } else {
+      logger.warn("Skipped indel Recalibration, resource is missing")
+      return inputVcf
+    }
+  }
+  
+  def getVariantRecalibrator(mode_arg:String) : VariantRecalibrator = {
+    val variantRecalibrator = new VariantRecalibrator() with gatkArguments {
+      if (mode_arg == "indel") {
+        this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
+        if (configContains("mills", "variantrecalibrator")) this.resource :+= new TaggedFile(config("mills", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
+      } else { // SNP
+        this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
+        if (configContains("hapmap", "variantrecalibrator")) this.resource +:= new TaggedFile(config("hapmap", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=15.0")
+        if (configContains("omni", "variantrecalibrator")) this.resource +:= new TaggedFile(config("omni", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
+        if (configContains("1000G", "variantrecalibrator")) this.resource +:= new TaggedFile(config("1000G", "", "variantrecalibrator").getString, "known=false,training=true,truth=false,prior=10.0")
+      }
+      if (configContains("dbsnp", "variantrecalibrator")) this.resource :+= new TaggedFile(config("dbsnp", "", "variantrecalibrator").getString, "known=true,training=false,truth=false,prior=2.0")
+      this.nt = 4
+      this.memoryLimit = nt * 2
+      this.an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum")
+      if (configContains("minnumbadvariants", "variantrecalibrator")) this.minNumBadVariants = config("minnumbadvariants", "", "variantrecalibrator")
+      if (configContains("maxgaussians", "variantrecalibrator")) this.maxGaussians = config("maxgaussians", "", "variantrecalibrator")
+    }
+    return variantRecalibrator
+  }
+  
+  def getApplyRecalibration(mode_arg:String) : ApplyRecalibration = {
+    val applyRecalibration = new ApplyRecalibration() with gatkArguments {
+      if (mode_arg == "indel") {
+        this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
+        this.ts_filter_level = config("ts_filter_level", 99.0, "applyrecalibration")
+      } else { // SNP
+        this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
+        this.ts_filter_level = config("ts_filter_level", 99.5, "applyrecalibration")
+      }
+      this.nt = 3
+      this.memoryLimit = nt * 2
+      if (configContains("scattercount", "applyrecalibration")) this.scatterCount = config("scattercount", 1, "applyrecalibration")
+    }
+    return applyRecalibration
+  }
+    
+  def addVariantAnnotator(inputvcf:File, bamfiles:List[File], dir:String): File = {
+    val variantAnnotator = new VariantAnnotator with gatkArguments {
+      this.variant = inputvcf
+      this.input_file = bamfiles
+      this.dbsnp = config("dbsnp", "variantannotator")
+      this.out = swapExt(dir, inputvcf,".vcf",".anotated.vcf")
+      if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "variantannotator")
+    }
+    add(variantAnnotator)
+    
+    return variantAnnotator.out
+  }
+  
+  def addCombineGVCFs(input:List[File], output:File): File = {
+    val combineGVCFs = new CombineGVCFs with gatkArguments {
+      this.variant = input
+      this.o = output
+      if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "combinegvcfs")
+    }
+    add(combineGVCFs)
+    
+    return output
+  }
+  
+  trait gatkArguments extends CommandLineGATK {
+    this.reference_sequence = referenceFile
+    this.memoryLimit = 2
+    this.jobResourceRequests :+= "h_vmem=4G"
+  }
+}
+
+object GatkPipeline extends PipelineCommand {
+  override val pipeline = "/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.class"
+}
-- 
GitLab