Skip to content
Snippets Groups Projects
Commit f49e6275 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Added GatkPipeline, finaly replacement for "Gatk" pipeline

parent ae952d4c
No related branches found
No related tags found
No related merge requests found
/target/
\ No newline at end of file
GATK variantcalling pipeline
=======================
License
===
A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL license; For commercial users or users who do not want to follow the AGPL license, please contact sasc@lumc.nl to purchase a separate license.
{
"referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"reference" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"samtools": { "exe": "test"},
"fastqc": { "exe": "/home/pjvan_thof/Downloads/FastQC/fastqc" },
"star" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
"bwa" : {"exe":"/home/pjvan_thof/pipelines/test/test"},
"flexiprep": {
"fastqc": {"exe":"/home/pjvan_thof/pipelines/test/test"},
"sickle": {"exe":"/home/pjvan_thof/pipelines/test/test"}
},
"mapping": {
},
"gatk": {
"mapping": {
"flexiprep": {
}
},
"referenceFile" : "/data/DIV5/SASC/common/gatk_bundle_2.8/hg19/ucsc.hg19.fasta",
"dbsnp": "/home/pjvan_thof/pipelines/test/test",
"hapmap": "/home/pjvan_thof/pipelines/test/test",
"omni": "/home/pjvan_thof/pipelines/test/test",
"1000G": "/home/pjvan_thof/pipelines/test/test",
"mills": "/home/pjvan_thof/pipelines/test/test",
"haplotypecaller": {
"stand_call_conf": 20,
"stand_emit_conf": 20
}
},
"cutadapt": {"exe":"/home/pjvan_thof/pipelines/test/test"},
"samples": {
"test": {
"ID": "test",
"runs": {
"1" : {
"ID": "1",
"R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
"R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
},
"2" : {
"ID": "1",
"R1" : "/home/pjvan_thof/pipelines/test/test.R1.fastq.gz",
"R2" : "/home/pjvan_thof/pipelines/test/test.R2.fastq.gz"
}
}
}
}
}
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkPipeline</artifactId>
<version>0.1.3</version>
<packaging>jar</packaging>
<name>GatkPipeline</name>
<url>http://maven.apache.org</url>
<parent>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet</artifactId>
<version>0.1.3</version>
<relativePath>../</relativePath>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<sting.unpack.phase>prepare-package</sting.unpack.phase>
<sting.shade.phase>package</sting.shade.phase>
<app.main.class>nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline</app.main.class>
</properties>
<dependencies>
<!-- <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>2.11.0</version>
</dependency>
<!-- <dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.9.2</artifactId>
<version>2.0.M4</version>
<scope>test</scope>
</dependency>-->
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Biopet-Framework</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Bam-Metrics</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Mapping</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkVariantcalling</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>GatkGenotyping</artifactId>
<version>0.1.3</version>
</dependency>
<dependency>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>queue-package</artifactId>
<version>3.2-2</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>scripts</directory>
<includes>
<include>**/*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<!-- <plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.7.2</version>
<executions>
<execution>
<id>default-test</id>
Disable the default-test by putting it in phase none
<phase>none</phase>
</execution>
</executions>
</plugin>-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<finalName>${project.artifactId}-${project.version}</finalName>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class>${app.main.class}</Main-Class>
<X-Compile-Source-JDK>${maven.compile.source}</X-Compile-Source-JDK>
<X-Compile-Target-JDK>${maven.compile.target}</X-Compile-Target-JDK>
</manifestEntries>
</transformer>
</transformers>
<artifactSet>
<excludes>
<exclude>org.broadinstitute.gatk:queue-package</exclude>
<exclude>junit:junit</exclude>
<exclude>org.scala-lang:scala-compiler</exclude>
<exclude>org.scala-lang:scala-library</exclude>
<exclude>org.scala-lang:scala-reflect</exclude>
<exclude>org.scala-lang.modules:*</exclude>
</excludes>
</artifactSet>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.function._
import nl.lumc.sasc.biopet.function.aligners._
import java.io.File
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.core.config._
import nl.lumc.sasc.biopet.pipelines.mapping._
import nl.lumc.sasc.biopet.function.picard.MarkDuplicates
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.flexiprep._
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk._
import org.broadinstitute.gatk.queue.extensions.picard._
import org.broadinstitute.gatk.queue.function._
import org.broadinstitute.gatk.utils.commandline.{Argument}
import scala.util.parsing.json._
class GatkPipeline(val root:Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
@Argument(doc="Only Sample",shortName="sample", required=false)
val onlySample: String = ""
@Argument(doc="Skip Genotyping step",shortName="skipgenotyping", required=false)
var skipGenotyping: Boolean = false
@Argument(doc="Merge gvcfs",shortName="mergegvcfs", required=false)
var mergeGvcfs: Boolean = false
var referenceFile: File = _
var dbsnp: File = _
var gvcfFiles: List[File] = Nil
var finalBamFiles: List[File] = Nil
def init() {
for (file <- configfiles) globalConfig.loadConfigFile(file)
referenceFile = config("referenceFile")
if (configContains("dbsnp")) dbsnp = config("dbsnp")
for (file <- config("gvcfFiles", Nil).getList) gvcfFiles :+= file.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
if (onlySample.isEmpty) {
runSamplesJobs
//SampleWide jobs
if (mergeGvcfs && gvcfFiles.size > 0) {
val newFile = outputDir + "merged.gvcf.vcf"
addCombineGVCFs(gvcfFiles, newFile)
gvcfFiles = List(newFile)
}
if (!skipGenotyping && gvcfFiles.size > 0) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = gvcfFiles
gatkGenotyping.outputDir = outputDir + "genotyping/"
gatkGenotyping.init
gatkGenotyping.biopetScript
addAll(gatkGenotyping.functions)
var vcfFile = gatkGenotyping.outputFile
if (config("inputtype", "dna").getString != "rna") {
vcfFile = addVariantAnnotator(vcfFile, finalBamFiles, outputDir + "recalibration/")
vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir + "recalibration/")
vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir + "recalibration/")
}
} else logger.warn("No gVCFs to genotype")
} else runSingleSampleJobs(onlySample)
}
// Called for each sample
def runSingleSampleJobs(sampleConfig:Map[String,Any]) : Map[String,List[File]] = {
var outputFiles:Map[String,List[File]] = Map()
var runBamfiles: List[File] = List()
var sampleID: String = sampleConfig("ID").toString
for ((run, runFiles) <- runRunsJobs(sampleConfig)) {
runBamfiles +:= runFiles("FinalBam")
}
outputFiles += ("FinalBams" -> runBamfiles)
// addAll(BamMetrics(this, bamFile, outputDir + "metrics/").functions) // Metrics pipeline
if (runBamfiles.size > 0) {
finalBamFiles ++= runBamfiles
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = runBamfiles
gatkVariantcalling.outputDir = outputDir + sampleID + "/variantcalling/"
gatkVariantcalling.init
gatkVariantcalling.biopetScript
addAll(gatkVariantcalling.functions)
gvcfFiles :+= gatkVariantcalling.outputFile
} else logger.warn("No bamfiles for variant calling for sample: " + sampleID)
return outputFiles
}
// Called for each run from a sample
def runSingleRunJobs(runConfig:Map[String,Any], sampleConfig:Map[String,Any]) : Map[String,File] = {
var outputFiles:Map[String,File] = Map()
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runDir: String = outputDir + sampleID + "/run_" + runID + "/"
var inputType = ""
if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
else inputType = config("inputtype", "dna").toString
if (runConfig.contains("R1")) {
val mapping = Mapping.loadFromRunConfig(this, runConfig, sampleConfig, runDir)
addAll(mapping.functions) // Add functions of mapping to curent function pool
outputFiles += ("FinalBam" -> mapping.outputFiles("finalBamFile"))
} else this.logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
return outputFiles
}
def addSnpVariantRecalibrator(inputVcf:File, dir:String): File = {
val snpVariantRecalibrator = getVariantRecalibrator("snp")
snpVariantRecalibrator.input +:= inputVcf
snpVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".snp.recal")
snpVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".snp.tranches")
if (!snpVariantRecalibrator.resource.isEmpty) {
add(snpVariantRecalibrator)
val snpApplyRecalibration = getApplyRecalibration("snp")
snpApplyRecalibration.input +:= inputVcf
snpApplyRecalibration.recal_file = snpVariantRecalibrator.recal_file
snpApplyRecalibration.tranches_file = snpVariantRecalibrator.tranches_file
snpApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".snp.recal.vcf")
add(snpApplyRecalibration)
return snpApplyRecalibration.out
} else {
logger.warn("Skipped snp Recalibration, resource is missing")
return inputVcf
}
}
def addIndelVariantRecalibrator(inputVcf:File, dir:String): File = {
val indelVariantRecalibrator = getVariantRecalibrator("indel")
indelVariantRecalibrator.input +:= inputVcf
indelVariantRecalibrator.recal_file = swapExt(dir, inputVcf,".vcf",".indel.recal")
indelVariantRecalibrator.tranches_file = swapExt(dir, inputVcf,".vcf",".indel.tranches")
if (!indelVariantRecalibrator.resource.isEmpty) {
add(indelVariantRecalibrator)
val indelApplyRecalibration = getApplyRecalibration("indel")
indelApplyRecalibration.input +:= inputVcf
indelApplyRecalibration.recal_file = indelVariantRecalibrator.recal_file
indelApplyRecalibration.tranches_file = indelVariantRecalibrator.tranches_file
indelApplyRecalibration.out = swapExt(dir, inputVcf,".vcf",".indel.recal.vcf")
add(indelApplyRecalibration)
return indelApplyRecalibration.out
} else {
logger.warn("Skipped indel Recalibration, resource is missing")
return inputVcf
}
}
def getVariantRecalibrator(mode_arg:String) : VariantRecalibrator = {
val variantRecalibrator = new VariantRecalibrator() with gatkArguments {
if (mode_arg == "indel") {
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
if (configContains("mills", "variantrecalibrator")) this.resource :+= new TaggedFile(config("mills", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
} else { // SNP
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
if (configContains("hapmap", "variantrecalibrator")) this.resource +:= new TaggedFile(config("hapmap", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=15.0")
if (configContains("omni", "variantrecalibrator")) this.resource +:= new TaggedFile(config("omni", "", "variantrecalibrator").getString, "known=false,training=true,truth=true,prior=12.0")
if (configContains("1000G", "variantrecalibrator")) this.resource +:= new TaggedFile(config("1000G", "", "variantrecalibrator").getString, "known=false,training=true,truth=false,prior=10.0")
}
if (configContains("dbsnp", "variantrecalibrator")) this.resource :+= new TaggedFile(config("dbsnp", "", "variantrecalibrator").getString, "known=true,training=false,truth=false,prior=2.0")
this.nt = 4
this.memoryLimit = nt * 2
this.an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum")
if (configContains("minnumbadvariants", "variantrecalibrator")) this.minNumBadVariants = config("minnumbadvariants", "", "variantrecalibrator")
if (configContains("maxgaussians", "variantrecalibrator")) this.maxGaussians = config("maxgaussians", "", "variantrecalibrator")
}
return variantRecalibrator
}
def getApplyRecalibration(mode_arg:String) : ApplyRecalibration = {
val applyRecalibration = new ApplyRecalibration() with gatkArguments {
if (mode_arg == "indel") {
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
this.ts_filter_level = config("ts_filter_level", 99.0, "applyrecalibration")
} else { // SNP
this.mode = org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
this.ts_filter_level = config("ts_filter_level", 99.5, "applyrecalibration")
}
this.nt = 3
this.memoryLimit = nt * 2
if (configContains("scattercount", "applyrecalibration")) this.scatterCount = config("scattercount", 1, "applyrecalibration")
}
return applyRecalibration
}
def addVariantAnnotator(inputvcf:File, bamfiles:List[File], dir:String): File = {
val variantAnnotator = new VariantAnnotator with gatkArguments {
this.variant = inputvcf
this.input_file = bamfiles
this.dbsnp = config("dbsnp", "variantannotator")
this.out = swapExt(dir, inputvcf,".vcf",".anotated.vcf")
if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "variantannotator")
}
add(variantAnnotator)
return variantAnnotator.out
}
def addCombineGVCFs(input:List[File], output:File): File = {
val combineGVCFs = new CombineGVCFs with gatkArguments {
this.variant = input
this.o = output
if (configContains("scattercount", "variantannotator")) this.scatterCount = config("scattercount", 1, "combinegvcfs")
}
add(combineGVCFs)
return output
}
trait gatkArguments extends CommandLineGATK {
this.reference_sequence = referenceFile
this.memoryLimit = 2
this.jobResourceRequests :+= "h_vmem=4G"
}
}
object GatkPipeline extends PipelineCommand {
override val pipeline = "/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.class"
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment