Commit 88b4b2a4 authored by Peter van 't Hof's avatar Peter van 't Hof

Added protected artifact for gatk pipelines

parent 6441dfd0
/target/
\ No newline at end of file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetGatkPipelines</artifactId>
<version>0.2.0-DEV</version>
<packaging>jar</packaging>
<inceptionYear>2014</inceptionYear>
<name>BiopetGatkPipelines</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<sting.unpack.phase>prepare-package</sting.unpack.phase>
<sting.shade.phase>package</sting.shade.phase>
<app.main.class>nl.lumc.sasc.biopet.core.BiopetExecutable</app.main.class>
</properties>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetFramework</artifactId>
<version>0.2.0-DEV</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetGatkExtensions</artifactId>
<version>0.2.0-DEV</version>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.5</version>
<configuration>
<archive>
<manifest>
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<showDeprecation>true</showDeprecation>
</configuration>
</plugin>
</plugins>
</build>
</project>
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import scala.util.Random
class GatkBenchmarkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample gvcf file")
var sampleGvcf: File = _
@Argument(doc = "SampleName", required = true)
var sampleName: String = _
@Input(doc = "Gvcf files", shortName = "I", required = false)
var gvcfFiles: List[File] = Nil
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
def init() {
if (config.contains("gvcffiles")) for (file <- config("gvcffiles").asList) {
gvcfFiles ::= file.toString
}
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
var todoGvcfs = gvcfFiles
var gvcfPool: List[File] = Nil
addGenotypingPipeline(gvcfPool)
while (todoGvcfs.size > 0) {
val index = Random.nextInt(todoGvcfs.size)
gvcfPool ::= todoGvcfs(index)
addGenotypingPipeline(gvcfPool)
todoGvcfs = todoGvcfs.filter(b => b != todoGvcfs(index))
}
}
def addGenotypingPipeline(gvcfPool: List[File]) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = sampleGvcf :: gvcfPool
gatkGenotyping.samples :+= sampleName
gatkGenotyping.outputDir = outputDir + "samples_" + gvcfPool.size + "/"
gatkGenotyping.init
gatkGenotyping.biopetScript
addAll(gatkGenotyping.functions)
}
}
object GatkBenchmarkGenotyping extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.{ GenotypeGVCFs, SelectVariants }
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
class GatkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Gvcf files", shortName = "I")
var inputGvcfs: List[File] = Nil
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = "genotype"
@Output(doc = "OutputFile", shortName = "O", required = false)
var outputFile: File = _
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
def init() {
if (outputFile == null) outputFile = outputDir + outputName + ".vcf.gz"
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
addGenotypeGVCFs(inputGvcfs, outputFile)
if (!samples.isEmpty) {
if (samples.size > 1) addSelectVariants(outputFile, samples, outputDir + "samples/", "all")
for (sample <- samples) addSelectVariants(outputFile, List(sample), outputDir + "samples/", sample)
}
}
def addGenotypeGVCFs(gvcfFiles: List[File], outputFile: File): File = {
val genotypeGVCFs = GenotypeGVCFs(this, gvcfFiles, outputFile)
add(genotypeGVCFs)
return genotypeGVCFs.out
}
def addSelectVariants(inputFile: File, samples: List[String], outputDir: String, name: String) {
val selectVariants = SelectVariants(this, inputFile, outputDir + name + ".vcf.gz")
selectVariants.excludeNonVariants = true
for (sample <- samples) selectVariants.sample_name :+= sample
add(selectVariants)
}
}
object GatkGenotyping extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import htsjdk.samtools.SamReaderFactory
import scala.collection.JavaConversions._
import java.io.File
import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs }
import nl.lumc.sasc.biopet.extensions.picard.AddOrReplaceReadGroups
import nl.lumc.sasc.biopet.extensions.picard.SamToFastq
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.utils.commandline.{ Argument }
class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript {
def this() = this(null)
@Argument(doc = "Only Sample", shortName = "sample", required = false)
val onlySample: List[String] = Nil
@Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false)
var skipGenotyping: Boolean = false
@Argument(doc = "Merge gvcfs", shortName = "mergegvcfs", required = false)
var mergeGvcfs: Boolean = false
@Argument(doc = "Joint variantcalling", shortName = "jointVariantCalling", required = false)
var jointVariantcalling: Boolean = config("joint_variantcalling", default = false)
@Argument(doc = "Joint genotyping", shortName = "jointGenotyping", required = false)
var jointGenotyping: Boolean = config("joint_genotyping", default = false)
var singleSampleCalling = config("single_sample_calling", default = true)
var reference: File = config("reference", required = true)
var dbsnp: File = config("dbsnp")
var gvcfFiles: List[File] = Nil
var finalBamFiles: List[File] = Nil
var useAllelesOption: Boolean = config("use_alleles_option", default = false)
class LibraryOutput extends AbstractLibraryOutput {
var mappedBamFile: File = _
var variantcalling: GatkVariantcalling.ScriptOutput = _
}
class SampleOutput extends AbstractSampleOutput {
var variantcalling: GatkVariantcalling.ScriptOutput = _
}
def init() {
if (config.contains("target_bed")) {
defaults ++= Map("gatk" -> Map(("intervals" -> config("target_bed").asStringList)))
}
if (config.contains("gvcfFiles"))
for (file <- config("gvcfFiles").asList)
gvcfFiles :+= file.toString
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
val multisampleVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
}
def biopetScript() {
if (onlySample.isEmpty) {
runSamplesJobs
//SampleWide jobs
if (mergeGvcfs && gvcfFiles.size > 0) {
val newFile = outputDir + "merged.gvcf.vcf.gz"
add(CombineGVCFs(this, gvcfFiles, newFile))
gvcfFiles = List(newFile)
}
if (!skipGenotyping && gvcfFiles.size > 0) {
if (jointGenotyping) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = gvcfFiles
gatkGenotyping.outputDir = outputDir + "genotyping/"
gatkGenotyping.init
gatkGenotyping.biopetScript
addAll(gatkGenotyping.functions)
var vcfFile = gatkGenotyping.outputFile
}
} else logger.warn("No gVCFs to genotype")
if (jointVariantcalling) {
val allBamfiles = for (
(sampleID, sampleOutput) <- samplesOutput;
file <- sampleOutput.variantcalling.bamFiles
) yield file
val allRawVcfFiles = for ((sampleID, sampleOutput) <- samplesOutput) yield sampleOutput.variantcalling.rawFilterVcfFile
val gatkVariantcalling = new GatkVariantcalling(this) {
override protected lazy val configName = "gatkvariantcalling"
override def configPath: List[String] = "multisample" :: super.configPath
}
if (gatkVariantcalling.useMpileup) {
val cvRaw = CombineVariants(this, allRawVcfFiles.toList, outputDir + "variantcalling/multisample.raw.vcf.gz")
add(cvRaw)
gatkVariantcalling.rawVcfInput = cvRaw.out
}
multisampleVariantcalling.preProcesBams = false
multisampleVariantcalling.doublePreProces = false
multisampleVariantcalling.inputBams = allBamfiles.toList
multisampleVariantcalling.outputDir = outputDir + "variantcalling"
multisampleVariantcalling.outputName = "multisample"
multisampleVariantcalling.init
multisampleVariantcalling.biopetScript
addAll(multisampleVariantcalling.functions)
if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) {
val recalibration = new GatkVariantRecalibration(this)
recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile
recalibration.bamFiles = finalBamFiles
recalibration.outputDir = outputDir + "recalibration/"
recalibration.init
recalibration.biopetScript
}
}
} else for (sample <- onlySample) runSingleSampleJobs(sample)
}
// Called for each sample
def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
val sampleOutput = new SampleOutput
var libraryBamfiles: List[File] = List()
val sampleID: String = sampleConfig("ID").toString
sampleOutput.libraries = runLibraryJobs(sampleConfig)
val sampleDir = globalSampleDir + sampleID
for ((libraryID, libraryOutput) <- sampleOutput.libraries) {
libraryBamfiles ++= libraryOutput.variantcalling.bamFiles
}
if (libraryBamfiles.size > 0) {
finalBamFiles ++= libraryBamfiles
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = libraryBamfiles
gatkVariantcalling.outputDir = sampleDir + "/variantcalling/"
gatkVariantcalling.preProcesBams = false
if (!singleSampleCalling) {
gatkVariantcalling.useHaplotypecaller = false
gatkVariantcalling.useUnifiedGenotyper = false
}
gatkVariantcalling.sampleID = sampleID
gatkVariantcalling.init
gatkVariantcalling.biopetScript
addAll(gatkVariantcalling.functions)
sampleOutput.variantcalling = gatkVariantcalling.scriptOutput
gvcfFiles :+= gatkVariantcalling.scriptOutput.gvcfFile
} else logger.warn("No bamfiles for variant calling for sample: " + sampleID)
return sampleOutput
}
// Called for each run from a sample
def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = {
val libraryOutput = new LibraryOutput
val runID: String = runConfig("ID").toString
val sampleID: String = sampleConfig("ID").toString
val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/"
var inputType = ""
if (runConfig.contains("inputtype")) inputType = runConfig("inputtype").toString
else inputType = config("inputtype", default = "dna").toString
if (runConfig.contains("R1")) {
val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir)
addAll(mapping.functions) // Add functions of mapping to curent function pool
libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
} else if (runConfig.contains("bam")) {
var bamFile = new File(runConfig("bam").toString)
if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile)
if (config("bam_to_fastq", default = false).asBoolean) {
val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq",
runDir + sampleID + "-" + runID + ".R2.fastq")
add(samToFastq, isIntermediate = true)
val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir, startJobs = false)
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = samToFastq.fastqR2
mapping.init
mapping.biopetScript
addAll(mapping.functions) // Add functions of mapping to curent function pool
libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
} else {
var readGroupOke = true
val inputSam = SamReaderFactory.makeDefault.open(bamFile)
val header = inputSam.getFileHeader.getReadGroups
for (readGroup <- inputSam.getFileHeader.getReadGroups) {
if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same")
if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false
}
inputSam.close
if (!readGroupOke) {
if (config("correct_readgroups", default = false)) {
logger.info("Correcting readgroups, file:" + bamFile)
val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam"))
aorrg.RGID = sampleID + "-" + runID
aorrg.RGLB = runID
aorrg.RGSM = sampleID
if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString
else aorrg.RGPL = "illumina"
if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString
else aorrg.RGPU = "na"
if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString
add(aorrg, isIntermediate = true)
bamFile = aorrg.output
} else throw new IllegalStateException("Readgroup sample and/or library of input bamfile is not correct, file: " + bamFile +
"\nPossible to set 'correct_readgroups' to true on config to automatic fix this")
}
addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions)
libraryOutput.mappedBamFile = bamFile
}
} else logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig)
val gatkVariantcalling = new GatkVariantcalling(this)
gatkVariantcalling.inputBams = List(libraryOutput.mappedBamFile)
gatkVariantcalling.outputDir = runDir
gatkVariantcalling.variantcalling = config("library_variantcalling", default = false)
gatkVariantcalling.preProcesBams = true
gatkVariantcalling.sampleID = sampleID
gatkVariantcalling.init
gatkVariantcalling.biopetScript
addAll(gatkVariantcalling.functions)
libraryOutput.variantcalling = gatkVariantcalling.scriptOutput
return libraryOutput
}
}
object GatkPipeline extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.BiopetQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.ApplyRecalibration
import nl.lumc.sasc.biopet.extensions.gatk.VariantAnnotator
import nl.lumc.sasc.biopet.extensions.gatk.VariantRecalibrator
import org.broadinstitute.gatk.queue.QScript
class GatkVariantRecalibration(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "input vcf file", shortName = "I")
var inputVcf: File = _
@Input(doc = "input vcf file", shortName = "BAM", required = false)
var bamFiles: List[File] = Nil
@Output(doc = "output vcf file", shortName = "out")
var outputVcf: File = _
def init() {
if (inputVcf == null) throw new IllegalStateException("Missing Output directory on gatk module")
if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
else if (!outputDir.endsWith("/")) outputDir += "/"
}
def biopetScript() {
var vcfFile: File = if (!bamFiles.isEmpty) addVariantAnnotator(inputVcf, bamFiles, outputDir) else inputVcf
vcfFile = addSnpVariantRecalibrator(vcfFile, outputDir)
vcfFile = addIndelVariantRecalibrator(vcfFile, outputDir)
}
def addSnpVariantRecalibrator(inputVcf: File, dir: String): File = {
val snpRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = false)
if (!snpRecal.resource.isEmpty) {
add(snpRecal)
val snpApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
snpRecal.recal_file, snpRecal.tranches_file, indel = false)
add(snpApply)
return snpApply.out
} else {
logger.warn("Skipped snp Recalibration, resource is missing")
return inputVcf
}
}
def addIndelVariantRecalibrator(inputVcf: File, dir: String): File = {
val indelRecal = VariantRecalibrator(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal"),
swapExt(dir, inputVcf, ".vcf", ".indel.tranches"), indel = true)
if (!indelRecal.resource.isEmpty) {
add(indelRecal)
val indelApply = ApplyRecalibration(this, inputVcf, swapExt(dir, inputVcf, ".vcf", ".indel.recal.vcf"),
indelRecal.recal_file, indelRecal.tranches_file, indel = true)
add(indelApply)
return indelApply.out
} else {
logger.warn("Skipped indel Recalibration, resource is missing")
return inputVcf
}
}
def addVariantAnnotator(inputvcf: File, bamfiles: List[File], dir: String): File = {
val variantAnnotator = VariantAnnotator(this, inputvcf, bamfiles, swapExt(dir, inputvcf, ".vcf", ".anotated.vcf"))
add(variantAnnotator)
return variantAnnotator.out
}
}
object GatkVariantRecalibration extends PipelineCommand
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import java.io.File
import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper }
import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
import scala.collection.SortedMap
import scala.language.reflectiveCalls
class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
val scriptOutput = new GatkVariantcalling.ScriptOutput
@Input(doc = "Bam files (should be deduped bams)", shortName = "BAM")
var inputBams: List[File] = Nil
@Input(doc = "Raw vcf file", shortName = "raw")
var rawVcfInput: File = _
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference", required = true)
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = _
@Argument(doc = "Sample name", required = false)
var sampleID: String = _
var preProcesBams: Option[Boolean] = config("pre_proces_bams", default = true)
var variantcalling: Boolean = true
var doublePreProces: Option[Boolean] = config("double_pre_proces", default = true)
var useHaplotypecaller: Option[Boolean] = config("use_haplotypecaller", default = true)
var useUnifiedGenotyper: Option[Boolean] = config("use_unifiedgenotyper", default = false)
var useAllelesOption: Option[Boolean] = config("use_alleles_option", default = false)
var useMpileup: Boolean = config("use_mpileup", default = true)
var useIndelRealigner: Boolean = config("use_indel_realign", default = true)
var useBaseRecalibration: Boolean = config("use_base_recalibration", default = true)