Commit a482f22f authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge remote-tracking branch 'remotes/origin/develop' into fix-238

parents 6c3170cc ac389d5d
......@@ -31,6 +31,8 @@ class BiopetPipeline(val root: Configurable) extends QScript with SummaryQScript
// Executing a biopet pipeline inside
val shiva = new Shiva(this)
add(shiva)
shiva.init()
shiva.biopetScript()
addAll(shiva.functions)
......
......@@ -25,13 +25,13 @@
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetCore</artifactId>
<artifactId>BiopetExtensions</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-queue-extensions-distribution</artifactId>
<version>3.4</version>
<version>3.5</version>
</dependency>
</dependencies>
</project>
......@@ -8,8 +8,13 @@ package nl.lumc.sasc.biopet.extensions.gatk.broad
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.Output
class GenotypeGVCFs(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.GenotypeGVCFs with GatkGeneral {
@Output(required = false)
protected var vcfIndex: File = _
annotation ++= config("annotation", default = Seq(), freeVar = false).asStringList
if (config.contains("dbsnp")) dbsnp = config("dbsnp")
......@@ -22,6 +27,11 @@ class GenotypeGVCFs(val root: Configurable) extends org.broadinstitute.gatk.queu
stand_call_conf = config("stand_call_conf", default = 30)
stand_emit_conf = config("stand_emit_conf", default = 0)
}
override def freezeFieldValues(): Unit = {
super.freezeFieldValues()
if (out.getName.endsWith(".vcf.gz")) vcfIndex = new File(out.getAbsolutePath + ".tbi")
}
}
object GenotypeGVCFs {
......@@ -29,6 +39,7 @@ object GenotypeGVCFs {
val gg = new GenotypeGVCFs(root)
gg.variant = gvcfFiles
gg.out = output
if (gg.out.getName.endsWith(".vcf.gz")) gg.vcfIndex = new File(gg.out.getAbsolutePath + ".tbi")
gg
}
}
\ No newline at end of file
......@@ -5,11 +5,18 @@
*/
package nl.lumc.sasc.biopet.extensions.gatk.broad
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Gather, Output}
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType
class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral {
@Gather(enabled = false)
@Output(required = false)
protected var vcfIndex: File = _
override val defaultThreads = 1
min_mapping_quality_score = config("minMappingQualityScore", default = 20)
......@@ -40,6 +47,7 @@ class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.qu
override def freezeFieldValues() {
super.freezeFieldValues()
if (out.getName.endsWith(".vcf.gz")) vcfIndex = new File(out.getAbsolutePath + ".tbi")
if (bamOutput != null && nct.getOrElse(1) > 1) {
logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug")
nCoresRequest = Some(1)
......@@ -47,10 +55,22 @@ class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.qu
nct = Some(getThreads)
memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1))
}
}
object HaplotypeCaller {
def apply(root: Configurable, inputFiles: List[File], outputFile: File): HaplotypeCaller = {
val hc = new HaplotypeCaller(root)
hc.input_file = inputFiles
hc.out = outputFile
if (hc.out.getName.endsWith(".vcf.gz")) hc.vcfIndex = new File(hc.out.getAbsolutePath + ".tbi")
hc
}
def useGvcf() {
emitRefConfidence = org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceMode.GVCF
variant_index_type = GATKVCFIndexType.LINEAR
variant_index_parameter = config("variant_index_parameter", default = 128000)
def gvcf(root: Configurable, inputFile: File, outputFile: File): HaplotypeCaller = {
val hc = apply(root, List(inputFile), outputFile)
hc.emitRefConfidence = org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceMode.GVCF
hc.variant_index_type = GATKVCFIndexType.LINEAR
hc.variant_index_parameter = Some(hc.config("variant_index_parameter", default = 128000).asInt)
hc
}
}
}
\ No newline at end of file
......@@ -8,8 +8,12 @@ package nl.lumc.sasc.biopet.extensions.gatk.broad
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.Output
class IndelRealigner(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.IndelRealigner with GatkGeneral {
@Output
protected var bamIndex: File = _
if (config.contains("scattercount")) scatterCount = config("scattercount")
}
......@@ -19,6 +23,7 @@ object IndelRealigner {
ir.input_file :+= input
ir.targetIntervals = targetIntervals
ir.out = new File(outputDir, input.getName.stripSuffix(".bam") + ".realign.bam")
ir.bamIndex = new File(outputDir, input.getName.stripSuffix(".bam") + ".realign.bai")
ir
}
}
\ No newline at end of file
......@@ -5,9 +5,17 @@
*/
package nl.lumc.sasc.biopet.extensions.gatk.broad
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Gather, Output}
class UnifiedGenotyper(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.UnifiedGenotyper with GatkGeneral {
@Gather(enabled = false)
@Output(required = false)
protected var vcfIndex: File = _
if (config.contains("scattercount")) scatterCount = config("scattercount")
if (config.contains("dbsnp")) this.dbsnp = config("dbsnp")
sample_ploidy = config("ploidy")
......@@ -36,3 +44,14 @@ class UnifiedGenotyper(val root: Configurable) extends org.broadinstitute.gatk.q
memoryLimit = Option(nct.getOrElse(1) * memoryLimit.getOrElse(2.0))
}
}
object UnifiedGenotyper {
def apply(root: Configurable, inputFiles: List[File], outputFile: File): UnifiedGenotyper = {
val ug = new UnifiedGenotyper(root)
ug.input_file = inputFiles
ug.out = outputFile
if (ug.out.getName.endsWith(".vcf.gz")) ug.vcfIndex = new File(ug.out.getAbsolutePath + ".tbi")
ug
}
}
\ No newline at end of file
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import org.broadinstitute.gatk.queue.QScript
import scala.util.Random
class GatkBenchmarkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Sample gvcf file")
var sampleGvcf: File = _
@Argument(doc = "SampleName", required = true)
var sampleName: String = _
@Input(doc = "Gvcf files", shortName = "I", required = false)
var gvcfFiles: List[File] = Nil
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
def init() {
if (config.contains("gvcffiles")) for (file <- config("gvcffiles").asList)
gvcfFiles ::= file.toString
}
def biopetScript() {
var todoGvcfs = gvcfFiles
var gvcfPool: List[File] = Nil
addGenotypingPipeline(gvcfPool)
while (todoGvcfs.nonEmpty) {
val index = Random.nextInt(todoGvcfs.size)
gvcfPool ::= todoGvcfs(index)
addGenotypingPipeline(gvcfPool)
todoGvcfs = todoGvcfs.filter(b => b != todoGvcfs(index))
}
}
def addGenotypingPipeline(gvcfPool: List[File]) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = sampleGvcf :: gvcfPool
gatkGenotyping.samples :+= sampleName
gatkGenotyping.outputDir = new File(outputDir, "samples_" + gvcfPool.size)
gatkGenotyping.init()
gatkGenotyping.biopetScript()
addAll(gatkGenotyping.functions)
}
}
object GatkBenchmarkGenotyping extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ GenotypeGVCFs, SelectVariants }
import org.broadinstitute.gatk.queue.QScript
class GatkGenotyping(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Gvcf files", shortName = "I")
var inputGvcfs: List[File] = Nil
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference")
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = "genotype"
@Output(doc = "OutputFile", shortName = "O", required = false)
var outputFile: File = _
@Argument(doc = "Samples", shortName = "sample", required = false)
var samples: List[String] = Nil
def init() {
require(outputName != null, "Outputname is null")
if (outputFile == null) outputFile = new File(outputDir, outputName + ".vcf.gz")
}
def biopetScript() {
addGenotypeGVCFs(inputGvcfs, outputFile)
if (samples.nonEmpty) {
if (samples.size > 1) addSelectVariants(outputFile, samples, new File(outputDir, "samples/"), "all")
for (sample <- samples) addSelectVariants(outputFile, List(sample), new File(outputDir, "samples/"), sample)
}
}
def addGenotypeGVCFs(gvcfFiles: List[File], outputFile: File): File = {
val genotypeGVCFs = GenotypeGVCFs(this, gvcfFiles, outputFile)
add(genotypeGVCFs)
genotypeGVCFs.out
}
def addSelectVariants(inputFile: File, samples: List[String], outputDir: File, name: String) {
val selectVariants = SelectVariants(this, inputFile, new File(outputDir, name + ".vcf.gz"))
selectVariants.excludeNonVariants = true
for (sample <- samples) selectVariants.sample_name :+= sample
add(selectVariants)
}
}
object GatkGenotyping extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ CombineGVCFs, CombineVariants }
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, SamToFastq }
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript with SummaryQScript {
qscript =>
def this() = this(null)
@Argument(doc = "Skip Genotyping step", shortName = "skipgenotyping", required = false)
var skipGenotyping: Boolean = config("skip_genotyping", default = false)
/** Merge gvcfs */
var mergeGvcfs: Boolean = config("merge_gvcfs", default = false)
/** Joint variantcalling */
var jointVariantcalling: Boolean = config("joint_variantcalling", default = false)
/** Joint genotyping */
var jointGenotyping: Boolean = config("joint_genotyping", default = false)
var singleSampleCalling = config("single_sample_calling", default = true)
var reference: File = config("reference")
var useAllelesOption: Boolean = config("use_alleles_option", default = false)
val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList
def summaryFile = new File(outputDir, "GatkPipeline.summary.json")
//TODO: Add summary
def summaryFiles = Map()
//TODO: Add summary
def summarySettings = Map()
def makeSample(id: String) = new Sample(id)
class Sample(sampleId: String) extends AbstractSample(sampleId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
def makeLibrary(id: String) = new Library(id)
class Library(libId: String) extends AbstractLibrary(libId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
/** Library variantcalling */
val gatkVariantcalling = new GatkVariantcalling(qscript)
gatkVariantcalling.doublePreProces = false
gatkVariantcalling.sampleID = sampleId
gatkVariantcalling.outputDir = new File(libDir, "variantcalling")
protected def addJobs(): Unit = {
val bamFile: Option[File] = if (config.contains("R1")) {
mapping.input_R1 = config("R1")
mapping.input_R2 = config("R2")
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to curent function pool
Some(mapping.finalBamFile)
} else if (config.contains("bam")) {
var bamFile: File = config("bam")
if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile)
if (config("bam_to_fastq", default = false).asBoolean) {
val samToFastq = SamToFastq(qscript, bamFile, libDir + sampleId + "-" + libId + ".R1.fastq",
libDir + sampleId + "-" + libId + ".R2.fastq")
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = Some(samToFastq.fastqR2)
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to curent function pool
Some(mapping.finalBamFile)
} else {
var readGroupOke = true
val inputSam = SamReaderFactory.makeDefault.open(bamFile)
val header = inputSam.getFileHeader.getReadGroups
for (readGroup <- inputSam.getFileHeader.getReadGroups) {
if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
if (readGroup.getSample != sampleId || readGroup.getLibrary != libId) readGroupOke = false
}
inputSam.close()
if (!readGroupOke) {
if (config("correct_readgroups", default = false)) {
logger.info("Correcting readgroups, file:" + bamFile)
val aorrg = AddOrReplaceReadGroups(qscript, bamFile, new File(libDir + sampleId + "-" + libId + ".bam"))
aorrg.RGID = sampleId + "-" + libId
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.isIntermediate = true
qscript.add(aorrg)
bamFile = aorrg.output
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
}
addAll(BamMetrics(qscript, bamFile, libDir + "metrics/").functions)
Some(bamFile)
}
} else {
logger.error("Sample: " + sampleId + ": No R1 found for run: " + libId)
None
}
if (bamFile.isDefined) {
gatkVariantcalling.inputBams = List(bamFile.get)
gatkVariantcalling.variantcalling = config("library_variantcalling", default = false)
gatkVariantcalling.init()
gatkVariantcalling.biopetScript()
addAll(gatkVariantcalling.functions)
}
addSummaryQScript(mapping)
}
}
/** sample variantcalling */
val gatkVariantcalling = new GatkVariantcalling(qscript)
gatkVariantcalling.sampleID = sampleId
gatkVariantcalling.outputDir = new File(sampleDir, "variantcalling")
protected def addJobs(): Unit = {
addPerLibJobs()
gatkVariantcalling.inputBams = libraries.map(_._2.mapping.finalBamFile).toList
gatkVariantcalling.preProcesBams = false
if (!singleSampleCalling) {
gatkVariantcalling.useHaplotypecaller = false
gatkVariantcalling.useUnifiedGenotyper = false
}
gatkVariantcalling.init()
gatkVariantcalling.biopetScript()
addAll(gatkVariantcalling.functions)
gatkVariantcalling.inputBams.foreach(x => addAll(Bam2Wig(qscript, x).functions))
}
}
def init() {
}
val multisampleVariantcalling = new GatkVariantcalling(this) {
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
def addMultiSampleJobs(): Unit = {
val gvcfFiles: List[File] = if (mergeGvcfs && externalGvcfs.size + samples.size > 1) {
val newFile = new File(outputDir, "merged.gvcf.vcf.gz")
add(CombineGVCFs(this, externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile), newFile))
List(newFile)
} else externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile)
if (!skipGenotyping && gvcfFiles.nonEmpty) {
if (jointGenotyping) {
val gatkGenotyping = new GatkGenotyping(this)
gatkGenotyping.inputGvcfs = gvcfFiles
gatkGenotyping.outputDir = new File(outputDir, "genotyping")
gatkGenotyping.init()
gatkGenotyping.biopetScript()
addAll(gatkGenotyping.functions)
var vcfFile = gatkGenotyping.outputFile
}
} else logger.warn("No gVCFs to genotype")
if (jointVariantcalling) {
val allBamfiles = samples.map(_._2.gatkVariantcalling.scriptOutput.bamFiles).toList.fold(Nil)(_ ++ _)
val allRawVcfFiles = samples.map(_._2.gatkVariantcalling.scriptOutput.rawVcfFile).filter(_ != null).toList
val gatkVariantcalling = new GatkVariantcalling(this) {
override def configName = "gatkvariantcalling"
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
if (gatkVariantcalling.useMpileup) {
val cvRaw = CombineVariants(this, allRawVcfFiles.toList, new File(outputDir, "variantcalling/multisample.raw.vcf.gz"))
add(cvRaw)
gatkVariantcalling.rawVcfInput = cvRaw.out
}
multisampleVariantcalling.preProcesBams = false
multisampleVariantcalling.doublePreProces = false
multisampleVariantcalling.inputBams = allBamfiles.toList
multisampleVariantcalling.outputDir = new File(outputDir, "variantcalling")
multisampleVariantcalling.outputName = "multisample"
multisampleVariantcalling.init()
multisampleVariantcalling.biopetScript()
addAll(multisampleVariantcalling.functions)
if (config("inputtype", default = "dna").asString != "rna" && config("recalibration", default = false).asBoolean) {
val recalibration = new GatkVariantRecalibration(this)
recalibration.inputVcf = multisampleVariantcalling.scriptOutput.finalVcfFile
recalibration.bamFiles = allBamfiles
recalibration.outputDir = new File(outputDir, "recalibration")
recalibration.init()
recalibration.biopetScript()
}
}
}
}
object GatkPipeline extends PipelineCommand
/**
* Due to the license issue with GATK, this part of Biopet can only be used inside the
* LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
* on how to use this protected part of biopet or contact us at sasc@lumc.nl
*/
package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.extensions.gatk.broad.{ ApplyRecalibration, VariantAnnotator, VariantRecalibrator }
import org.broadinstitute.gatk.queue.QScript
class GatkVariantRecalibration(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "input vcf file", shortName = "I")