Skip to content
Snippets Groups Projects
Commit d91e8f6f authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'feature-multisample_mapping' into 'develop'

Multisample mapping pipeline

Fixes #225 

See merge request !292
parents 0beb17fd 7d9ff889
No related branches found
No related tags found
No related merge requests found
Showing
with 310 additions and 344 deletions
......@@ -8,7 +8,7 @@ package nl.lumc.sasc.biopet.extensions.gatk.broad
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Gather, Output}
import org.broadinstitute.gatk.utils.commandline.{ Gather, Output }
class GenotypeGVCFs(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.GenotypeGVCFs with GatkGeneral {
......
......@@ -56,36 +56,40 @@ class Shiva(val root: Configurable) extends QScript with ShivaTrait {
("use_indel_realigner" -> useIndelRealigner) +
("use_base_recalibration" -> useBaseRecalibration)
/** This will adds preprocess steps, gatk indel realignment and base recalibration is included here */
override def preProcess(input: File): Option[File] = {
if (!useIndelRealigner && !useBaseRecalibration) None
else {
val indelRealignFile = useIndelRealigner match {
case true => addIndelRealign(input, libDir, useBaseRecalibration || libraries.size > 1)
case false => input
}
useBaseRecalibration match {
case true => Some(addBaseRecalibrator(indelRealignFile, libDir, libraries.size > 1))
case false => Some(indelRealignFile)
}
override def preProcessBam = if (useIndelRealigner && useBaseRecalibration)
bamFile.map(swapExt(libDir, _, ".bam", ".realign.baserecal.bam"))
else if (useIndelRealigner) bamFile.map(swapExt(libDir, _, ".bam", ".realign.bam"))
else if (useBaseRecalibration) bamFile.map(swapExt(libDir, _, ".bam", ".baserecal.bam"))
else bamFile
override def addJobs(): Unit = {
super.addJobs()
if (useIndelRealigner && useBaseRecalibration) {
val file = addIndelRealign(bamFile.get, libDir, isIntermediate = true)
addBaseRecalibrator(file, libDir, libraries.size > 1)
} else if (useIndelRealigner) {
addIndelRealign(bamFile.get, libDir, libraries.size > 1)
} else if (useBaseRecalibration) {
addBaseRecalibrator(bamFile.get, libDir, libraries.size > 1)
}
}
}
override def keepMergedFiles: Boolean = config("keep_merged_files", default = false)
override def summarySettings = super.summarySettings + ("use_indel_realigner" -> useIndelRealigner)
lazy val useIndelRealigner: Boolean = config("use_indel_realigner", default = true)
/** This methods will add double preprocess steps, with GATK indel realignment */
override protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = {
if (input.size <= 1) super.addDoublePreProcess(input)
else super.addDoublePreProcess(input, isIntermediate = useIndelRealigner).collect {
case file =>
useIndelRealigner match {
case true => addIndelRealign(file, sampleDir, isIntermediate = false)
case false => file
}
override def preProcessBam = if (useIndelRealigner && libraries.values.flatMap(_.preProcessBam).size > 1) {
bamFile.map(swapExt(sampleDir, _, ".bam", ".realign.bam"))
} else bamFile
override def addJobs(): Unit = {
super.addJobs()
if (useIndelRealigner && libraries.values.flatMap(_.preProcessBam).size > 1) {
addIndelRealign(bamFile.get, sampleDir, false)
}
}
}
......
......@@ -132,4 +132,13 @@ object Ln {
ln.relative = relative
ln
}
def linkBamFile(root: Configurable, input: File, output: File, index: Boolean = true, relative: Boolean = true): List[Ln] = {
val bamLn = Ln(root, input, output, relative)
bamLn :: (if (index) {
val inputIndex = new File(input.getAbsolutePath.stripSuffix(".bam") + ".bai")
val outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
List(Ln(root, inputIndex, outputIndex, relative))
} else Nil)
}
}
......@@ -105,11 +105,12 @@ class MarkDuplicates(val root: Configurable) extends Picard with Summarizable {
}
object MarkDuplicates {
/** Returns default MarkDuplicates */
def apply(root: Configurable, input: List[File], output: File): MarkDuplicates = {
def apply(root: Configurable, input: List[File], output: File, isIntermediate: Boolean = false): MarkDuplicates = {
val markDuplicates = new MarkDuplicates(root)
markDuplicates.input = input
markDuplicates.output = output
markDuplicates.outputMetrics = new File(output.getParent, output.getName.stripSuffix(".bam") + ".metrics")
markDuplicates.isIntermediate = isIntermediate
markDuplicates
}
}
\ No newline at end of file
......@@ -65,12 +65,14 @@ class MergeSamFiles(val root: Configurable) extends Picard {
object MergeSamFiles {
/** Returns default MergeSamFiles */
def apply(root: Configurable, input: List[File], outputDir: File, sortOrder: String = null): MergeSamFiles = {
def apply(root: Configurable, input: List[File], outputFile: File,
sortOrder: String = null, isIntermediate: Boolean = false): MergeSamFiles = {
val mergeSamFiles = new MergeSamFiles(root)
mergeSamFiles.input = input
mergeSamFiles.output = new File(outputDir, input.head.getName.stripSuffix(".bam").stripSuffix(".sam") + ".merge.bam")
mergeSamFiles.output = outputFile
if (sortOrder == null) mergeSamFiles.sortOrder = "coordinate"
else mergeSamFiles.sortOrder = sortOrder
mergeSamFiles.isIntermediate = isIntermediate
mergeSamFiles
}
}
\ No newline at end of file
......@@ -22,6 +22,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
def publicPipelines: List[MainCommand] = List(
nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep,
nl.lumc.sasc.biopet.pipelines.mapping.Mapping,
nl.lumc.sasc.biopet.pipelines.mapping.MultisampleMapping,
nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap,
nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics,
nl.lumc.sasc.biopet.pipelines.sage.Sage,
......
......@@ -18,16 +18,13 @@ package nl.lumc.sasc.biopet.pipelines.carp
import java.io.File
import nl.lumc.sasc.biopet.core._
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsView
import nl.lumc.sasc.biopet.utils.config._
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.macs2.Macs2CallPeak
import nl.lumc.sasc.biopet.extensions.picard.{ BuildBamIndex, MergeSamFiles }
import nl.lumc.sasc.biopet.extensions.picard.BuildBamIndex
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsView
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.pipelines.mapping.MultisampleMappingTrait
import nl.lumc.sasc.biopet.utils.config._
import org.broadinstitute.gatk.queue.QScript
/**
......@@ -35,7 +32,7 @@ import org.broadinstitute.gatk.queue.QScript
* Chip-Seq analysis pipeline
* This pipeline performs QC,mapping and peak calling
*/
class Carp(val root: Configurable) extends QScript with MultiSampleQScript with SummaryQScript with Reference {
class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait with Reference {
qscript =>
def this() = this(null)
......@@ -44,6 +41,7 @@ class Carp(val root: Configurable) extends QScript with MultiSampleQScript with
"skip_markduplicates" -> false,
"aligner" -> "bwa-mem"
),
"merge_strategy" -> "preprocessmergesam",
"samtoolsview" -> Map("q" -> 10)
)
......@@ -56,97 +54,41 @@ class Carp(val root: Configurable) extends QScript with MultiSampleQScript with
def summaryFile = new File(outputDir, "Carp.summary.json")
//TODO: Add summary
def summaryFiles = Map("reference" -> referenceFasta())
//TODO: Add summary
def summarySettings = Map("reference" -> referenceSummary)
def makeSample(id: String) = new Sample(id)
class Sample(sampleId: String) extends AbstractSample(sampleId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
def makeLibrary(id: String) = new Library(id)
class Library(libId: String) extends AbstractLibrary(libId) {
//TODO: Add summary
def summaryFiles: Map[String, File] = Map()
override def makeSample(id: String) = new Sample(id)
class Sample(sampleId: String) extends super.Sample(sampleId) {
//TODO: Add summary
def summaryStats: Map[String, Any] = Map()
val mapping = new Mapping(qscript)
mapping.libId = Some(libId)
mapping.sampleId = Some(sampleId)
mapping.outputDir = libDir
def addJobs(): Unit = {
if (config.contains("R1")) {
mapping.input_R1 = config("R1")
if (config.contains("R2")) mapping.input_R2 = config("R2")
inputFiles :+= new InputFile(mapping.input_R1, config("R1_md5"))
mapping.input_R2.foreach(inputFiles :+= new InputFile(_, config("R2_md5")))
mapping.init()
mapping.biopetScript()
addAll(mapping.functions)
} else logger.error("Sample: " + sampleId + ": No R1 found for library: " + libId)
addSummaryQScript(mapping)
}
}
override def preProcessBam = Some(createFile(".filter.bam"))
val bamFile = createFile(".bam")
val bamFileFilter = createFile(".filter.bam")
val controls: List[String] = config("control", default = Nil)
def addJobs(): Unit = {
addPerLibJobs()
val bamFiles = libraries.map(_._2.mapping.finalBamFile).toList
if (bamFiles.length == 1) {
add(Ln(qscript, bamFiles.head, bamFile))
val oldIndex = new File(bamFiles.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex = new File(bamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
add(Ln(qscript, oldIndex, newIndex))
} else if (bamFiles.length > 1) {
val merge = new MergeSamFiles(qscript)
merge.input = bamFiles
merge.sortOrder = "coordinate"
merge.output = bamFile
add(merge)
}
val bamMetrics = BamMetrics(qscript, bamFile, new File(sampleDir, "metrics"), sampleId = Some(sampleId))
addAll(bamMetrics.functions)
addSummaryQScript(bamMetrics)
override def summarySettings = super.summarySettings ++ Map("controls" -> controls)
val bamMetricsFilter = BamMetrics(qscript, bamFileFilter, new File(sampleDir, "metrics-filter"), sampleId = Some(sampleId))
addAll(bamMetricsFilter.functions)
bamMetricsFilter.summaryName = "bammetrics-filter"
addSummaryQScript(bamMetricsFilter)
override def addJobs(): Unit = {
super.addJobs()
addAll(Bam2Wig(qscript, bamFile).functions)
addAll(Bam2Wig(qscript, bamFileFilter).functions)
add(Bam2Wig(qscript, bamFile.get))
val samtoolsView = new SamtoolsView(qscript)
samtoolsView.input = bamFile
samtoolsView.output = bamFileFilter
samtoolsView.input = bamFile.get
samtoolsView.output = preProcessBam.get
samtoolsView.b = true
samtoolsView.h = true
add(samtoolsView)
val bamMetricsFilter = BamMetrics(qscript, preProcessBam.get, new File(sampleDir, "metrics-filter"), sampleId = Some(sampleId))
addAll(bamMetricsFilter.functions)
bamMetricsFilter.summaryName = "bammetrics-filter"
addSummaryQScript(bamMetricsFilter)
add(Bam2Wig(qscript, preProcessBam.get))
val buildBamIndex = new BuildBamIndex(qscript)
buildBamIndex.input = bamFileFilter
buildBamIndex.output = swapExt(bamFileFilter.getParent, bamFileFilter, ".bam", ".bai")
buildBamIndex.input = preProcessBam.get
buildBamIndex.output = swapExt(preProcessBam.get.getParentFile, preProcessBam.get, ".bam", ".bai")
add(buildBamIndex)
val macs2 = new Macs2CallPeak(qscript)
macs2.treatment = bamFileFilter
macs2.treatment = preProcessBam.get
macs2.name = Some(sampleId)
macs2.outputdir = sampleDir + File.separator + "macs2" + File.separator + sampleId + File.separator
add(macs2)
......@@ -160,32 +102,22 @@ class Carp(val root: Configurable) extends QScript with MultiSampleQScript with
Some(carp)
}
def init() = {
override def init() = {
super.init()
// ensure that no samples are called 'control' since that is our reserved keyword
require(!sampleIds.contains("control"),
"No sample should be named 'control' since it is a reserved for the Carp pipeline")
}
def biopetScript() {
// Define what the pipeline should do
// First step is QC, this will be done with Flexiprep
// Second step is mapping, this will be done with the Mapping pipeline
// Third step is calling peaks on the bam files produced with the mapping pipeline, this will be done with MACS2
logger.info("Starting CArP pipeline")
addSamplesJobs()
addSummaryJobs()
}
def addMultiSampleJobs(): Unit = {
override def addMultiSampleJobs(): Unit = {
super.addMultiSampleJobs()
for ((sampleId, sample) <- samples) {
for (controlId <- sample.controls) {
if (!samples.contains(controlId))
throw new IllegalStateException("For sample: " + sampleId + " this control: " + controlId + " does not exist")
val macs2 = new Macs2CallPeak(this)
macs2.treatment = sample.bamFileFilter
macs2.control = samples(controlId).bamFileFilter
macs2.treatment = sample.preProcessBam.get
macs2.control = samples(controlId).preProcessBam.get
macs2.name = Some(sampleId + "_VS_" + controlId)
macs2.outputdir = sample.sampleDir + File.separator + "macs2" + File.separator + macs2.name.get + File.separator
add(macs2)
......
......@@ -246,7 +246,7 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S
add(md)
addSummarizable(md, "mark_duplicates")
} else if (skipMarkduplicates && chunking) {
val mergeSamFile = MergeSamFiles(this, bamFiles, outputDir)
val mergeSamFile = MergeSamFiles(this, bamFiles, new File(outputDir, outputName + ".merge.bam"))
add(mergeSamFile)
bamFile = mergeSamFile.output
}
......
package nl.lumc.sasc.biopet.pipelines.mapping
import java.io.File
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.{ PipelineCommand, Reference, MultiSampleQScript }
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, MergeSamFiles, AddOrReplaceReadGroups, SamToFastq }
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
import MultisampleMapping.MergeStrategy
import scala.collection.JavaConversions._
/**
* Created by pjvanthof on 18/12/15.
*/
trait MultisampleMappingTrait extends MultiSampleQScript
with Reference { qscript: QScript =>
def mergeStrategy: MergeStrategy.Value = {
val value: String = config("merge_strategy", default = "preprocessmarkduplicates")
MergeStrategy.values.find(_.toString.toLowerCase == value) match {
case Some(v) => v
case _ => throw new IllegalArgumentException(s"merge_strategy '$value' does not exist")
}
}
def init(): Unit = {
}
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
def addMultiSampleJobs(): Unit = {
// this code will be executed after all code of all samples is executed
}
def summaryFiles: Map[String, File] = Map("referenceFasta" -> referenceFasta())
def summarySettings: Map[String, Any] = Map(
"reference" -> referenceSummary,
"merge_strategy" -> mergeStrategy.toString)
def makeSample(id: String) = new Sample(id)
class Sample(sampleId: String) extends AbstractSample(sampleId) {
def makeLibrary(id: String) = new Library(id)
class Library(libId: String) extends AbstractLibrary(libId) {
def summaryFiles: Map[String, File] = (inputR1.map("input_R1" -> _) :: inputR2.map("input_R2" -> _) ::
inputBam.map("input_bam" -> _) :: bamFile.map("output_bam" -> _) ::
preProcessBam.map("output_preProcessBam" -> _) :: Nil).flatten.toMap
def summaryStats: Map[String, Any] = Map()
lazy val inputR1: Option[File] = MultisampleMapping.fileMustBeAbsolute(config("R1"))
lazy val inputR2: Option[File] = MultisampleMapping.fileMustBeAbsolute(config("R2"))
lazy val inputBam: Option[File] = MultisampleMapping.fileMustBeAbsolute(if (inputR1.isEmpty) config("bam") else None)
lazy val bamToFastq: Boolean = config("bam_to_fastq", default = false)
lazy val correctReadgroups: Boolean = config("correct_readgroups", default = false)
lazy val mapping = if (inputR1.isDefined || (inputBam.isDefined && bamToFastq)) {
val m = new Mapping(qscript)
m.sampleId = Some(sampleId)
m.libId = Some(libId)
m.outputDir = libDir
Some(m)
} else None
def bamFile = mapping match {
case Some(m) => Some(m.finalBamFile)
case _ if inputBam.isDefined => Some(new File(libDir, s"$sampleId-$libId.bam"))
case _ => None
}
def preProcessBam = bamFile
def addJobs(): Unit = {
inputR1.foreach(inputFiles :+= new InputFile(_, config("R1_md5")))
inputR2.foreach(inputFiles :+= new InputFile(_, config("R2_md5")))
inputBam.foreach(inputFiles :+= new InputFile(_, config("bam_md5")))
if (inputR1.isDefined) {
mapping.foreach { m =>
m.input_R1 = inputR1.get
m.input_R2 = inputR2
add(m)
}
} else if (inputBam.isDefined) {
if (bamToFastq) {
val samToFastq = SamToFastq(qscript, inputBam.get,
new File(libDir, sampleId + "-" + libId + ".R1.fq.gz"),
new File(libDir, sampleId + "-" + libId + ".R2.fq.gz"))
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.foreach(m => {
m.input_R1 = samToFastq.fastqR1
m.input_R2 = Some(samToFastq.fastqR2)
add(m)
})
} else {
val inputSam = SamReaderFactory.makeDefault.open(inputBam.get)
val readGroups = inputSam.getFileHeader.getReadGroups
val readGroupOke = readGroups.forall(readGroup => {
if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
readGroup.getSample == sampleId && readGroup.getLibrary == libId
})
inputSam.close()
if (!readGroupOke) {
if (correctReadgroups) {
logger.info("Correcting readgroups, file:" + inputBam.get)
val aorrg = AddOrReplaceReadGroups(qscript, inputBam.get, bamFile.get)
aorrg.RGID = s"$sampleId-$libId"
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.RGPL = "unknown"
aorrg.RGPU = "na"
aorrg.isIntermediate = true
qscript.add(aorrg)
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
} else {
val oldBamFile: File = inputBam.get
val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(libDir, bamFile.get.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, oldBamFile, bamFile.get)
bamLn.deps :+= baiLn.output
add(bamLn)
}
val bamMetrics = new BamMetrics(qscript)
bamMetrics.sampleId = Some(sampleId)
bamMetrics.libId = Some(libId)
bamMetrics.inputBam = bamFile.get
bamMetrics.outputDir = new File(libDir, "metrics")
add(bamMetrics)
}
} else logger.warn(s"Sample '$sampleId' does not have any input files")
}
}
def summaryFiles: Map[String, File] = (bamFile.map("output_bam" -> _) ::
preProcessBam.map("output_preProcessBam" -> _) :: Nil).flatten.toMap
def summaryStats: Map[String, Any] = Map()
def bamFile = if (libraries.flatMap(_._2.bamFile).nonEmpty &&
mergeStrategy != MultisampleMapping.MergeStrategy.None)
Some(new File(sampleDir, s"$sampleId.bam"))
else None
def preProcessBam = bamFile
def keepMergedFiles: Boolean = config("keep_merged_files", default = true)
def addJobs(): Unit = {
addPerLibJobs() // This add jobs for each library
mergeStrategy match {
case MergeStrategy.None =>
case (MergeStrategy.MergeSam | MergeStrategy.MarkDuplicates) if libraries.flatMap(_._2.bamFile).size == 1 =>
add(Ln.linkBamFile(qscript, libraries.flatMap(_._2.bamFile).head, bamFile.get): _*)
case (MergeStrategy.PreProcessMergeSam | MergeStrategy.PreProcessMarkDuplicates) if libraries.flatMap(_._2.preProcessBam).size == 1 =>
add(Ln.linkBamFile(qscript, libraries.flatMap(_._2.preProcessBam).head, bamFile.get): _*)
case MergeStrategy.MergeSam =>
add(MergeSamFiles(qscript, libraries.flatMap(_._2.bamFile).toList, bamFile.get, isIntermediate = keepMergedFiles))
case MergeStrategy.PreProcessMergeSam =>
add(MergeSamFiles(qscript, libraries.flatMap(_._2.preProcessBam).toList, bamFile.get, isIntermediate = keepMergedFiles))
case MergeStrategy.MarkDuplicates =>
add(MarkDuplicates(qscript, libraries.flatMap(_._2.bamFile).toList, bamFile.get, isIntermediate = keepMergedFiles))
case MergeStrategy.PreProcessMarkDuplicates =>
add(MarkDuplicates(qscript, libraries.flatMap(_._2.preProcessBam).toList, bamFile.get, isIntermediate = keepMergedFiles))
case _ => throw new IllegalStateException("This should not be possible, unimplemented MergeStrategy?")
}
if (mergeStrategy != MergeStrategy.None && libraries.flatMap(_._2.bamFile).nonEmpty) {
val bamMetrics = new BamMetrics(qscript)
bamMetrics.sampleId = Some(sampleId)
bamMetrics.inputBam = preProcessBam.get
bamMetrics.outputDir = new File(sampleDir, "metrics")
add(bamMetrics)
}
}
}
}
class MultisampleMapping(val root: Configurable) extends QScript with MultisampleMappingTrait {
def this() = this(null)
def summaryFile: File = new File(outputDir, "MultisamplePipeline.summary.json")
}
object MultisampleMapping extends PipelineCommand {
object MergeStrategy extends Enumeration {
val None, MergeSam, MarkDuplicates, PreProcessMergeSam, PreProcessMarkDuplicates = Value
}
def fileMustBeAbsolute(file: Option[File]): Option[File] = {
if (file.forall(_.isAbsolute)) file
else {
Logging.addError(s"$file should be a absolute file path")
file.map(_.getAbsoluteFile)
}
}
}
......@@ -125,7 +125,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript {
val bamFile: File = if (libraryBamfiles.size == 1) libraryBamfiles.head
else if (libraryBamfiles.size > 1) {
val mergeSamFiles = MergeSamFiles(qscript, libraryBamfiles, sampleDir)
val mergeSamFiles = MergeSamFiles(qscript, libraryBamfiles, new File(sampleDir, s"$sampleId.bam"))
qscript.add(mergeSamFiles)
mergeSamFiles.output
} else null
......
......@@ -15,35 +15,18 @@
*/
package nl.lumc.sasc.biopet.pipelines.shiva
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, Reference }
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, SamToFastq }
import nl.lumc.sasc.biopet.pipelines.bammetrics.{ TargetRegions, BamMetrics }
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.core.Reference
import nl.lumc.sasc.biopet.pipelines.bammetrics.TargetRegions
import nl.lumc.sasc.biopet.pipelines.mapping.{ MultisampleMappingTrait }
import nl.lumc.sasc.biopet.pipelines.toucan.Toucan
import nl.lumc.sasc.biopet.utils.Logging
import org.broadinstitute.gatk.queue.QScript
import scala.collection.JavaConversions._
/**
* This is a trait for the Shiva pipeline
*
* Created by pjvan_thof on 2/26/15.
*/
trait ShivaTrait extends MultiSampleQScript with Reference with TargetRegions { qscript: QScript =>
/** Executed before running the script */
def init(): Unit = {
}
/** Method to add jobs */
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
trait ShivaTrait extends MultisampleMappingTrait with Reference with TargetRegions { qscript: QScript =>
override def reportClass = {
val shiva = new ShivaReport(this)
......@@ -65,167 +48,29 @@ trait ShivaTrait extends MultiSampleQScript with Reference with TargetRegions {
}
/** Method to make a sample */
def makeSample(id: String) = new Sample(id)
override def makeSample(id: String) = new this.Sample(id)
/** Class that will generate jobs for a sample */
class Sample(sampleId: String) extends AbstractSample(sampleId) {
/** Sample specific files to add to summary */
def summaryFiles: Map[String, File] = {
preProcessBam match {
case Some(b) => Map("preProcessBam" -> b)
case _ => Map()
}
}
/** Sample specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
class Sample(sampleId: String) extends super.Sample(sampleId) {
/** Method to make a library */
def makeLibrary(id: String) = new Library(id)
override def makeLibrary(id: String) = new this.Library(id)
/** Sample specific settings */
override def summarySettings = Map("single_sample_variantcalling" -> variantcalling.isDefined)
/** Class to generate jobs for a library */
class Library(libId: String) extends AbstractLibrary(libId) {
/** Library specific files to add to the summary */
def summaryFiles: Map[String, File] = {
((bamFile, preProcessBam) match {
case (Some(b), Some(pb)) => Map("bamFile" -> b, "preProcessBam" -> pb)
case (Some(b), _) => Map("bamFile" -> b, "preProcessBam" -> b)
case _ => Map()
}) ++ (inputR1.map("input_R1" -> _) ::
inputR2.map("input_R2" -> _) ::
inputBam.map("input_bam" -> _) :: Nil).flatten.toMap
}
/** Library specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to execute library preprocess */
def preProcess(input: File): Option[File] = None
class Library(libId: String) extends super.Library(libId) {
/** Library specific settings */
override def summarySettings = Map("library_variantcalling" -> variantcalling.isDefined)
/** Method to make the mapping submodule */
def makeMapping = {
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
mapping.outputName = sampleId + "-" + libId
(Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile))
}
def fileMustBeAbsulute(file: Option[File]): Option[File] = {
if (file.forall(_.isAbsolute)) file
else {
Logging.addError(s"$file for $sampleId / $libId should be a absolute file path")
file.map(_.getAbsoluteFile)
}
}
lazy val inputR1: Option[File] = fileMustBeAbsulute(config("R1"))
lazy val inputR2: Option[File] = fileMustBeAbsulute(config("R2"))
lazy val inputBam: Option[File] = fileMustBeAbsulute(if (inputR1.isEmpty) config("bam") else None)
lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) =
(inputR1.isDefined, inputBam.isDefined) match {
case (true, _) => makeMapping // Default starting from fastq files
case (false, true) => // Starting from bam file
config("bam_to_fastq", default = false).asBoolean match {
case true => makeMapping // bam file will be converted to fastq
case false =>
val file = new File(libDir, sampleId + "-" + libId + ".final.bam")
(None, Some(file), preProcess(file))
}
case _ => (None, None, None)
}
lazy val variantcalling = if (config("library_variantcalling", default = false).asBoolean &&
(bamFile.isDefined || preProcessBam.isDefined)) {
Some(makeVariantcalling(multisample = false))
} else None
/** This will add jobs for this library */
def addJobs(): Unit = {
(inputR1.isDefined, inputBam.isDefined) match {
case (true, _) => mapping.foreach(mapping => {
mapping.input_R1 = inputR1.get
mapping.input_R2 = inputR2
inputFiles :+= new InputFile(mapping.input_R1, config("R1_md5"))
mapping.input_R2.foreach(inputFiles :+= new InputFile(_, config("R2_md5")))
})
case (false, true) => {
inputFiles :+= new InputFile(inputBam.get, config("bam_md5"))
config("bam_to_fastq", default = false).asBoolean match {
case true =>
val samToFastq = SamToFastq(qscript, inputBam.get,
new File(libDir, sampleId + "-" + libId + ".R1.fq.gz"),
new File(libDir, sampleId + "-" + libId + ".R2.fq.gz"))
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.foreach(mapping => {
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = Some(samToFastq.fastqR2)
})
case false =>
val inputSam = SamReaderFactory.makeDefault.open(inputBam.get)
val readGroups = inputSam.getFileHeader.getReadGroups
val readGroupOke = readGroups.forall(readGroup => {
if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
readGroup.getSample == sampleId && readGroup.getLibrary == libId
})
inputSam.close()
if (!readGroupOke) {
if (config("correct_readgroups", default = false).asBoolean) {
logger.info("Correcting readgroups, file:" + inputBam.get)
val aorrg = AddOrReplaceReadGroups(qscript, inputBam.get, bamFile.get)
aorrg.RGID = sampleId + "-" + libId
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.RGPL = "unknown"
aorrg.RGPU = "na"
aorrg.isIntermediate = true
qscript.add(aorrg)
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
} else {
val oldBamFile: File = inputBam.get
val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(libDir, bamFile.get.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, oldBamFile, bamFile.get)
bamLn.deps :+= baiLn.output
add(bamLn)
val bamMetrics = new BamMetrics(qscript)
bamMetrics.sampleId = Some(sampleId)
bamMetrics.libId = Some(libId)
bamMetrics.inputBam = bamFile.get
bamMetrics.outputDir = new File(libDir, "metrics")
bamMetrics.init()
bamMetrics.biopetScript()
addAll(bamMetrics.functions)
addSummaryQScript(bamMetrics)
}
}
}
case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found")
}
mapping.foreach(mapping => {
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to curent function pool
addSummaryQScript(mapping)
})
override def addJobs = {
super.addJobs()
variantcalling.foreach(vc => {
vc.sampleId = Some(sampleId)
......@@ -241,59 +86,15 @@ trait ShivaTrait extends MultiSampleQScript with Reference with TargetRegions {
}
}
/** This will add jobs for the double preprocessing */
protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = {
if (input == Nil) None
else if (input.tail == Nil) {
val bamFile = new File(sampleDir, s"$sampleId.bam")
val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(sampleDir, s"$sampleId.bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, input.head, bamFile)
bamLn.deps :+= baiLn.output
add(bamLn)
Some(bamFile)
} else {
val md = new MarkDuplicates(qscript)
md.input = input
md.output = new File(sampleDir, sampleId + ".dedup.bam")
md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics")
//FIXME: making this file intermediate make the pipeline restart unnessery jobs
//md.isIntermediate = isIntermediate
add(md)
addSummarizable(md, "mark_duplicates")
Some(md.output)
}
}
lazy val preProcessBam: Option[File] = addDoublePreProcess(libraries.flatMap(lib => {
(lib._2.bamFile, lib._2.preProcessBam) match {
case (_, Some(file)) => Some(file)
case (Some(file), _) => Some(file)
case _ => None
}
}).toList)
lazy val variantcalling = if (config("single_sample_variantcalling", default = false).asBoolean) {
Some(makeVariantcalling(multisample = false))
} else None
/** This will add sample jobs */
def addJobs(): Unit = {
addPerLibJobs()
override def addJobs(): Unit = {
super.addJobs()
preProcessBam.foreach { bam =>
val bamMetrics = new BamMetrics(qscript)
bamMetrics.sampleId = Some(sampleId)
bamMetrics.inputBam = bam
bamMetrics.outputDir = new File(sampleDir, "metrics")
bamMetrics.init()
bamMetrics.biopetScript()
addAll(bamMetrics.functions)
addSummaryQScript(bamMetrics)
variantcalling.foreach(vc => {
vc.sampleId = Some(sampleId)
vc.outputDir = new File(sampleDir, "variantcalling")
......@@ -321,7 +122,9 @@ trait ShivaTrait extends MultiSampleQScript with Reference with TargetRegions {
} else None
/** This will add the mutisample variantcalling */
def addMultiSampleJobs(): Unit = {
override def addMultiSampleJobs = {
super.addMultiSampleJobs()
multisampleVariantCalling.foreach(vc => {
vc.outputDir = new File(outputDir, "variantcalling")
vc.inputBams = samples.flatMap { case (sampleId, sample) => sample.preProcessBam.map(sampleId -> _) }
......@@ -354,15 +157,11 @@ trait ShivaTrait extends MultiSampleQScript with Reference with TargetRegions {
def summaryFile = new File(outputDir, "Shiva.summary.json")
/** Settings of pipeline for summary */
def summarySettings = Map(
"reference" -> referenceSummary,
override def summarySettings = super.summarySettings ++ Map(
"annotation" -> annotation.isDefined,
"multisample_variantcalling" -> multisampleVariantCalling.isDefined,
"sv_calling" -> svCalling.isDefined,
"regions_of_interest" -> roiBedFiles.map(_.getName.stripSuffix(".bed")),
"amplicon_bed" -> ampliconBedFile.map(_.getName.stripSuffix(".bed"))
)
/** Files for the summary */
def summaryFiles = Map("referenceFasta" -> referenceFasta())
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment