Commit 14d8d45f authored by bow's avatar bow
Browse files

Merge branch 'develop' into feature-gentrap

Conflicts:
	public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsMpileup.scala
	public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
parents 1c9c025e 118f09fd
package nl.lumc.sasc.biopet.pipelines.shiva
import java.io.File
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, SamToFastq, MarkDuplicates }
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import scala.collection.JavaConversions._
/**
* This is a trait for the Shiva pipeline
*
* Created by pjvan_thof on 2/26/15.
*/
trait ShivaTrait extends MultiSampleQScript with SummaryQScript {
qscript =>
/** Executed before running the script; no Shiva-specific setup is needed here. */
def init: Unit = {
}

/** Method to add jobs: schedules all per-sample jobs, then the summary jobs. */
def biopetScript: Unit = {
  addSamplesJobs()
  addSummaryJobs
}
/**
 * Method to make the variantcalling submodule of shiva.
 * When `multisample` is set, the submodule gets the "multisample" name prefix
 * and an extra "multisample" entry on its config path; otherwise only the
 * config name is overridden.
 */
def makeVariantcalling(multisample: Boolean = false): ShivaVariantcallingTrait = {
  if (!multisample) new ShivaVariantcalling(qscript) {
    override def configName = "shivavariantcalling"
  }
  else new ShivaVariantcalling(qscript) {
    override def namePrefix = "multisample"
    override def configName = "shivavariantcalling"
    override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
  }
}
/** Method to make a sample; subclasses can override this to extend Sample. */
def makeSample(id: String) = new Sample(id)
/** Class that will generate jobs for a sample */
class Sample(sampleId: String) extends AbstractSample(sampleId) {
/** Sample specific files to add to summary: the preprocessed bam when one exists. */
def summaryFiles: Map[String, File] =
  preProcessBam.map(bam => Map("bamFile" -> bam)).getOrElse(Map())
/** Sample specific stats to add to summary; none for the base implementation. */
def summaryStats: Map[String, Any] = Map()

/** Method to make a library; subclasses can override this to extend Library. */
def makeLibrary(id: String) = new Library(id)
/** Class to generate jobs for a library */
class Library(libId: String) extends AbstractLibrary(libId) {
/**
 * Library specific files to add to the summary.
 * The preprocessed bam is only reported when the mapped bam itself is present
 * (matching the original tuple-match: (None, Some(_)) yielded an empty map).
 */
def summaryFiles: Map[String, File] =
  bamFile.map { bam =>
    Map("bamFile" -> bam) ++ preProcessBam.map("preProcessBam" -> _)
  }.getOrElse(Map())
/** Library specific stats to add to summary; none for the base implementation. */
def summaryStats: Map[String, Any] = Map()

/** Method to execute library preprocess; hook for subclasses, no-op (None) here. */
def preProcess(input: File): Option[File] = None
/**
 * Method to make the mapping submodule for this library.
 * @return (mapping pipeline, its final bam file, optional preprocessed bam)
 */
def makeMapping = {
  val mapping = new Mapping(qscript)
  mapping.sampleId = Some(sampleId)
  mapping.libId = Some(libId)
  mapping.outputDir = libDir
  // Output files are named "<sample>-<lib>.*"
  mapping.outputName = sampleId + "-" + libId
  (Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile))
}
// Resolve, per library, how the bam is obtained:
//  - mapping:       the Mapping sub-pipeline (None when an existing bam is used directly)
//  - bamFile:       the aligned bam expected for this library
//  - preProcessBam: result of preProcess() on that bam (None by default)
lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) =
  (config.contains("R1"), config.contains("bam")) match {
    case (true, _) => makeMapping // Default starting from fastq files
    case (false, true) => // Starting from bam file
      config("bam_to_fastq", default = false).asBoolean match {
        case true => makeMapping // bam file will be converted to fastq
        case false => {
          // Use the supplied bam directly; addJobs() links it to this path.
          val file = new File(libDir, sampleId + "-" + libId + ".final.bam")
          (None, Some(file), preProcess(file))
        }
      }
    case _ => (None, None, None) // No input found for this library
  }
// Optional per-library variantcalling, enabled via "library_variantcalling"
// (off by default) and only when some bam is available for this library.
lazy val variantcalling = if (config("library_variantcalling", default = false).asBoolean &&
  (bamFile.isDefined || preProcessBam.isDefined)) {
  Some(makeVariantcalling(multisample = false))
} else None
/**
 * This will add jobs for this library.
 * Input resolution mirrors the (mapping, bamFile, preProcessBam) lazy val:
 * fastq input goes through Mapping; an existing bam is either converted back
 * to fastq and remapped, or validated/linked in place.
 */
def addJobs(): Unit = {
  (config.contains("R1"), config.contains("bam")) match {
    case (true, _) => mapping.foreach(mapping => {
      // Start from fastq files; R2 comes from the config as well.
      mapping.input_R1 = config("R1")
      mapping.input_R2 = config("R2")
    })
    case (false, true) => config("bam_to_fastq", default = false).asBoolean match {
      case true => {
        // Convert the supplied bam back to fastq and remap it.
        val samToFastq = SamToFastq(qscript, config("bam"),
          new File(libDir, sampleId + "-" + libId + ".R1.fastq"),
          new File(libDir, sampleId + "-" + libId + ".R2.fastq"))
        samToFastq.isIntermediate = true
        qscript.add(samToFastq)
        mapping.foreach(mapping => {
          mapping.input_R1 = samToFastq.fastqR1
          mapping.input_R2 = Some(samToFastq.fastqR2)
        })
      }
      case false => {
        // Use the bam as-is, but first validate its readgroup metadata.
        val inputSam = SamReaderFactory.makeDefault.open(config("bam"))
        val readGroups = inputSam.getFileHeader.getReadGroups
        val readGroupOke = readGroups.forall(readGroup => {
          if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
          if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
          readGroup.getSample == sampleId && readGroup.getLibrary == libId
        })
        inputSam.close

        if (!readGroupOke) {
          if (config("correct_readgroups", default = false).asBoolean) {
            // Rewrite the readgroups so they match this sample/library.
            logger.info("Correcting readgroups, file:" + config("bam"))
            val aorrg = AddOrReplaceReadGroups(qscript, config("bam"), bamFile.get)
            aorrg.RGID = sampleId + "-" + libId
            aorrg.RGLB = libId
            aorrg.RGSM = sampleId
            aorrg.isIntermediate = true
            qscript.add(aorrg)
          } else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
            "\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
        } else {
          // Readgroups are fine: link the bam (and its index) into the library dir.
          val oldBamFile: File = config("bam")
          val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
          val newIndex: File = new File(libDir, oldBamFile.getName.stripSuffix(".bam") + ".bai")
          val baiLn = Ln(qscript, oldIndex, newIndex)
          add(baiLn)

          val bamLn = Ln(qscript, oldBamFile, bamFile.get)
          bamLn.deps :+= baiLn.out // Link the index before the bam.
          add(bamLn)
        }
      }
    }
    case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found")
  }

  mapping.foreach(mapping => {
    mapping.init
    mapping.biopetScript
    addAll(mapping.functions) // Add functions of mapping to current function pool
    addSummaryQScript(mapping)
  })

  variantcalling.foreach(vc => {
    // BUGFIX: sampleId and libId were swapped here (vc.sampleId was set to the
    // library id and vice versa); the sample-level addJobs shows the correct form.
    vc.sampleId = Some(sampleId)
    vc.libId = Some(libId)
    vc.outputDir = new File(libDir, "variantcalling")
    // Prefer the preprocessed bam when one exists.
    if (preProcessBam.isDefined) vc.inputBams = preProcessBam.get :: Nil
    else vc.inputBams = bamFile.get :: Nil
    vc.init
    vc.biopetScript
    addAll(vc.functions)
    addSummaryQScript(vc)
  })
}
}
/**
 * This will add jobs for the double preprocessing of the per-library bams.
 * A single input bam is just linked into the sample dir; multiple bams are
 * merged and deduplicated with Picard MarkDuplicates.
 *
 * @param input per-library bam files to combine
 * @param isIntermediate whether the merged bam may be deleted after use
 * @return the sample-level bam, or None when there is no input
 */
protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = {
  if (input == Nil) None
  else if (input.tail == Nil) {
    // Single bam: no merge needed, link the bam and its index into the sample dir.
    val bamFile = new File(sampleDir, input.head.getName)
    val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
    val newIndex: File = new File(sampleDir, input.head.getName.stripSuffix(".bam") + ".bai")
    val baiLn = Ln(qscript, oldIndex, newIndex)
    add(baiLn)

    val bamLn = Ln(qscript, input.head, bamFile)
    bamLn.deps :+= baiLn.out // Link the index before the bam.
    add(bamLn)
    Some(bamFile)
  } else {
    // Multiple bams: merge + dedup with Picard MarkDuplicates.
    val md = new MarkDuplicates(qscript)
    md.input = input
    md.output = new File(sampleDir, sampleId + ".dedup.bam")
    // BUGFIX: the metrics file used the exact same path as the bam output
    // (".dedup.bam" twice), so the metrics would clobber the deduped bam.
    md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics")
    md.isIntermediate = isIntermediate
    add(md)
    addSummarizable(md, "mark_duplicates")
    Some(md.output)
  }
}
// Sample-level bam: collect one bam per library (preferring its preprocessed
// bam over the raw mapped bam, as in the original tuple-match) and hand the
// list to addDoublePreProcess for linking or merge+dedup.
lazy val preProcessBam: Option[File] =
  addDoublePreProcess(libraries.values.flatMap(lib => lib.preProcessBam.orElse(lib.bamFile)).toList)
// Per-sample variantcalling, enabled via "single_sample_variantcalling" (off by default).
// NOTE(review): this passes multisample = true, which selects the "multisample"
// name prefix and config path even though this is a single-sample run — confirm
// this is intended (the library-level variantcalling uses multisample = false).
lazy val variantcalling = if (config("single_sample_variantcalling", default = false).asBoolean) {
  Some(makeVariantcalling(multisample = true))
} else None
/** This will add sample jobs: all per-library jobs, then sample-level QC and optional variantcalling. */
def addJobs(): Unit = {
  addPerLibJobs()
  // Sample-level jobs only run once a merged/preprocessed bam exists.
  if (preProcessBam.isDefined) {
    // Bam QC metrics over the sample bam.
    val bamMetrics = new BamMetrics(qscript)
    bamMetrics.sampleId = Some(sampleId)
    bamMetrics.inputBam = preProcessBam.get
    bamMetrics.outputDir = sampleDir
    bamMetrics.init
    bamMetrics.biopetScript
    addAll(bamMetrics.functions)
    addSummaryQScript(bamMetrics)

    // Optional per-sample variantcalling on the preprocessed bam.
    variantcalling.foreach(vc => {
      vc.sampleId = Some(sampleId)
      vc.outputDir = new File(sampleDir, "variantcalling")
      vc.inputBams = preProcessBam.get :: Nil
      vc.init
      vc.biopetScript
      addAll(vc.functions)
      addSummaryQScript(vc)
    })
  }
}
}
// Multisample (cohort-wide) variantcalling, on by default; the config key name
// "multisample_sample_variantcalling" is kept as-is for config compatibility.
lazy val variantcalling = if (config("multisample_sample_variantcalling", default = true).asBoolean) {
  Some(makeVariantcalling(multisample = true))
} else None
/** This will add the multisample variantcalling over all samples' preprocessed bams. */
def addMultiSampleJobs(): Unit = {
  variantcalling.foreach { vc =>
    vc.outputDir = new File(outputDir, "variantcalling")
    // One input bam per sample that produced a preprocessed bam.
    vc.inputBams = samples.values.flatMap(_.preProcessBam).toList
    vc.init
    vc.biopetScript
    addAll(vc.functions)
    addSummaryQScript(vc)
  }
}
/** Location of summary file */
def summaryFile = new File(outputDir, "Shiva.summary.json")

/** Settings of pipeline for summary; none at the Shiva level itself. */
def summarySettings = Map()

/** Files for the summary; none at the Shiva level itself. */
def summaryFiles = Map()
}
package nl.lumc.sasc.biopet.pipelines.shiva
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.queue.QScript
/**
 * Created by pjvan_thof on 2/26/15.
 */
/** Concrete QScript wrapper around ShivaVariantcallingTrait; root is the parent configurable (null when run standalone). */
class ShivaVariantcalling(val root: Configurable) extends QScript with ShivaVariantcallingTrait {
  def this() = this(null)
}

/** Command-line entry point for running ShivaVariantcalling on its own. */
object ShivaVariantcalling extends PipelineCommand
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.shiva
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, SampleLibraryTag }
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.{ Tabix, Bgzip, Gzip }
import nl.lumc.sasc.biopet.extensions.bcftools.BcftoolsCall
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup
import nl.lumc.sasc.biopet.tools.{ VcfStats, VcfFilter, MpileupToVcf }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.function.CommandLineFunction
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.generic.Sorted
/**
* Created by pjvan_thof on 2/26/15.
*/
trait ShivaVariantcallingTrait extends SummaryQScript with SampleLibraryTag {
qscript =>
@Input(doc = "Bam files (should be deduped bams)", shortName = "BAM", required = true)
var inputBams: List[File] = Nil

/**
 * Name prefix, can override this method if needed.
 * Uses "<sample>-<lib>" when both are set, the sample alone otherwise,
 * and falls back to the "name_prefix" config value when neither is set.
 */
def namePrefix: String = {
  (sampleId, libId) match {
    case (Some(sampleId), Some(libId)) => sampleId + "-" + libId
    case (Some(sampleId), _) => sampleId
    case _ => config("name_prefix")
  }
}
/** Executed before script; nothing to prepare for this pipeline. */
def init: Unit = {
}

/** Final merged output file of all variantcaller modes */
def finalFile = new File(outputDir, namePrefix + ".final.vcf.gz")

/** Variantcallers requested by the config (key "variantcallers") */
protected val configCallers: Set[String] = config("variantcallers")
/** This will add jobs for this pipeline: runs each requested caller and merges their VCFs. */
def biopetScript: Unit = {
  // Report unknown caller names from the config before filtering them out.
  for (cal <- configCallers) {
    if (!callersList.exists(_.name == cal))
      BiopetQScript.addError("variantcaller '" + cal + "' does not exist, possible to use: " + callersList.map(_.name).mkString(", "))
  }

  // Keep only the requested callers, sorted by prio for the merge order below.
  val callers = callersList.filter(x => configCallers.contains(x.name)).sortBy(_.prio)

  require(!inputBams.isEmpty, "No input bams found")
  require(!callers.isEmpty, "must select atleast 1 variantcaller, possible to use: " + callersList.map(_.name).mkString(", "))

  // Merge the per-caller VCFs into the final file; the "VariantCaller" set key
  // records which caller produced each record, prioritized by caller order.
  val cv = new CombineVariants(qscript)
  cv.outputFile = finalFile
  cv.setKey = "VariantCaller"
  cv.genotypeMergeOptions = Some("PRIORITIZE")
  cv.rodPriorityList = callers.map(_.name).mkString(",")
  for (caller <- callers) {
    caller.addJobs()
    cv.addInput(caller.outputFile, caller.name)

    // Per-caller VCF statistics for the summary.
    val vcfStats = new VcfStats(qscript)
    vcfStats.input = caller.outputFile
    vcfStats.setOutputDir(new File(caller.outputDir, "vcfstats"))
    add(vcfStats)
    addSummarizable(vcfStats, namePrefix + "-vcfstats-" + caller.name)
  }
  add(cv)

  // Statistics over the merged final VCF, tagged with the merge set key.
  val vcfStats = new VcfStats(qscript)
  vcfStats.input = finalFile
  vcfStats.setOutputDir(new File(outputDir, "vcfstats"))
  vcfStats.infoTags :+= cv.setKey
  add(vcfStats)
  addSummarizable(vcfStats, namePrefix + "-vcfstats-final")
  addSummaryJobs
}
/** Will generate all available variantcallers */
protected def callersList: List[Variantcaller] = List(new Freebayes, new RawVcf, new Bcftools)

/** General trait for a variantcaller mode */
trait Variantcaller {
  /** Name of mode, this should also be used in the config */
  val name: String

  /** Output dir for this mode */
  def outputDir = new File(qscript.outputDir, name)

  /** Prio in merging in the final file; lower prio sorts earlier in the merge priority list */
  protected val defaultPrio: Int

  /** Prio from the config (key "prio_<name>"), falling back to defaultPrio */
  lazy val prio: Int = config("prio_" + name, default = defaultPrio)

  /** This should add the variantcaller jobs */
  def addJobs()

  /** Final output file of this mode */
  def outputFile: File
}
/** Default freebayes mode: calls variants with freebayes, then bgzips and tabix-indexes the result. */
class Freebayes extends Variantcaller {
  val name = "freebayes"
  protected val defaultPrio = 7

  /** Final output file of this mode */
  def outputFile = new File(outputDir, namePrefix + ".freebayes.vcf.gz")

  def addJobs() {
    // Run freebayes over all input bams, writing an intermediate plain-text VCF.
    val freebayes = new nl.lumc.sasc.biopet.extensions.Freebayes(qscript)
    freebayes.bamfiles = inputBams
    freebayes.outputVcf = new File(outputDir, namePrefix + ".freebayes.vcf")
    freebayes.isIntermediate = true
    add(freebayes)

    //TODO: need piping for this, see also issue #114
    // Compress the intermediate VCF to the final bgzipped output.
    val compress = new Bgzip(qscript)
    compress.input = List(freebayes.outputVcf)
    compress.output = outputFile
    add(compress)

    // Tabix-index the compressed VCF.
    val index = new Tabix(qscript)
    index.input = compress.output
    index.p = Some("vcf")
    add(index)
  }
}
/** default mode of bcftools */
class Bcftools extends Variantcaller {
  val name = "bcftools"
  protected val defaultPrio = 8

  /** Final output file of this mode */
  def outputFile = new File(outputDir, namePrefix + ".bcftools.vcf.gz")

  def addJobs() {
    // samtools mpileup over all input bams; u = uncompressed output for piping.
    val mp = new SamtoolsMpileup(qscript)
    mp.input = inputBams
    mp.u = true

    // bcftools call settings; O/v/c presumably map to the -O/-v/-c flags
    // (compressed VCF output, variants only, consensus caller) — confirm
    // against the BcftoolsCall extension.
    val bt = new BcftoolsCall(qscript)
    bt.O = "z"
    bt.v = true
    bt.c = true
    //TODO: add proper class with piping support, see also issue #114
    // Ad-hoc shell pipeline: mpileup | bcftools call > out.vcf.gz && tabix index.
    add(new CommandLineFunction {
      @Input
      var input = inputBams
      @Output
      var output = outputFile

      def commandLine: String = mp.cmdPipe + " | " + bt.cmdPipeInput + " > " + outputFile + " && tabix -p vcf " + outputFile
    })
  }
}
/** Makes a vcf file from a mpileup without statistics */
class RawVcf extends Variantcaller {
  val name = "raw"

  // This caller is designed as fallback when other variantcallers fail to report
  protected val defaultPrio = Int.MaxValue

  /** Final output file of this mode */
  def outputFile = new File(outputDir, namePrefix + ".raw.vcf.gz")

  def addJobs() {
    // One raw VCF per input bam: mpileup-to-vcf conversion followed by a hard filter.
    val rawFiles = inputBams.map(bamFile => {
      val m2v = new MpileupToVcf(qscript)
      m2v.inputBam = bamFile
      m2v.output = new File(outputDir, bamFile.getName.stripSuffix(".bam") + ".raw.vcf")
      add(m2v)

      // Filter defaults below can be overridden under the "vcffilter" config name.
      val vcfFilter = new VcfFilter(qscript) {
        override def configName = "vcffilter"
        override def defaults = ConfigUtils.mergeMaps(Map("min_sample_depth" -> 8,
          "min_alternate_depth" -> 2,
          "min_samples_pass" -> 1,
          "filter_ref_calls" -> true
        ), super.defaults)
      }
      vcfFilter.inputVcf = m2v.output
      vcfFilter.outputVcf = new File(outputDir, bamFile.getName.stripSuffix(".bam") + ".raw.filter.vcf.gz")
      add(vcfFilter)
      vcfFilter.outputVcf
    })

    // Merge the per-bam filtered VCFs into the final output.
    // setKey "null" presumably suppresses the set attribute (GATK convention — verify),
    // and excludeNonVariants drops records without an alternate call.
    val cv = new CombineVariants(qscript)
    cv.inputFiles = rawFiles
    cv.outputFile = outputFile
    cv.setKey = "null"
    cv.excludeNonVariants = true
    add(cv)
  }
}
/** Location of summary file */
def summaryFile = new File(outputDir, "ShivaVariantcalling.summary.json")

/** Settings for the summary */
def summarySettings = Map("variantcallers" -> configCallers.toList)

/** Files for the summary: one entry per selected variantcaller plus the merged final VCF. */
def summaryFiles: Map[String, File] = {
  // Reuse configCallers instead of re-reading "variantcallers" from the config,
  // keeping this consistent with summarySettings and biopetScript.
  callersList.filter(x => configCallers.contains(x.name)).map(x => (x.name -> x.outputFile)).toMap + ("final" -> finalFile)
}
}
\ No newline at end of file
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
# Set root logger level to ERROR and its only appender to A1.
log4j.rootLogger=ERROR, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n
\ No newline at end of file
package nl.lumc.sasc.biopet.pipelines.shiva
import com.google.common.io.Files
import nl.lumc.sasc.biopet.core.config.Config
import nl.lumc.sasc.biopet.extensions.bwa.BwaMem
import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, MergeSamFiles, SortSam }
import nl.lumc.sasc.biopet.tools.VcfStats
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.broadinstitute.gatk.queue.QSettings
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.{ Test, DataProvider }
/**
* Created by pjvan_thof on 3/2/15.
*/
class ShivaTest extends TestNGSuite with Matchers {
/**
 * Builds a Shiva pipeline instance whose config is `map` merged over the
 * shared ShivaTest.config defaults, with minimal QSettings (run name "test")
 * so the script can be generated in unit tests.
 */
def initPipeline(map: Map[String, Any]): Shiva = {
  new Shiva() {
    override def configName = "shiva"
    override def globalConfig = new Config(ConfigUtils.mergeMaps(map, ShivaTest.config))
    qSettings = new QSettings
    qSettings.runName = "test"
  }
}
@DataProvider(name = "shivaOptions")
def shivaOptions = {
val bool = Array(true, false)
for (