Commit 67ec1f2f authored by bow's avatar bow
Browse files

Merge branch 'review-kraken' into 'develop'

Review kraken / gears



See merge request !205
parents 1658c07f 9d3bdc2a
......@@ -6,7 +6,7 @@
package nl.lumc.sasc.biopet.core
object BiopetExecutableProtected extends BiopetExecutable {
def pipelines: List[MainCommand] = BiopetExecutablePublic.protectedPipelines ::: List(
def pipelines: List[MainCommand] = BiopetExecutablePublic.pipelines ::: List(
nl.lumc.sasc.biopet.pipelines.gatk.Shiva,
nl.lumc.sasc.biopet.pipelines.gatk.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.gatk.Basty)
......
......@@ -28,6 +28,15 @@ class Kraken(val root: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Input: FastQ or FastA")
var input: List[File] = _
@Output(doc = "Unidentified reads", required = false)
var unclassified_out: Option[File] = None
@Output(doc = "Identified reads", required = false)
var classified_out: Option[File] = None
@Output(doc = "Output with hits per sequence")
var output: File = _
var db: File = config("db")
var inputFastQ: Boolean = true
......@@ -36,55 +45,37 @@ class Kraken(val root: Configurable) extends BiopetCommandLineFunction {
var compressionBzip: Boolean = false
var quick: Boolean = false
var min_hits: Option[Int] = config("min_hits")
@Output(doc = "Unidentified reads", required = false)
var unclassified_out: Option[File] = None
@Output(doc = "Identified reads", required = false)
var classified_out: Option[File] = None
var minHits: Option[Int] = config("min_hits")
@Output(doc = "Output with hits per sequence")
var output: File = _
var preload: Boolean = config("preload", default = true)
var preLoad: Boolean = config("preload", default = true)
var paired: Boolean = config("paired", default = false)
executable = config("exe", default = "kraken")
override def versionRegex = """Kraken version ([\d\w\-\.]+)\n.*""".r
override def versionExitcode = List(0, 1)
override def versionCommand = executable + " --version"
override def defaultCoreMemory = 8.0
override def defaultThreads = 4
override def versionCommand = executable + " --version"
/** Sets readgroup when not set yet */
override def beforeGraph(): Unit = {
super.beforeGraph()
//FIXME: This does not do anything
}
/** Returns command to execute */
def cmdLine = {
var cmd: String = required(executable) +
"--db" + required(db) +
optional("--threads", nCoresRequest) +
conditional(inputFastQ, "--fastq-input") +
conditional(!inputFastQ, "--fasta-input") +
conditional(quick, "--quick")
min_hits match {
case Some(v) => cmd += "--min_hits " + v
case _ => cmd += ""
}
cmd += optional("--unclassified-out ", unclassified_out.get) +
optional("--classified-out ", classified_out.get) +
"--output" + required(output) +
conditional(preload, "--preload") +
conditional(paired, "--paired")
// finally the input files (R1 [R2])
cmd += input.mkString(" ")
cmd
}
def cmdLine = required(executable) +
"--db" + required(db) +
optional("--threads", nCoresRequest) +
conditional(inputFastQ, "--fastq-input") +
conditional(!inputFastQ, "--fasta-input") +
conditional(quick, "--quick") +
optional("--min_hits", minHits) +
optional("--unclassified-out ", unclassified_out.get) +
optional("--classified-out ", classified_out.get) +
"--output" + required(output) +
conditional(preLoad, "--preload") +
conditional(paired, "--paired") +
repeat(input)
}
......@@ -16,7 +16,7 @@
package nl.lumc.sasc.biopet.core
object BiopetExecutablePublic extends BiopetExecutable {
def protectedPipelines: List[MainCommand] = List(
def publicPipelines: List[MainCommand] = List(
nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep,
nl.lumc.sasc.biopet.pipelines.mapping.Mapping,
nl.lumc.sasc.biopet.pipelines.gentrap.Gentrap,
......@@ -25,15 +25,15 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig,
nl.lumc.sasc.biopet.pipelines.carp.Carp,
nl.lumc.sasc.biopet.pipelines.toucan.Toucan,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling
nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling,
nl.lumc.sasc.biopet.pipelines.gears.Gears
)
def pipelines: List[MainCommand] = List(
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
nl.lumc.sasc.biopet.pipelines.basty.Basty,
nl.lumc.sasc.biopet.pipelines.gears.Gears
) ::: protectedPipelines
nl.lumc.sasc.biopet.pipelines.basty.Basty
) ::: publicPipelines
def tools: List[MainCommand] = List(
nl.lumc.sasc.biopet.tools.MergeTables,
......
......@@ -15,12 +15,310 @@
*/
package nl.lumc.sasc.biopet.pipelines.gears
import nl.lumc.sasc.biopet.core.PipelineCommand
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.FullVersion
import nl.lumc.sasc.biopet.core.{ PipelineCommand, MultiSampleQScript }
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.kraken.{ Kraken, KrakenReport }
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, SamToFastq }
import nl.lumc.sasc.biopet.extensions.sambamba.SambambaView
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.tools.FastqSync
import org.broadinstitute.gatk.queue.QScript
class Gears(val root: Configurable) extends QScript with GearsTrait {
import scala.collection.JavaConversions._
/**
* This is a trait for the Gears pipeline
* The ShivaTrait is used as template for this pipeline
*/
class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qscript =>
def this() = this(null)
/** Executed before running the script */
def init(): Unit = {
}
/** Method to add jobs */
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
/** Multisample meta-genome comparison */
def addMultiSampleJobs(): Unit = {
// generate report from multiple samples, this is:
// - the TSV
// - the Spearman correlation plot + table
}
/** Location of summary file */
def summaryFile = new File(outputDir, "gears.summary.json")
/** Settings of pipeline for summary */
def summarySettings = Map()
/** Files for the summary */
def summaryFiles = Map()
/** Method to make a sample */
def makeSample(id: String) = new Sample(id)
/** Class that will generate jobs for a sample */
class Sample(sampleId: String) extends AbstractSample(sampleId) {
/** Sample specific files to add to summary */
def summaryFiles: Map[String, File] = {
preProcessBam match {
case Some(pb) => Map("bamFile" -> pb)
case _ => Map()
}
} ++ Map("alignment" -> alnFile)
/** Sample specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to make a library */
def makeLibrary(id: String) = new Library(id)
/** Class to generate jobs for a library */
class Library(libId: String) extends AbstractLibrary(libId) {
/** Library specific files to add to the summary */
def summaryFiles: Map[String, File] = {
(bamFile, preProcessBam) match {
case (Some(b), Some(pb)) => Map("bamFile" -> b, "preProcessBam" -> pb)
case (Some(b), _) => Map("bamFile" -> b)
case _ => Map()
}
}
/** Alignment results of this library ~ can only be accessed after addJobs is run! */
def alnFile: File = bamFile match {
case Some(b) => b
case _ => throw new IllegalStateException("The bamfile is not generated yet")
}
/** Library specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to execute library preprocess */
def preProcess(input: File): Option[File] = None
/** Method to make the mapping submodule */
def makeMapping = {
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
mapping.outputName = sampleId + "-" + libId
(Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile))
}
/**
* Determine where where to start the pipeline in cases where both R1 (fastq) and BAM is specified
*/
lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) =
(config.contains("R1"), config.contains("bam")) match {
case (true, _) => makeMapping // Default starting from fastq files
case (false, true) => // Starting from bam file
config("bam_to_fastq", default = false).asBoolean match {
case true => makeMapping // bam file will be converted to fastq
case false =>
val file = new File(libDir, sampleId + "-" + libId + ".final.bam")
(None, Some(file), preProcess(file))
}
case _ => (None, None, None)
}
/** This will add jobs for this library */
def addJobs(): Unit = {
(config.contains("R1"), config.contains("bam")) match {
case (true, _) => mapping.foreach(mapping => {
mapping.input_R1 = config("R1")
})
case (false, true) => config("bam_to_fastq", default = false).asBoolean match {
case true =>
val samToFastq = SamToFastq(qscript, config("bam"),
new File(libDir, sampleId + "-" + libId + ".R1.fastq"),
new File(libDir, sampleId + "-" + libId + ".R2.fastq"))
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.foreach(mapping => {
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = Some(samToFastq.fastqR2)
})
case false =>
val inputSam = SamReaderFactory.makeDefault.open(config("bam"))
val readGroups = inputSam.getFileHeader.getReadGroups
val readGroupOke = readGroups.forall(readGroup => {
if (readGroup.getSample != sampleId) logger.warn("Sample ID readgroup in bam file is not the same")
if (readGroup.getLibrary != libId) logger.warn("Library ID readgroup in bam file is not the same")
readGroup.getSample == sampleId && readGroup.getLibrary == libId
})
inputSam.close()
if (!readGroupOke) {
if (config("correct_readgroups", default = false).asBoolean) {
logger.info("Correcting readgroups, file:" + config("bam"))
val aorrg = AddOrReplaceReadGroups(qscript, config("bam"), bamFile.get)
aorrg.RGID = sampleId + "-" + libId
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.isIntermediate = true
qscript.add(aorrg)
} else throw new IllegalStateException("Sample readgroup and/or library of input bamfile is not correct, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatic fix this")
} else {
val oldBamFile: File = config("bam")
val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(libDir, oldBamFile.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, oldBamFile, bamFile.get)
bamLn.deps :+= baiLn.output
add(bamLn)
}
}
case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found")
}
mapping.foreach(mapping => {
mapping.init()
mapping.biopetScript()
addAll(mapping.functions) // Add functions of mapping to current function pool
addSummaryQScript(mapping)
})
}
}
/** This will add jobs for the double preprocessing */
protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = {
if (input == Nil) None
else if (input.tail == Nil) {
val bamFile = new File(sampleDir, input.head.getName)
val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(sampleDir, input.head.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, input.head, bamFile)
bamLn.deps :+= baiLn.output
add(bamLn)
Some(bamFile)
} else {
val md = new MarkDuplicates(qscript)
md.input = input
md.output = new File(sampleDir, sampleId + ".dedup.bam")
md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics")
md.isIntermediate = isIntermediate
md.removeDuplicates = true
add(md)
addSummarizable(md, "mark_duplicates")
Some(md.output)
}
}
lazy val preProcessBam: Option[File] = addDoublePreProcess(libraries.flatMap(lib => {
(lib._2.bamFile, lib._2.preProcessBam) match {
case (_, Some(file)) => Some(file)
case (Some(file), _) => Some(file)
case _ => None
}
}).toList)
def alnFile: File = sampleBamLinkJob.output
/** Job for combining all library BAMs */
private def sampleBamLinkJob: Ln =
makeCombineJob(libraries.values.map(_.alnFile).toList, createFile(".bam"))
/** Ln or MergeSamFile job, depending on how many inputs are supplied */
private def makeCombineJob(inFiles: List[File], outFile: File,
mergeSortOrder: String = "coordinate"): Ln = {
require(inFiles.nonEmpty, "At least one input files for combine job")
val input: File = {
if (inFiles.size == 1) inFiles.head
else {
val mergedBam = createFile(".merged.bam")
val mergejob = new MergeSamFiles(qscript)
mergejob.input = inFiles
mergejob.output = mergedBam
mergejob.sortOrder = mergeSortOrder
add(mergejob)
mergejob.output
}
}
val linkJob = new Ln(qscript)
linkJob.input = input
linkJob.output = outFile
linkJob
}
/** This will add sample jobs */
def addJobs(): Unit = {
addPerLibJobs()
// merge or symlink per-library alignments
add(sampleBamLinkJob)
if (preProcessBam.isDefined) {
val bamMetrics = new BamMetrics(qscript)
bamMetrics.sampleId = Some(sampleId)
bamMetrics.inputBam = preProcessBam.get
bamMetrics.outputDir = sampleDir
bamMetrics.init()
bamMetrics.biopetScript()
addAll(bamMetrics.functions)
addSummaryQScript(bamMetrics)
}
// sambamba view -f bam -F "unmapped or mate_is_unmapped" <alnFile> > <extracted.bam>
val samFilterUnmapped = new SambambaView(qscript)
samFilterUnmapped.input = alnFile
samFilterUnmapped.filter = Some("unmapped or mate_is_unmapped")
samFilterUnmapped.output = createFile(".unmapped.bam")
samFilterUnmapped.isIntermediate = true
qscript.add(samFilterUnmapped)
// start bam to fastq (only on unaligned reads) also extract the matesam
val samToFastq = SamToFastq(qscript, alnFile,
createFile(".unmap.R1.fastq"),
createFile(".unmap.R2.fastq")
)
samToFastq.isIntermediate = true
qscript.add(samToFastq)
// sync the fastq records
val fastqSync = new FastqSync(qscript)
fastqSync.refFastq = samToFastq.fastqR1
fastqSync.inputFastq1 = samToFastq.fastqR1
fastqSync.inputFastq2 = samToFastq.fastqR2
fastqSync.outputFastq1 = createFile(".unmapsynced.R1.fastq.gz")
fastqSync.outputFastq2 = createFile(".unmapsynced.R2.fastq.gz")
fastqSync.outputStats = createFile(".syncstats.json")
qscript.add(fastqSync)
// start kraken
val krakenAnalysis = new Kraken(qscript)
krakenAnalysis.input = List(fastqSync.outputFastq1, fastqSync.outputFastq2)
krakenAnalysis.output = createFile(".krkn.raw")
krakenAnalysis.paired = true
krakenAnalysis.classified_out = Option(createFile(".krkn.classified.fastq"))
krakenAnalysis.unclassified_out = Option(createFile(".krkn.unclassified.fastq"))
qscript.add(krakenAnalysis)
// create kraken summary file
val krakenReport = new KrakenReport(qscript)
krakenReport.input = krakenAnalysis.output
krakenReport.show_zeros = true
krakenReport.output = createFile(".krkn.full")
qscript.add(krakenReport)
}
}
}
/** This object give a default main method to the pipelines */
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.gears
import java.io.File
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.FullVersion
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.kraken.{ Kraken, KrakenReport }
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, SamToFastq }
import nl.lumc.sasc.biopet.extensions.sambamba.SambambaView
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.tools.FastqSync
import scala.collection.JavaConversions._
/**
* This is a trait for the Gears pipeline
* The ShivaTrait is used as template for this pipeline
*/
trait GearsTrait extends MultiSampleQScript with SummaryQScript { qscript =>
/** Executed before running the script */
def init(): Unit = {
}
/** Method to add jobs */
def biopetScript(): Unit = {
addSamplesJobs()
addSummaryJobs()
}
/** Multisample meta-genome comparison */
def addMultiSampleJobs(): Unit = {
// generate report from multiple samples, this is:
// - the TSV
// - the Spearman correlation plot + table
}
/** Location of summary file */
def summaryFile = new File(outputDir, "gears.summary.json")
/** Settings of pipeline for summary */
def summarySettings = Map(
"version" -> FullVersion
)
/** Files for the summary */
def summaryFiles = Map()
/** Method to make a sample */
def makeSample(id: String) = new Sample(id)
/** Class that will generate jobs for a sample */
class Sample(sampleId: String) extends AbstractSample(sampleId) {
/** Sample specific files to add to summary */
def summaryFiles: Map[String, File] = {
preProcessBam match {
case Some(pb) => Map("bamFile" -> pb)
case _ => Map()
}
} ++ Map(
"alignment" -> alnFile
)
/** Sample specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to make a library */
def makeLibrary(id: String) = new Library(id)
/** Class to generate jobs for a library */
class Library(libId: String) extends AbstractLibrary(libId) {
/** Library specific files to add to the summary */
def summaryFiles: Map[String, File] = {
(bamFile, preProcessBam) match {
case (Some(b), Some(pb)) => Map("bamFile" -> b, "preProcessBam" -> pb)
case (Some(b), _) => Map("bamFile" -> b)
case _ => Map()
}
}
/** Alignment results of this library ~ can only be accessed after addJobs is run! */
def alnFile: File = bamFile match {
case Some(b) => b
case _ => throw new IllegalStateException("The bamfile is not generated yet")
}
/** Library specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to execute library preprocess */
def preProcess(input: File): Option[File] = None
/** Method to make the mapping submodule */
def makeMapping = {
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
mapping.outputName = sampleId + "-" + libId
(Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile))
}
/**
* Determine where where to start the pipeline in cases where both R1 (fastq) and BAM is specified
*/
lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) =