Commit 24837f5b authored by Sander van der Zeeuw's avatar Sander van der Zeeuw

Merge branch 'feature-gears' into 'develop'

Feature gears (metagenomics)

Metagenomics pipeline that can start either from BAM or from FastQ files.

Dependencies:
- Sambamba (v0.5.1+)
- Kraken

See merge request !168
parents 8c08d89b 900ba26b
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.kraken
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.util.matching.Regex
/** Extension for Kraken */
class Kraken(val root: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Input: FastQ or FastA")
var input: List[File] = _
var db: File = config("db")
var inputFastQ: Boolean = true
var compression: Boolean = false
var compressionGzip: Boolean = false
var compressionBzip: Boolean = false
var quick: Boolean = false
var min_hits: Option[Int] = config("min_hits")
@Output(doc = "Unidentified reads", required = false)
var unclassified_out: Option[File] = None
@Output(doc = "Identified reads", required = false)
var classified_out: Option[File] = None
@Output(doc = "Output with hits per sequence")
var output: File = _
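// Kraken's --preload reads the whole database into memory before classifying,
// trading start-up time for faster lookups; it is enabled by default here.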
var preload: Boolean = config("preload", default = true)
var paired: Boolean = config("paired", default = false)
executable = config("exe", default = "kraken")
override val versionRegex = """Kraken version ([\d\w\-\.]+)\n.*""".r
override val versionExitcode = List(0, 1)
override val defaultCoreMemory = 8.0
override val defaultThreads = 4
override def versionCommand = executable + " --version"
/** Hook for adjustments before the execution graph is built; nothing extra is needed here yet */
override def beforeGraph: Unit = {
super.beforeGraph
}
/** Returns command to execute */
def cmdLine = required(executable) +
  required("--db", db) +
  optional("--threads", nCoresRequest) +
  conditional(inputFastQ, "--fastq-input") +
  conditional(!inputFastQ, "--fasta-input") +
  conditional(compressionGzip, "--gzip-compressed") +
  conditional(compressionBzip, "--bzip2-compressed") +
  conditional(quick, "--quick") +
  optional("--min-hits", min_hits) +
  optional("--unclassified-out", unclassified_out) +
  optional("--classified-out", classified_out) +
  required("--output", output) +
  conditional(preload, "--preload") +
  conditional(paired, "--paired") +
  input.map(required(_)).mkString // finally the input files (R1 [R2])
}
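For orientation, a minimal usage sketch of this wrapper inside a QScript. The `qscript`, `outputDir`, `fastqR1` and `fastqR2` values are assumed to exist in the calling pipeline; the file names are illustrative and not part of this commit.

val kraken = new Kraken(qscript) // hypothetical QScript context
kraken.input = List(fastqR1, fastqR2) // R1 [R2], appended at the end of cmdLine
kraken.paired = true // emit --paired for the read pair
kraken.output = new File(outputDir, "sample.krkn.raw") // illustrative name
kraken.unclassified_out = Some(new File(outputDir, "sample.unclassified.fastq"))
add(kraken) // register the job with Queue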
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.kraken
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for kraken-report */
class KrakenReport(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "kraken-report")
override val versionRegex = """Kraken version (.*)""".r
override val versionExitcode = List(0, 1)
override val defaultCoreMemory = 4.0
override val defaultThreads = 1
override def versionCommand = new File(new File(executable).getParent, "kraken").getAbsolutePath + " --version"
var db: File = config("db")
var show_zeros: Boolean = config("show_zeros", default = false)
@Input(doc = "Input raw kraken analysis")
var input: File = _
@Output(doc = "Output path kraken report")
var output: File = _
def cmdLine: String = required(executable) +
  required("--db", db) +
  conditional(show_zeros, "--show-zeros") +
  required(input.getAbsolutePath) + " > " + required(output.getAbsolutePath)
}
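A sketch of chaining the raw classification into a report with this wrapper; `kraken` is the job from the previous sketch and the output name is illustrative.

val krakenReport = new KrakenReport(qscript) // hypothetical QScript context
krakenReport.input = kraken.output // raw per-read classifications
krakenReport.output = new File(outputDir, "sample.krkn.full") // illustrative name
add(krakenReport)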
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.sambamba
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import java.io.File
/** Extension for sambamba view */
class SambambaView(val root: Configurable) extends Sambamba {
override val defaultThreads = 2
@Input(doc = "Bam File")
var input: File = _
@Output(doc = "output File")
var output: File = _
var filter: Option[String] = None
val format: Option[String] = config("format", default = "bam")
val regions: Option[File] = config("regions")
val compression_level: Option[Int] = config("compression_level", default = 6)
/** Returns command to execute */
def cmdLine = required(executable) +
required("view") +
optional("--filter", filter) +
optional("--nthreads", nCoresRequest) +
optional("--format", format.get) +
optional("--regions", regions) +
optional("--compression-level", compression_level) +
required("--output" + output) +
required(input)
}
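A sketch of the kind of pre-filtering a metagenomics pipeline needs before classification: selecting unmapped reads from a host-aligned BAM. "unmapped" is a standard sambamba filter expression; the variables are assumed from the surrounding pipeline.

val view = new SambambaView(qscript) // hypothetical QScript context
view.input = preProcessBam // host-aligned BAM for this sample
view.filter = Some("unmapped") // keep only reads that did not map to the host
view.output = new File(sampleDir, "sample.unmapped.bam") // illustrative name
add(view)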
@@ -80,6 +80,11 @@
<artifactId>Kopisu</artifactId>
<version>${project.version}</version>
</dependency>
+    <dependency>
+      <groupId>nl.lumc.sasc</groupId>
+      <artifactId>Gears</artifactId>
+      <version>${project.version}</version>
+    </dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Bam2Wig</artifactId>
......
@@ -30,7 +30,8 @@ object BiopetExecutablePublic extends BiopetExecutable {
def pipelines: List[MainCommand] = List(
nl.lumc.sasc.biopet.pipelines.shiva.Shiva,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling,
-  nl.lumc.sasc.biopet.pipelines.basty.Basty
+  nl.lumc.sasc.biopet.pipelines.basty.Basty,
+  nl.lumc.sasc.biopet.pipelines.gears.Gears
) ::: protectedPipelines
def tools: List[MainCommand] = List(
......
<?xml version="1.0" encoding="UTF-8"?>
<!--
Biopet is built on top of GATK Queue for building bioinformatic
pipelines. It is mainly intended to support LUMC SHARK cluster which is running
SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
should also be able to execute Biopet tools and pipelines.
Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Contact us at: sasc@lumc.nl
A dual licensing mode is applied. The source code within this project that are
not part of GATK Queue is freely available for non-commercial use under an AGPL
license; For commercial users or users who do not want to follow the AGPL
license, please contact us to obtain a separate license.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>Biopet</artifactId>
<groupId>nl.lumc.sasc</groupId>
<version>0.4.0-DEV</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<inceptionYear>2015</inceptionYear>
<artifactId>Gears</artifactId>
<dependencies>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>BiopetFramework</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>nl.lumc.sasc</groupId>
<artifactId>Mapping</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.gears
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.core.PipelineCommand
import org.broadinstitute.gatk.queue.QScript
class Gears(val root: Configurable) extends QScript with GearsTrait {
def this() = this(null)
}
/** This object give a default main method to the pipelines */
object Gears extends PipelineCommand
\ No newline at end of file
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.gears
import java.io.File
import nl.lumc.sasc.biopet.FullVersion
import htsjdk.samtools.SamReaderFactory
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.extensions.Ln
import nl.lumc.sasc.biopet.extensions.kraken.{ KrakenReport, Kraken }
import nl.lumc.sasc.biopet.extensions.picard.{ MergeSamFiles, AddOrReplaceReadGroups, SamToFastq, MarkDuplicates }
import nl.lumc.sasc.biopet.extensions.sambamba.SambambaView
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.mapping.Mapping
import nl.lumc.sasc.biopet.tools.FastqSync
import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.function.QFunction
import scala.collection.JavaConversions._
/**
 * This is a trait for the Gears pipeline.
 * The ShivaTrait was used as a template for this pipeline.
 */
trait GearsTrait extends MultiSampleQScript with SummaryQScript { qscript =>
/** Executed before running the script */
def init: Unit = {
}
/** Method to add jobs */
def biopetScript: Unit = {
addSamplesJobs()
addSummaryJobs
}
/** Multisample meta-genome comparison */
def addMultiSampleJobs: Unit = {
// generate report from multiple samples, this is:
// - the TSV
// - the Spearman correlation plot + table
}
/** Location of summary file */
def summaryFile = new File(outputDir, "gears.summary.json")
/** Settings of pipeline for summary */
def summarySettings = Map(
"version" -> FullVersion
)
/** Files for the summary */
def summaryFiles = Map()
/** Method to make a sample */
def makeSample(id: String) = new Sample(id)
/** Class that will generate jobs for a sample */
class Sample(sampleId: String) extends AbstractSample(sampleId) {
/** Sample specific files to add to summary */
def summaryFiles: Map[String, File] = {
  preProcessBam match {
    case Some(b) => Map("bamFile" -> b)
    case _       => Map()
  }
} ++ Map("alignment" -> alnFile)
/** Sample specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to make a library */
def makeLibrary(id: String) = new Library(id)
/** Class to generate jobs for a library */
class Library(libId: String) extends AbstractLibrary(libId) {
/** Library specific files to add to the summary */
def summaryFiles: Map[String, File] = {
(bamFile, preProcessBam) match {
case (Some(bamFile), Some(preProcessBam)) => Map("bamFile" -> bamFile, "preProcessBam" -> preProcessBam)
case (Some(bamFile), _) => Map("bamFile" -> bamFile)
case _ => Map()
}
}
/** Alignment results of this library ~ can only be accessed after addJobs is run! */
def alnFile: File = bamFile match {
case Some(b) => b
case _ => throw new IllegalStateException("The bamfile is not generated yet")
}
/** Library specific stats to add to summary */
def summaryStats: Map[String, Any] = Map()
/** Method to execute library preprocess */
def preProcess(input: File): Option[File] = None
/** Method to make the mapping submodule */
def makeMapping = {
val mapping = new Mapping(qscript)
mapping.sampleId = Some(sampleId)
mapping.libId = Some(libId)
mapping.outputDir = libDir
mapping.outputName = sampleId + "-" + libId
(Some(mapping), Some(mapping.finalBamFile), preProcess(mapping.finalBamFile))
}
/**
 * Determine where to start the pipeline when both R1 (FastQ) and a BAM file are specified
 */
lazy val (mapping, bamFile, preProcessBam): (Option[Mapping], Option[File], Option[File]) =
(config.contains("R1"), config.contains("bam")) match {
case (true, _) => makeMapping // Default starting from fastq files
case (false, true) => // Starting from bam file
config("bam_to_fastq", default = false).asBoolean match {
case true => makeMapping // bam file will be converted to fastq
case false => {
val file = new File(libDir, sampleId + "-" + libId + ".final.bam")
(None, Some(file), preProcess(file))
}
}
case _ => (None, None, None)
}
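/* For illustration, the three library config shapes this match accepts
 * (keys taken from the config(...) lookups above; paths are hypothetical):
 *   { "R1": "lib.R1.fastq.gz" }                -> map from FastQ
 *   { "bam": "lib.bam", "bam_to_fastq": true } -> convert the BAM, then map
 *   { "bam": "lib.bam" }                       -> take the BAM as-is
 */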
/** This will add jobs for this library */
def addJobs(): Unit = {
(config.contains("R1"), config.contains("bam")) match {
case (true, _) => mapping.foreach(mapping => {
mapping.input_R1 = config("R1")
})
case (false, true) => config("bam_to_fastq", default = false).asBoolean match {
case true => {
val samToFastq = SamToFastq(qscript, config("bam"),
new File(libDir, sampleId + "-" + libId + ".R1.fastq"),
new File(libDir, sampleId + "-" + libId + ".R2.fastq"))
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.foreach(mapping => {
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = Some(samToFastq.fastqR2)
})
}
case false => {
val inputSam = SamReaderFactory.makeDefault.open(config("bam"))
val readGroups = inputSam.getFileHeader.getReadGroups
val readGroupOke = readGroups.forall(readGroup => {
if (readGroup.getSample != sampleId) logger.warn("Sample ID in the read group of the BAM file does not match the sample ID in the config")
if (readGroup.getLibrary != libId) logger.warn("Library ID in the read group of the BAM file does not match the library ID in the config")
readGroup.getSample == sampleId && readGroup.getLibrary == libId
})
inputSam.close
if (!readGroupOke) {
if (config("correct_readgroups", default = false).asBoolean) {
logger.info("Correcting readgroups, file:" + config("bam"))
val aorrg = AddOrReplaceReadGroups(qscript, config("bam"), bamFile.get)
aorrg.RGID = sampleId + "-" + libId
aorrg.RGLB = libId
aorrg.RGSM = sampleId
aorrg.isIntermediate = true
qscript.add(aorrg)
} else throw new IllegalStateException("Sample and/or library in the read groups of the input BAM file do not match the config, file: " + bamFile +
"\nPlease note that it is possible to set 'correct_readgroups' to true in the config to automatically fix this")
} else {
val oldBamFile: File = config("bam")
val oldIndex: File = new File(oldBamFile.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(libDir, oldBamFile.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, oldBamFile, bamFile.get)
bamLn.deps :+= baiLn.output
add(bamLn)
}
}
}
case _ => logger.warn("Sample: " + sampleId + " Library: " + libId + ", no reads found")
}
mapping.foreach(mapping => {
mapping.init
mapping.biopetScript
addAll(mapping.functions) // Add functions of mapping to current function pool
addSummaryQScript(mapping)
})
}
}
/** This will add jobs for the double preprocessing */
protected def addDoublePreProcess(input: List[File], isIntermediate: Boolean = false): Option[File] = {
if (input.isEmpty) None
else if (input.size == 1) {
val bamFile = new File(sampleDir, input.head.getName)
val oldIndex: File = new File(input.head.getAbsolutePath.stripSuffix(".bam") + ".bai")
val newIndex: File = new File(sampleDir, input.head.getName.stripSuffix(".bam") + ".bai")
val baiLn = Ln(qscript, oldIndex, newIndex)
add(baiLn)
val bamLn = Ln(qscript, input.head, bamFile)
bamLn.deps :+= baiLn.output
add(bamLn)
Some(bamFile)
} else {
val md = new MarkDuplicates(qscript)
md.input = input
md.output = new File(sampleDir, sampleId + ".dedup.bam")
md.outputMetrics = new File(sampleDir, sampleId + ".dedup.metrics")
md.isIntermediate = isIntermediate
md.removeDuplicates = true
add(md)
addSummarizable(md, "mark_duplicates")
Some(md.output)
}
}
lazy val preProcessBam: Option[File] = addDoublePreProcess(libraries.map(lib => {
(lib._2.bamFile, lib._2.preProcessBam) match {
case (_, Some(file)) => Some(file)
case (Some(file), _) => Some(file)
case _ => None
}
}).flatten.toList)
lazy val alnFileDirty: File = sampleAlnJob.output
lazy val alnFile: File = sampleAlnJob.output
/** Job for combining all library BAMs */
private def sampleAlnJob: CombineFileFunction =
makeCombineJob(libraries.values.map(_.alnFile).toList, createFile(".bam"))
/** Super type of Ln and MergeSamFiles */
private type CombineFileFunction = QFunction { def output: File }
/** Ln or MergeSamFile job, depending on how many inputs are supplied */
private def makeCombineJob(inFiles: List[File], outFile: File,
mergeSortOrder: String = "coordinate"): CombineFileFunction = {
require(inFiles.nonEmpty, "At least one input file is required for the combine job")
if (inFiles.size == 1) {
val job = new Ln(qscript)
job.input = inFiles.head
job.output = outFile
job
} else {
val job = new MergeSamFiles(qscript)
job.input = inFiles
job.output = outFile
job.sortOrder = mergeSortOrder
job
}
}
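/* Illustration of the structural CombineFileFunction type above: Ln and
 * MergeSamFiles both expose an `output` field, so callers can chain on the
 * result without knowing which job was built (sketch, not part of this commit):
 *   val combined = makeCombineJob(libraries.values.map(_.alnFile).toList, createFile(".bam"))
 *   val finalBam = combined.output
 */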
/** This will add sample jobs */
def addJobs(): Unit = {