Commit 74091f0f authored by Peter van 't Hof

Merge remote-tracking branch 'remotes/origin/develop' into feature-bamstats

# Conflicts:
#	biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BiopetFlagstat.scala
parents ed129189 edc81be3
@@ -63,6 +63,7 @@
#if (!sampleLevel) <th>Library</th> #end
<th>Total</th>
<th>Mapped</th>
<th>Secondary</th>
<th>(%)</th>
<th>Duplicates</th>
<th>(%)</th>
@@ -85,12 +86,14 @@
val total = summary.getValue((prefixPath ::: List("biopet_flagstat", "All")):_*).getOrElse(0L).asInstanceOf[Long]
val mapped = summary.getValue((prefixPath ::: List("biopet_flagstat", "Mapped")):_*).getOrElse(0L).asInstanceOf[Long]
val duplicates = summary.getValue((prefixPath ::: List("biopet_flagstat", "Duplicates")):_*).getOrElse(0L).asInstanceOf[Long]
val secondary = summary.getValue((prefixPath ::: List("biopet_flagstat", "NotPrimaryAlignment")):_*).getOrElse(0L).asInstanceOf[Long]
}#
<td>${total}</td>
<td>${mapped}</td>
<td>${mapped.toDouble / total * 100}%</td>
<td>${secondary}</td>
<td>${(mapped - secondary).toDouble / (total - secondary) * 100}%</td>
<td>${duplicates}</td>
<td>${duplicates.toDouble / total * 100}%</td>
<td>${duplicates.toDouble / (total - secondary) * 100}%</td>
</tr>
#end
#end
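
The template change above subtracts secondary alignments from both the numerator and the denominator, so the mapped and duplicate percentages are reported relative to primary alignments only. A minimal standalone sketch of that arithmetic (the counts below are hypothetical; only the formulas mirror the template):

// Hypothetical flagstat counts; only the formulas mirror the template above.
object FlagstatPercentages {
  def main(args: Array[String]): Unit = {
    val total      = 1000000L // biopet_flagstat "All"
    val mapped     = 950000L  // biopet_flagstat "Mapped"
    val secondary  = 20000L   // biopet_flagstat "NotPrimaryAlignment"
    val duplicates = 80000L   // biopet_flagstat "Duplicates"

    // Secondary alignments are excluded on both sides of the division,
    // so the percentages refer to primary alignments only.
    val mappedPct    = (mapped - secondary).toDouble / (total - secondary) * 100
    val duplicatePct = duplicates.toDouble / (total - secondary) * 100

    println(f"Mapped: $mappedPct%.2f%%  Duplicates: $duplicatePct%.2f%%")
  }
}
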
@@ -17,14 +17,14 @@ package nl.lumc.sasc.biopet.pipelines.bammetrics
import java.io.File
import nl.lumc.sasc.biopet.core.annotations.{ AnnotationRefFlat, RibosomalRefFlat }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, Reference, SampleLibraryTag }
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsCoverage, BedtoolsIntersect }
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsCoverage, BedtoolsIntersect, BedtoolsSort }
import nl.lumc.sasc.biopet.extensions.picard._
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFlagstat
import nl.lumc.sasc.biopet.pipelines.bammetrics.scripts.CoverageStats
import nl.lumc.sasc.biopet.extensions.tools.BiopetFlagstat
import nl.lumc.sasc.biopet.pipelines.bammetrics.scripts.CoverageStats
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.intervals.BedCheck
import org.broadinstitute.gatk.queue.QScript
@@ -41,6 +41,8 @@ class BamMetrics(val root: Configurable) extends QScript
@Input(doc = "Bam File", shortName = "BAM", required = true)
var inputBam: File = _
override def defaults = Map("bedtoolscoverage" -> Map("sorted" -> true))
/** return location of summary file */
def summaryFile = (sampleId, libId) match {
case (Some(s), Some(l)) => new File(outputDir, s + "-" + l + ".BamMetrics.summary.json")
@@ -164,7 +166,11 @@ class BamMetrics(val root: Configurable) extends QScript
addSummarizable(biopetFlagstatLoose, targetName + "_biopet_flagstat_loose")
add(new BiopetFifoPipe(this, List(biLoose, biopetFlagstatLoose)))
val bedCov = BedtoolsCoverage(this, intervals.bed, inputBam, depth = true)
val sorter = new BedtoolsSort(this)
sorter.input = intervals.bed
sorter.output = swapExt(targetDir, intervals.bed, ".bed", ".sorted.bed")
add(sorter)
val bedCov = BedtoolsCoverage(this, sorter.output, inputBam, depth = true)
val covStats = CoverageStats(this, targetDir, inputBam.getName.stripSuffix(".bam") + ".coverage")
covStats.title = Some("Coverage Plot")
covStats.subTitle = Some(s"for file '$targetName.bed'")
@@ -56,7 +56,7 @@
<dependency>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-queue</artifactId>
<version>3.5</version>
<version>3.6</version>
<exclusions>
<exclusion>
<groupId>org.broadinstitute.gatk</groupId>
@@ -67,7 +67,7 @@
<dependency>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-queue-extensions-public</artifactId>
<version>3.5</version>
<version>3.6</version>
</dependency>
<dependency>
<groupId>org.scalatra.scalate</groupId>
@@ -69,10 +69,14 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
outputDir = outputDir.getAbsoluteFile
init()
biopetScript()
logger.info("Biopet script done")
if (disableScatter) for (function <- functions) function match {
case f: ScatterGatherableFunction => f.scatterCount = 1
case _ =>
if (disableScatter) {
logger.info("Disable scatters")
for (function <- functions) function match {
case f: ScatterGatherableFunction => f.scatterCount = 1
case _ =>
}
}
this match {
@@ -81,6 +85,7 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
case _ => reportClass.foreach(add(_))
}
logger.info("Running pre commands")
for (function <- functions) function match {
case f: BiopetCommandLineFunction =>
f.preProcessExecutable()
@@ -94,7 +99,8 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
globalConfig.writeReport(qSettings.runName, new File(outputDir, ".log/" + qSettings.runName))
else Logging.addError("Parent of output dir: '" + outputDir.getParent + "' is not writeable, output directory cannot be created")
inputFiles.foreach { i =>
logger.info("Checking input files")
inputFiles.par.foreach { i =>
if (!i.file.exists()) Logging.addError(s"Input file does not exist: ${i.file}")
if (!i.file.canRead) Logging.addError(s"Input file can not be read: ${i.file}")
if (!i.file.isAbsolute) Logging.addError(s"Input file should be an absolute path: ${i.file}")
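
The loop above now iterates over inputFiles.par, so the exists/canRead/isAbsolute checks run concurrently instead of one by one. A minimal sketch of the same pattern on plain java.io.File objects (hypothetical paths; it assumes a pre-2.13 Scala where .par is part of the standard library, as Biopet uses, and prints instead of calling Logging.addError):

import java.io.File

object ParallelInputCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical inputs; in BiopetQScript these come from inputFiles.
    val inputs = List(new File("/data/sample1.bam"), new File("/data/sample1.bam.bai"))
    inputs.par.foreach { f =>
      // Same three checks as above, evaluated in parallel per file.
      if (!f.exists()) println(s"Input file does not exist: $f")
      if (!f.canRead) println(s"Input file can not be read: $f")
      if (!f.isAbsolute) println(s"Input file should be an absolute path: $f")
    }
  }
}
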
@@ -111,6 +117,7 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
if (logger.isDebugEnabled) WriteDependencies.writeDependencies(functions, new File(outputDir, s".log/${qSettings.runName}.deps.json"))
Logging.checkErrors()
logger.info("Script complete without errors")
}
/** Get implemented from org.broadinstitute.gatk.queue.QScript */
@@ -24,10 +24,10 @@ import org.broadinstitute.gatk.queue.QScript
/** This trait creates a structured way of using multisample pipelines */
trait MultiSampleQScript extends SummaryQScript { qscript: QScript =>
@Argument(doc = "Only Sample", shortName = "s", required = false, fullName = "sample")
@Argument(doc = "Only Process This Sample", shortName = "s", required = false, fullName = "sample")
private[core] val onlySamples: List[String] = Nil
require(globalConfig.map.contains("samples"), "No Samples found in config")
if (!globalConfig.map.contains("samples")) Logging.addError("No Samples found in config")
/** Sample class with basic functions build in */
abstract class AbstractSample(val sampleId: String) extends Summarizable { sample =>
@@ -190,7 +190,7 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript =>
val samples: Map[String, Sample] = sampleIds.map(id => id -> makeSample(id)).toMap
/** Returns a list of all sampleIDs */
protected def sampleIds: Set[String] = ConfigUtils.any2map(globalConfig.map("samples")).keySet
protected def sampleIds: Set[String] = ConfigUtils.any2map(globalConfig.map.getOrElse("samples", Map())).keySet
protected lazy val nameRegex = """^[a-zA-Z0-9][a-zA-Z0-9-_]+[a-zA-Z0-9]$""".r
protected lazy val nameError = "has an invalid name. " +
@@ -155,7 +155,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu
super.beforeGraph()
if (!cache && !database) {
Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor")
} else if (cache && dir.isEmpty) {
} else if (cache && dir.isEmpty && dirCache.isEmpty) {
Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor")
}
if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt")
@@ -14,13 +14,16 @@
*/
package nl.lumc.sasc.biopet.extensions.bedtools
import java.io.File
import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.core.Reference
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
import scala.io.Source
/** Extension for bedtools coverage */
class BedtoolsCoverage(val root: Configurable) extends Bedtools {
class BedtoolsCoverage(val root: Configurable) extends Bedtools with Reference {
@Input(doc = "Input file (bed/gff/vcf/bam)")
var input: File = null
@@ -40,6 +43,9 @@ class BedtoolsCoverage(val root: Configurable) extends Bedtools {
@Argument(doc = "diffStrand", required = false)
var diffStrand: Boolean = false
@Argument(doc = "sorted", required = false)
var sorted: Boolean = config("sorted", default = false, freeVar = false)
override def defaultCoreMemory = 4.0
/** Returns command to execute */
@@ -49,7 +55,10 @@ class BedtoolsCoverage(val root: Configurable) extends Bedtools {
conditional(depth, "-d") +
conditional(sameStrand, "-s") +
conditional(diffStrand, "-S") +
conditional(sorted, "-sorted") +
(if (sorted) required("-g", BedtoolsCoverage.getGenomeFile(referenceFai, jobTempDir)) else "") +
(if (outputAsStsout) "" else " > " + required(output))
}
object BedtoolsCoverage {
@@ -65,4 +74,27 @@ object BedtoolsCoverage {
bedtoolsCoverage.diffStrand = diffStrand
bedtoolsCoverage
}
private var genomeCache: Map[(File, File), File] = Map()
def getGenomeFile(fai: File, dir: File): File = {
if (!genomeCache.contains((fai, dir))) genomeCache += (fai, dir) -> createGenomeFile(fai, dir)
genomeCache((fai, dir))
}
/**
* Creates the genome file. i.e. the first two columns of the fasta index
* @return
*/
def createGenomeFile(fai: File, dir: File): File = {
val tmp = File.createTempFile(fai.getName, ".genome", dir)
tmp.deleteOnExit()
val writer = new PrintWriter(tmp)
Source.fromFile(fai).
getLines().
map(s => s.split("\t").take(2).mkString("\t")).
foreach(f => writer.println(f))
writer.close()
tmp
}
}
\ No newline at end of file
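
bedtools coverage with -sorted needs a genome file (one chromosome name and length per line), and createGenomeFile above derives it from the reference .fai index by keeping only its first two columns. A small illustration of that transformation, with hypothetical .fai content held in memory instead of read from disk:

object GenomeFileSketch {
  def main(args: Array[String]): Unit = {
    // A .fai line holds: NAME  LENGTH  OFFSET  LINEBASES  LINEWIDTH
    val faiLines = List(
      "chr1\t248956422\t112\t70\t71",
      "chr2\t242193529\t252513167\t70\t71")
    // The genome file passed to bedtools -g keeps only NAME and LENGTH.
    faiLines
      .map(_.split("\t").take(2).mkString("\t"))
      .foreach(println) // e.g. "chr1<TAB>248956422"
  }
}
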
package nl.lumc.sasc.biopet.extensions.bedtools
import java.io.File
import nl.lumc.sasc.biopet.core.Reference
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
/**
* Created by Sander Bollen on 26-5-16.
*/
class BedtoolsSort(val root: Configurable) extends Bedtools with Reference {
@Input
var input: File = null
@Output
var output: File = null
@Argument(required = false)
var faidx: File = referenceFai
def cmdLine = required(executable) + required("sort") + required("-i", input) +
optional("-faidx", faidx) +
(if (outputAsStsout) "" else " > " + required(output))
}
@@ -16,17 +16,16 @@ package nl.lumc.sasc.biopet.extensions.gatk
import java.io.File
import nl.lumc.sasc.biopet.core.ScatterGatherableFunction
import nl.lumc.sasc.biopet.utils.VcfUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ }
import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, Input }
//TODO: check gathering
class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with ScatterGatherableFunction */ {
class BaseRecalibrator(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction {
def analysis_type = "BaseRecalibrator"
//TODO: check gathering
//scatterClass = classOf[ContigScatterFunction]
//setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false }
scatterClass = classOf[ContigScatterFunction]
setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false }
/** A database of known polymorphic sites */
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false, exclusiveOf = "", validation = "")
@@ -38,7 +37,7 @@ class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with S
/** The output recalibration table file to create */
@Output(fullName = "out", shortName = "o", doc = "The output recalibration table file to create", required = true, exclusiveOf = "", validation = "") //TODO: check gathering
//@Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer])
@Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer])
var out: File = _
/** One or more covariates to be used in the recalibration. Can be specified multiple times */
@@ -58,6 +58,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
@Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
var log_to_file: File = _
override def defaultCoreMemory = 4.0
override def beforeGraph() = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
@@ -90,7 +90,7 @@ class HaplotypeCaller(val root: Configurable) extends CommandLineGATK with Scatt
/** Mode for emitting reference confidence scores */
@Argument(fullName = "emitRefConfidence", shortName = "ERC", doc = "Mode for emitting reference confidence scores", required = false, exclusiveOf = "", validation = "")
var emitRefConfidence: String = _
var emitRefConfidence: Option[String] = config("emitRefConfidence")
/** File to which assembled haplotypes should be written */
@Output(fullName = "bamOutput", shortName = "bamout", doc = "File to which assembled haplotypes should be written", required = false, exclusiveOf = "", validation = "")
@@ -522,12 +522,4 @@ object HaplotypeCaller {
hc.out = outputFile
hc
}
def gvcf(root: Configurable, inputFile: File, outputFile: File): HaplotypeCaller = {
val hc = apply(root, List(inputFile), outputFile)
hc.emitRefConfidence = "GVCF"
hc.variant_index_type = Some("LINEAR")
hc.variant_index_parameter = Some(hc.config("variant_index_parameter", default = 128000).asInt)
hc
}
}
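
The removed HaplotypeCaller.gvcf helper used to force emitRefConfidence = "GVCF"; with emitRefConfidence now read from the config, GVCF mode is requested through configuration instead. A minimal sketch of what such a config fragment could look like (the "haplotypecaller" nesting is an assumption about Biopet's usual per-tool config layout, not taken from this commit):

object GvcfConfigSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical config fragment: tool name -> option map.
    val config: Map[String, Any] = Map(
      "haplotypecaller" -> Map("emitRefConfidence" -> "GVCF"))
    println(config)
  }
}
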
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.hisat
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version }
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Extension for hisat2
*
* Based on version 2.0.4
*/
class Hisat2(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version {
// TODO: handle --sra-acc flag. This is currently unsupported by the wrapper.
@Input(doc = "Fastq file R1", shortName = "R1")
var R1: File = null
@Input(doc = "Fastq file R2", shortName = "R2", required = false)
var R2: Option[File] = None
@Output(doc = "Output file SAM", shortName = "output", required = true)
var output: File = null
executable = config("exe", default = "hisat2", freeVar = false)
def versionRegex = """.*hisat2-align-s version (.*)""".r
override def versionExitcode = List(0, 1)
def versionCommand = executable + " --version"
override def defaultCoreMemory = 4.0
override def defaultThreads = 4
/* Required */
var hisat2Index: String = config("hisat2_index")
/* Input options */
var q: Boolean = config("q", default = false)
var qseq: Boolean = config("qseq", default = false)
var f: Boolean = config("f", default = false)
var r: Boolean = config("r", default = false)
var c: Boolean = config("c", default = false)
var skip: Option[Int] = config("skip")
var upto: Option[Int] = config("upto")
var trim5: Option[Int] = config("trim5")
var trim3: Option[Int] = config("trim3")
var phred33: Boolean = config("phred33", default = false)
var phred64: Boolean = config("phred64", default = false)
var intQuals: Boolean = config("int_quals", default = false)
/* Alignment options */
var nCeil: Option[String] = config("n_ceil")
var ignoreQuals: Boolean = config("ignore_quals", default = false)
var nofw: Boolean = config("nofw", default = false)
var norc: Boolean = config("norc", default = false)
/* Spliced alignment */
var penCansplice: Option[Int] = config("pen_cansplice")
var penNoncansplice: Option[Int] = config("pen_noncansplice")
var penCanintronlen: Option[Int] = config("pen_canintronlen")
var penNoncanintronlen: Option[Int] = config("pen_noncanintronlen")
var minIntronlen: Option[Int] = config("min_intronlen")
var maxIntronlen: Option[Int] = config("max_intronlen")
var knownSplicesiteInfile: Option[File] = config("known_splicesite_infile")
var novelSplicesiteOutfile: Option[File] = config("novel_splicesite_outfile")
var novelSplicesiteInfile: Option[File] = config("novel_splicesite_infile")
var noTempSplicesite: Boolean = config("no_temp_splicesite", default = false)
var noSplicedAlignment: Boolean = config("no_spliced_alignment", default = false)
var rnaStrandness: Option[String] = config("rna_strandness")
var tmo: Boolean = config("tmo", default = false)
var dta: Boolean = config("dta", default = false)
var dtaCufflinks: Boolean = config("dta_cufflinks", default = false)
/* Scoring */
var ma: Option[Int] = config("ma")
var mp: Option[String] = config("mp") // TODO: should be (Int, Int)
var sp: Option[String] = config("sp") // TODO: should be (Int, Int)
var np: Option[Int] = config("np")
var rdg: Option[String] = config("rdg") // TODO: should be (Int, Int)
var rfg: Option[String] = config("rfg") // TODO: should be (Int, Int)
var scoreMin: Option[String] = config("score_min")
/* Reporting */
var k: Option[Int] = config("k")
var all: Option[Int] = config("all")
/* Paired-end */
var fr: Boolean = config("fr", default = false)
var rf: Boolean = config("rf", default = false)
var ff: Boolean = config("ff", default = false)
var noMixed: Boolean = config("no_mixed", default = false)
var noDiscordant: Boolean = config("no_discordant", default = false)
/* Output */
var time: Boolean = config("no_overlap", default = false)
var un: Option[String] = config("un")
var al: Option[String] = config("al")
var unConc: Option[String] = config("un_conc")
var alConc: Option[String] = config("al_conc")
var unGz: Option[String] = config("un_gz")
var alGz: Option[String] = config("al_gz")
var unConcGz: Option[String] = config("un_conc_gz")
var alConcGz: Option[String] = config("al_conc_gz")
var unBz2: Option[String] = config("un_bz2")
var alBz2: Option[String] = config("al_bz2")
var unConcBz2: Option[String] = config("un_conc_bz2")
var alConcBz2: Option[String] = config("al_conc_bz2")
var quiet: Boolean = config("quiet", default = false)
var metFile: Option[String] = config("met_file")
var metStderr: Boolean = config("met_stderr", default = false)
var met: Option[Int] = config("met")
var noHead: Boolean = config("no_head", default = false)
var noSq: Boolean = config("no_sq", default = false)
var rgId: Option[String] = config("rg_id")
var rg: List[String] = config("rg", default = Nil)
var omitSecSeq: Boolean = config("omit_sec_seq", default = false)
/* Performance */
var offrate: Option[Int] = config("offrate")
var reorder: Boolean = config("reorder", default = false)
var mm: Boolean = config("mm", default = false)
/* Other */
var qcFilter: Boolean = config("qc_filter", default = false)
var seed: Option[Int] = config("seed")
var nonDeterministic: Boolean = config("non_deterministic", default = false)
var removeChrName: Boolean = config("remove_chrname", default = false)
var addChrName: Boolean = config("add_chrname", default = false)
override def beforeGraph() {
super.beforeGraph()
val indexDir = new File(hisat2Index).getParentFile
val basename = hisat2Index.stripPrefix(indexDir.getPath + File.separator)
if (indexDir.exists()) {
if (!indexDir.list()
.toList
.filter(_.startsWith(basename))
.exists(_.endsWith(".ht2")))
Logging.addError(s"No index files found for hisat2 in: $indexDir with basename: $basename")
}
}
/** return commandline to execute */
def cmdLine = required(executable) +
conditional(q, "-q") +
conditional(qseq, "--qseq") +
conditional(f, "-f") +
conditional(r, "-r") +
conditional(c, "-c") +
optional("--skip", skip) +
optional("--upto", upto) +
optional("--trim3", trim3) +
optional("--trim5", trim5) +
conditional(phred33, "--phred33") +
conditional(phred64, "--phred64") +
conditional(intQuals, "--int-quals") +
/* Alignment options */
optional("--n-ceil", nCeil) +
conditional(ignoreQuals, "--ignore-quals") +
conditional(nofw, "--nofw") +
conditional(norc, "--norc") +
/* Spliced alignment */
optional("--pen-cansplice", penCansplice) +
optional("--pen-noncansplice", penNoncansplice) +
optional("--pen-canintronlen", penCanintronlen) +
optional("--pen-noncanintronlen", penNoncanintronlen) +
optional("--min-intronlen", minIntronlen) +
optional("--max-intronlen", maxIntronlen) +
optional("--known-splicesite-infile", knownSplicesiteInfile) +
optional("--novel-splicesite-outfile", novelSplicesiteOutfile) +
optional("--novel-splicesite-infile", novelSplicesiteInfile) +
conditional(noTempSplicesite, "--no-temp-splicesite") +
conditional(noSplicedAlignment, "--no-spliced-alignment") +
optional("--rna-strandness", rnaStrandness) +
conditional(tmo, "--tmo") +
conditional(dta, "--dta") +
conditional(dtaCufflinks, "--dta-cufflinks") +
/* Scoring */
optional("--ma", ma) +
optional("--mp", mp) +
optional("--np", np) +
optional("--sp", sp) +
optional("--rdg", rdg) +
optional("--rfg", rfg) +
optional("--score-min", scoreMin) +
/* Reporting */
optional("-k", k) +
optional("--all", all) +
/* Paired End */
conditional(fr, "--fr") +
conditional(rf, "--rf") +
conditional(ff, "--ff") +
conditional(noMixed, "--no-mixed") +
conditional(noDiscordant, "--no-discordant") +
/* Output */
conditional(time, "--time") +
optional("--un", un) +
optional("--al", al) +
optional("--un-conc", unConc) +
optional("--al-conc", alConc) +
optional("--un-gz", unGz) +
optional("--al-gz", alGz) +
optional("--un-conc-gz", unConcGz) +
optional("--al-conc-gz", alConcGz) +
optional("--un-bz2", unBz2) +
optional("--al-bz2", alBz2) +
optional("--un-conc-bz2", unConcBz2) +
optional("--al-conc-bz2", alConcBz2) +
conditional(quiet, "--quiet") +
optional("--met-file", metFile) +
conditional(metStderr, "--met-stderr") +
optional("--met", met) +