diff --git a/bammetrics/src/main/resources/nl/lumc/sasc/biopet/pipelines/bammetrics/alignmentSummary.ssp b/bammetrics/src/main/resources/nl/lumc/sasc/biopet/pipelines/bammetrics/alignmentSummary.ssp index 2ac9b5098e1176cc56f097700928eddc1b6d8f88..e836fb2faa975ac8d43afcf363859bef601a1252 100644 --- a/bammetrics/src/main/resources/nl/lumc/sasc/biopet/pipelines/bammetrics/alignmentSummary.ssp +++ b/bammetrics/src/main/resources/nl/lumc/sasc/biopet/pipelines/bammetrics/alignmentSummary.ssp @@ -63,6 +63,7 @@ #if (!sampleLevel) <th>Library</th> #end <th>Total</th> <th>Mapped</th> + <th>Secondary</th> <th>(%)</th> <th>Duplicates</th> <th>(%)</th> @@ -85,12 +86,14 @@ val total = summary.getValue((prefixPath ::: List("biopet_flagstat", "All")):_*).getOrElse(0L).asInstanceOf[Long] val mapped = summary.getValue((prefixPath ::: List("biopet_flagstat", "Mapped")):_*).getOrElse(0L).asInstanceOf[Long] val duplicates = summary.getValue((prefixPath ::: List("biopet_flagstat", "Duplicates")):_*).getOrElse(0L).asInstanceOf[Long] + val secondary = summary.getValue((prefixPath ::: List("biopet_flagstat", "NotPrimaryAlignment")):_*).getOrElse(0L).asInstanceOf[Long] }# <td>${total}</td> <td>${mapped}</td> - <td>${mapped.toDouble / total * 100}%</td> + <td>${secondary}</td> + <td>${(mapped - secondary).toDouble / (total - secondary) * 100}%</td> <td>${duplicates}</td> - <td>${duplicates.toDouble / total * 100}%</td> + <td>${duplicates.toDouble / (total - secondary) * 100}%</td> </tr> #end #end diff --git a/bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetrics.scala b/bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetrics.scala index 73704c60568ab3b739b16a3b1ad556f78cf5d0fd..2a425011f09be542e693179e68caa67a9d280344 100644 --- a/bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetrics.scala +++ b/bammetrics/src/main/scala/nl/lumc/sasc/biopet/pipelines/bammetrics/BamMetrics.scala @@ -17,14 +17,14 @@ package 
nl.lumc.sasc.biopet.pipelines.bammetrics import java.io.File import nl.lumc.sasc.biopet.core.annotations.{ AnnotationRefFlat, RibosomalRefFlat } -import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, Reference, SampleLibraryTag } -import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsCoverage, BedtoolsIntersect } +import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsCoverage, BedtoolsIntersect, BedtoolsSort } import nl.lumc.sasc.biopet.extensions.picard._ import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFlagstat -import nl.lumc.sasc.biopet.pipelines.bammetrics.scripts.CoverageStats import nl.lumc.sasc.biopet.extensions.tools.BiopetFlagstat +import nl.lumc.sasc.biopet.pipelines.bammetrics.scripts.CoverageStats +import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.intervals.BedCheck import org.broadinstitute.gatk.queue.QScript @@ -41,6 +41,8 @@ class BamMetrics(val root: Configurable) extends QScript @Input(doc = "Bam File", shortName = "BAM", required = true) var inputBam: File = _ + override def defaults = Map("bedtoolscoverage" -> Map("sorted" -> true)) + /** return location of summary file */ def summaryFile = (sampleId, libId) match { case (Some(s), Some(l)) => new File(outputDir, s + "-" + l + ".BamMetrics.summary.json") @@ -164,7 +166,11 @@ class BamMetrics(val root: Configurable) extends QScript addSummarizable(biopetFlagstatLoose, targetName + "_biopet_flagstat_loose") add(new BiopetFifoPipe(this, List(biLoose, biopetFlagstatLoose))) - val bedCov = BedtoolsCoverage(this, intervals.bed, inputBam, depth = true) + val sorter = new BedtoolsSort(this) + sorter.input = intervals.bed + sorter.output = swapExt(targetDir, intervals.bed, ".bed", ".sorted.bed") + add(sorter) + val bedCov = BedtoolsCoverage(this, sorter.output, inputBam, depth = true) val covStats = CoverageStats(this, targetDir, 
inputBam.getName.stripSuffix(".bam") + ".coverage") covStats.title = Some("Coverage Plot") covStats.subTitle = Some(s"for file '$targetName.bed'") diff --git a/biopet-core/pom.xml b/biopet-core/pom.xml index b032f149372e5fb8fc8eb50c92ad6808c33749a4..2debfb9c8d76ad660cf363492f70868f04ec1808 100644 --- a/biopet-core/pom.xml +++ b/biopet-core/pom.xml @@ -56,7 +56,7 @@ <dependency> <groupId>org.broadinstitute.gatk</groupId> <artifactId>gatk-queue</artifactId> - <version>3.5</version> + <version>3.6</version> <exclusions> <exclusion> <groupId>org.broadinstitute.gatk</groupId> @@ -67,7 +67,7 @@ <dependency> <groupId>org.broadinstitute.gatk</groupId> <artifactId>gatk-queue-extensions-public</artifactId> - <version>3.5</version> + <version>3.6</version> </dependency> <dependency> <groupId>org.scalatra.scalate</groupId> diff --git a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala index 5164eb25cadbe0956044d76af414cb2f282fac77..14689457c5aa6a2ef649fb13e97069712ef455f3 100644 --- a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala +++ b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/BiopetQScript.scala @@ -69,10 +69,14 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => outputDir = outputDir.getAbsoluteFile init() biopetScript() + logger.info("Biopet script done") - if (disableScatter) for (function <- functions) function match { - case f: ScatterGatherableFunction => f.scatterCount = 1 - case _ => + if (disableScatter) { + logger.info("Disable scatters") + for (function <- functions) function match { + case f: ScatterGatherableFunction => f.scatterCount = 1 + case _ => + } } this match { @@ -81,6 +85,7 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => case _ => reportClass.foreach(add(_)) } + logger.info("Running pre commands") for (function <- functions) function match { case f: 
BiopetCommandLineFunction => f.preProcessExecutable() @@ -94,7 +99,8 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => globalConfig.writeReport(qSettings.runName, new File(outputDir, ".log/" + qSettings.runName)) else Logging.addError("Parent of output dir: '" + outputDir.getParent + "' is not writeable, output directory cannot be created") - inputFiles.foreach { i => + logger.info("Checking input files") + inputFiles.par.foreach { i => if (!i.file.exists()) Logging.addError(s"Input file does not exist: ${i.file}") if (!i.file.canRead) Logging.addError(s"Input file can not be read: ${i.file}") if (!i.file.isAbsolute) Logging.addError(s"Input file should be an absolute path: ${i.file}") @@ -111,6 +117,7 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript => if (logger.isDebugEnabled) WriteDependencies.writeDependencies(functions, new File(outputDir, s".log/${qSettings.runName}.deps.json")) Logging.checkErrors() + logger.info("Script complete without errors") } /** Get implemented from org.broadinstitute.gatk.queue.QScript */ diff --git a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala index 68a68c51f528e8a3d6f1059cfbfb83438626c522..8add8ff772169863d2410b6f217663f29be2d0f5 100644 --- a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala +++ b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/MultiSampleQScript.scala @@ -24,10 +24,10 @@ import org.broadinstitute.gatk.queue.QScript /** This trait creates a structured way of use multisample pipelines */ trait MultiSampleQScript extends SummaryQScript { qscript: QScript => - @Argument(doc = "Only Sample", shortName = "s", required = false, fullName = "sample") + @Argument(doc = "Only Process This Sample", shortName = "s", required = false, fullName = "sample") private[core] val onlySamples: List[String] = Nil - 
require(globalConfig.map.contains("samples"), "No Samples found in config") + if (!globalConfig.map.contains("samples")) Logging.addError("No Samples found in config") /** Sample class with basic functions build in */ abstract class AbstractSample(val sampleId: String) extends Summarizable { sample => @@ -190,7 +190,7 @@ trait MultiSampleQScript extends SummaryQScript { qscript: QScript => val samples: Map[String, Sample] = sampleIds.map(id => id -> makeSample(id)).toMap /** Returns a list of all sampleIDs */ - protected def sampleIds: Set[String] = ConfigUtils.any2map(globalConfig.map("samples")).keySet + protected def sampleIds: Set[String] = ConfigUtils.any2map(globalConfig.map.getOrElse("samples", Map())).keySet protected lazy val nameRegex = """^[a-zA-Z0-9][a-zA-Z0-9-_]+[a-zA-Z0-9]$""".r protected lazy val nameError = "has an invalid name. " + diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala index 78b37da50cceae2845ea84ea773d17381572f2cf..cb231e7b322943598eed38ca4bce34f719c13567 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/VariantEffectPredictor.scala @@ -155,7 +155,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu super.beforeGraph() if (!cache && !database) { Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor") - } else if (cache && dir.isEmpty) { + } else if (cache && dir.isEmpty && dirCache.isEmpty) { Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor") } if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt") diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsCoverage.scala 
b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsCoverage.scala index da42ffc028b21ea299ff6b9a13bc63717d8b0e3e..3570bcb092082fce602eb8f0f622e4cc5f65ff02 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsCoverage.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsCoverage.scala @@ -14,13 +14,16 @@ */ package nl.lumc.sasc.biopet.extensions.bedtools -import java.io.File +import java.io.{ File, PrintWriter } +import nl.lumc.sasc.biopet.core.Reference import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } +import scala.io.Source + /** Extension for bedtools coverage */ -class BedtoolsCoverage(val root: Configurable) extends Bedtools { +class BedtoolsCoverage(val root: Configurable) extends Bedtools with Reference { @Input(doc = "Input file (bed/gff/vcf/bam)") var input: File = null @@ -40,6 +43,9 @@ class BedtoolsCoverage(val root: Configurable) extends Bedtools { @Argument(doc = "diffStrand", required = false) var diffStrand: Boolean = false + @Argument(doc = "sorted", required = false) + var sorted: Boolean = config("sorted", default = false, freeVar = false) + override def defaultCoreMemory = 4.0 /** Returns command to execute */ @@ -49,7 +55,10 @@ class BedtoolsCoverage(val root: Configurable) extends Bedtools { conditional(depth, "-d") + conditional(sameStrand, "-s") + conditional(diffStrand, "-S") + + conditional(sorted, "-sorted") + + (if (sorted) required("-g", BedtoolsCoverage.getGenomeFile(referenceFai, jobTempDir)) else "") + (if (outputAsStsout) "" else " > " + required(output)) + } object BedtoolsCoverage { @@ -65,4 +74,27 @@ object BedtoolsCoverage { bedtoolsCoverage.diffStrand = diffStrand bedtoolsCoverage } + + private var genomeCache: Map[(File, File), File] = Map() + + def getGenomeFile(fai: File, dir: File): File = { + if (!genomeCache.contains((fai, dir))) 
genomeCache += (fai, dir) -> createGenomeFile(fai, dir) + genomeCache((fai, dir)) + } + + /** + * Creates the genome file. i.e. the first two columns of the fasta index + * @return + */ + def createGenomeFile(fai: File, dir: File): File = { + val tmp = File.createTempFile(fai.getName, ".genome", dir) + tmp.deleteOnExit() + val writer = new PrintWriter(tmp) + Source.fromFile(fai). + getLines(). + map(s => s.split("\t").take(2).mkString("\t")). + foreach(f => writer.println(f)) + writer.close() + tmp + } } \ No newline at end of file diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsSort.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsSort.scala new file mode 100644 index 0000000000000000000000000000000000000000..2af292cc67dae5f907e1f799cdb040190ffc2869 --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/bedtools/BedtoolsSort.scala @@ -0,0 +1,27 @@ +package nl.lumc.sasc.biopet.extensions.bedtools + +import java.io.File + +import nl.lumc.sasc.biopet.core.Reference +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } + +/** + * Created by Sander Bollen on 26-5-16. 
+ */ +class BedtoolsSort(val root: Configurable) extends Bedtools with Reference { + + @Input + var input: File = null + + @Output + var output: File = null + + @Argument(required = false) + var faidx: File = referenceFai + + def cmdLine = required(executable) + required("sort") + required("-i", input) + + optional("-faidx", faidx) + + (if (outputAsStsout) "" else " > " + required(output)) + +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala index a13b3b236e3b3cbc2555e5f8a7294215dbc4d39a..5551164ba956181e0a93ef08969ebe48ab4e0f7b 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/BaseRecalibrator.scala @@ -16,17 +16,16 @@ package nl.lumc.sasc.biopet.extensions.gatk import java.io.File +import nl.lumc.sasc.biopet.core.ScatterGatherableFunction import nl.lumc.sasc.biopet.utils.VcfUtils import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile -import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, _ } +import org.broadinstitute.gatk.utils.commandline.{ Argument, Gather, Output, Input } -//TODO: check gathering -class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with ScatterGatherableFunction */ { +class BaseRecalibrator(val root: Configurable) extends CommandLineGATK with ScatterGatherableFunction { def analysis_type = "BaseRecalibrator" - //TODO: check gathering - //scatterClass = classOf[ContigScatterFunction] - //setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } + scatterClass = classOf[ContigScatterFunction] + setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = false } /** A database of known polymorphic 
sites */ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false, exclusiveOf = "", validation = "") @@ -38,7 +37,7 @@ class BaseRecalibrator(val root: Configurable) extends CommandLineGATK /* with S /** The output recalibration table file to create */ @Output(fullName = "out", shortName = "o", doc = "The output recalibration table file to create", required = true, exclusiveOf = "", validation = "") //TODO: check gathering - //@Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer]) + @Gather(classOf[org.broadinstitute.gatk.engine.recalibration.BQSRGatherer]) var out: File = _ /** One or more covariates to be used in the recalibration. Can be specified multiple times */ diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala index 2b87af30dd320372e784b0bacd2556e1ba0e9b86..ef97ed813055cd483b08ce41f476eb61a70914f4 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala @@ -58,6 +58,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction]) var log_to_file: File = _ + override def defaultCoreMemory = 4.0 + override def beforeGraph() = { super.beforeGraph() if (reference == null) reference = referenceFasta() diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala index cb9de9af6849d3ef088caeead163413da7f2c0b4..c1ccd37c476c0d55d8e3f1c4741c7c47a8fe8b69 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala +++ 
b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/HaplotypeCaller.scala @@ -90,7 +90,7 @@ class HaplotypeCaller(val root: Configurable) extends CommandLineGATK with Scatt /** Mode for emitting reference confidence scores */ @Argument(fullName = "emitRefConfidence", shortName = "ERC", doc = "Mode for emitting reference confidence scores", required = false, exclusiveOf = "", validation = "") - var emitRefConfidence: String = _ + var emitRefConfidence: Option[String] = config("emitRefConfidence") /** File to which assembled haplotypes should be written */ @Output(fullName = "bamOutput", shortName = "bamout", doc = "File to which assembled haplotypes should be written", required = false, exclusiveOf = "", validation = "") @@ -522,12 +522,4 @@ object HaplotypeCaller { hc.out = outputFile hc } - - def gvcf(root: Configurable, inputFile: File, outputFile: File): HaplotypeCaller = { - val hc = apply(root, List(inputFile), outputFile) - hc.emitRefConfidence = "GVCF" - hc.variant_index_type = Some("LINEAR") - hc.variant_index_parameter = Some(hc.config("variant_index_parameter", default = 128000).asInt) - hc - } } diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2.scala new file mode 100644 index 0000000000000000000000000000000000000000..a288fb4996cdcec4b5878d04483779d7486a1619 --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2.scala @@ -0,0 +1,259 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. 
+ * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.hisat + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version } +import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Extension for hisat2 + * + * Based on version 2.0.4 + */ +class Hisat2(val root: Configurable) extends BiopetCommandLineFunction with Reference with Version { + + // TODO: handle --sra-acc flag. This is currently unsupported by the wrapper. + + @Input(doc = "Fastq file R1", shortName = "R1") + var R1: File = null + + @Input(doc = "Fastq file R2", shortName = "R2", required = false) + var R2: Option[File] = None + + @Output(doc = "Output file SAM", shortName = "output", required = true) + var output: File = null + + executable = config("exe", default = "hisat2", freeVar = false) + def versionRegex = """.*hisat2-align-s version (.*)""".r + override def versionExitcode = List(0, 1) + def versionCommand = executable + " --version" + + override def defaultCoreMemory = 4.0 + override def defaultThreads = 4 + + /* Required */ + var hisat2Index: String = config("hisat2_index") + + /* Input options */ + var q: Boolean = config("q", default = false) + var qseq: Boolean = config("qseq", default = false) + var f: Boolean = config("f", default = false) + var r: Boolean = config("r", default = false) + var c: Boolean = config("c", default = false) + var skip: Option[Int] = config("skip") + var upto: Option[Int] = config("upto") + var trim5: Option[Int] = 
config("trim5") + var trim3: Option[Int] = config("trim3") + var phred33: Boolean = config("phred33", default = false) + var phred64: Boolean = config("phred64", default = false) + var intQuals: Boolean = config("int_quals", default = false) + + /* Alignment options */ + var nCeil: Option[String] = config("n_ceil") + var ignoreQuals: Boolean = config("ignore_quals", default = false) + var nofw: Boolean = config("nofw", default = false) + var norc: Boolean = config("norc", default = false) + + /* Spliced alignment */ + var penCansplice: Option[Int] = config("pen_cansplice") + var penNoncansplice: Option[Int] = config("pen_noncansplice") + var penCanintronlen: Option[Int] = config("pen_canintronlen") + var penNoncanintronlen: Option[Int] = config("pen_noncanintronlen") + var minIntronlen: Option[Int] = config("min_intronlen") + var maxIntronlen: Option[Int] = config("max_intronlen") + var knownSplicesiteInfile: Option[File] = config("known_splicesite_infile") + var novelSplicesiteOutfile: Option[File] = config("novel_splicesite_outfile") + var novelSplicesiteInfile: Option[File] = config("novel_splicesite_infile") + var noTempSplicesite: Boolean = config("no_temp_splicesite", default = false) + var noSplicedAlignment: Boolean = config("no_spliced_alignment", default = false) + var rnaStrandness: Option[String] = config("rna_strandness") + var tmo: Boolean = config("tmo", default = false) + var dta: Boolean = config("dta", default = false) + var dtaCufflinks: Boolean = config("dta_cufflinks", default = false) + + /* Scoring */ + var ma: Option[Int] = config("ma") + var mp: Option[String] = config("mp") // TODO: should be (Int, Int) + var sp: Option[String] = config("sp") // TODO: should be (Int, Int) + var np: Option[Int] = config("np") + var rdg: Option[String] = config("rdg") // TODO: should be (Int, Int) + var rfg: Option[String] = config("rfg") // TODO: should be (Int, Int) + var scoreMin: Option[String] = config("score_min") + + /* Reporting */ + var k: 
Option[Int] = config("k") + var all: Option[Int] = config("all") + + /* Paired-end */ + var fr: Boolean = config("fr", default = false) + var rf: Boolean = config("rf", default = false) + var ff: Boolean = config("ff", default = false) + var noMixed: Boolean = config("no_mixed", default = false) + var noDiscordant: Boolean = config("no_discordant", default = false) + + /* Output */ + var time: Boolean = config("time", default = false) + + var un: Option[String] = config("un") + var al: Option[String] = config("al") + var unConc: Option[String] = config("un_conc") + var alConc: Option[String] = config("al_conc") + + var unGz: Option[String] = config("un_gz") + var alGz: Option[String] = config("al_gz") + var unConcGz: Option[String] = config("un_conc_gz") + var alConcGz: Option[String] = config("al_conc_gz") + + var unBz2: Option[String] = config("un_bz2") + var alBz2: Option[String] = config("al_bz2") + var unConcBz2: Option[String] = config("un_conc_bz2") + var alConcBz2: Option[String] = config("al_conc_bz2") + + var quiet: Boolean = config("quiet", default = false) + var metFile: Option[String] = config("met_file") + var metStderr: Boolean = config("met_stderr", default = false) + var met: Option[Int] = config("met") + + var noHead: Boolean = config("no_head", default = false) + var noSq: Boolean = config("no_sq", default = false) + + var rgId: Option[String] = config("rg_id") + var rg: List[String] = config("rg", default = Nil) + + var omitSecSeq: Boolean = config("omit_sec_seq", default = false) + + /* Performance */ + var offrate: Option[Int] = config("offrate") + var reorder: Boolean = config("reorder", default = false) + var mm: Boolean = config("mm", default = false) + + /* Other */ + var qcFilter: Boolean = config("qc_filter", default = false) + var seed: Option[Int] = config("seed") + var nonDeterministic: Boolean = config("non_deterministic", default = false) + var removeChrName: Boolean = config("remove_chrname", default = false) + var 
addChrName: Boolean = config("add_chrname", default = false) + + override def beforeGraph() { + super.beforeGraph() + val indexDir = new File(hisat2Index).getParentFile + val basename = hisat2Index.stripPrefix(indexDir.getPath + File.separator) + if (indexDir.exists()) { + if (!indexDir.list() + .toList + .filter(_.startsWith(basename)) + .exists(_.endsWith(".ht2"))) + Logging.addError(s"No index files found for hisat2 in: $indexDir with basename: $basename") + } + } + + /** return commandline to execute */ + def cmdLine = required(executable) + + conditional(q, "-q") + + conditional(qseq, "--qseq") + + conditional(f, "-f") + + conditional(r, "-r") + + conditional(c, "-c") + + optional("--skip", skip) + + optional("--upto", upto) + + optional("--trim3", trim3) + + optional("--trim5", trim5) + + conditional(phred33, "--phred33") + + conditional(phred64, "--phred64") + + conditional(intQuals, "--int-quals") + + /* Alignment options */ + optional("--n-ceil", nCeil) + + conditional(ignoreQuals, "--ignore-quals") + + conditional(nofw, "--nofw") + + conditional(norc, "--norc") + + /* Spliced alignment */ + optional("--pen-cansplice", penCansplice) + + optional("--pen-noncansplice", penNoncansplice) + + optional("--pen-canintronlen", penCanintronlen) + + optional("--pen-noncanintronlen", penNoncanintronlen) + + optional("--min-intronlen", minIntronlen) + + optional("--max-intronlen", maxIntronlen) + + optional("--known-splicesite-infile", knownSplicesiteInfile) + + optional("--novel-splicesite-outfile", novelSplicesiteOutfile) + + optional("--novel-splicesite-infile", novelSplicesiteInfile) + + conditional(noTempSplicesite, "--no-temp-splicesite") + + conditional(noSplicedAlignment, "--no-spliced-alignment") + + optional("--rna-strandness", rnaStrandness) + + conditional(tmo, "--tmo") + + conditional(dta, "--dta") + + conditional(dtaCufflinks, "--dta-cufflinks") + + /* Scoring */ + optional("--ma", ma) + + optional("--mp", mp) + + optional("--np", np) + + optional("--sp", 
sp) + + optional("--rdg", rdg) + + optional("--rfg", rfg) + + optional("--score-min", scoreMin) + + /* Reporting */ + optional("-k", k) + + optional("--all", all) + + /* Paired End */ + conditional(fr, "--fr") + + conditional(rf, "--rf") + + conditional(ff, "--ff") + + conditional(noMixed, "--no-mixed") + + conditional(noDiscordant, "--no-discordant") + + /* Output */ + conditional(time, "--time") + + optional("--un", un) + + optional("--al", al) + + optional("--un-conc", unConc) + + optional("--al-conc", alConc) + + optional("--un-gz", unGz) + + optional("--al-gz", alGz) + + optional("--un-conc-gz", unConcGz) + + optional("--al-conc-gz", alConcGz) + + optional("--un-bz2", unBz2) + + optional("--al-bz2", alBz2) + + optional("--un-conc-bz2", unConcBz2) + + optional("--al-conc-bz2", alConcBz2) + + conditional(quiet, "--quiet") + + optional("--met-file", metFile) + + conditional(metStderr, "--met-stderr") + + optional("--met", met) + + conditional(noHead, "--no-head") + + conditional(noSq, "--no-sq") + + optional("--rg-id", rgId) + + repeat("--rg", rg) + + conditional(omitSecSeq, "--omit-sec-seq") + + /* Performance */ + optional("--offrate", offrate) + + optional("--threads", threads) + + conditional(reorder, "--reorder") + + conditional(mm, "--mm") + + /* Other */ + conditional(qcFilter, "--qc-filter") + + optional("--seed", seed) + + conditional(nonDeterministic, "--non-deterministic") + + conditional(removeChrName, "--remove-chrname") + + conditional(addChrName, "--add-chrname") + + /* Required */ + required("-x", hisat2Index) + + (R2 match { + case Some(r2) => required("-1", R1) + optional("-2", r2) + case otherwise => required("-U", R1) + }) + + (if (outputAsStsout) "" else required("-S", output)) +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala new file mode 100644 index 
0000000000000000000000000000000000000000..0964fe9372ddd4c58a7d3b3921ad0a9f1f6fce5b --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/hisat/Hisat2Build.scala @@ -0,0 +1,82 @@ +package nl.lumc.sasc.biopet.extensions.hisat + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Reference, Version } +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.Input + +/** + * Created by pjvan_thof on 7-6-16. + */ +class Hisat2Build(val root: Configurable) extends BiopetCommandLineFunction with Version { + + @Input(required = true) + var inputFasta: File = _ + + var hisat2IndexBase: String = _ + + @Input(required = false) + var snp: Option[File] = config("snp") + + @Input(required = false) + var haplotype: Option[File] = config("haplotype") + + @Input(required = false) + var ss: Option[File] = config("ss") + + @Input(required = false) + var exon: Option[File] = config("exon") + + executable = config("exe", default = "hisat2-build", freeVar = false) + def versionRegex = """.*hisat2-build-s version (.*)""".r + override def versionExitcode = List(0, 1) + def versionCommand = executable + " --version" + + var bmax: Option[Int] = config("bmax") + var bmaxdivn: Option[Int] = config("bmaxdivn") + var dcv: Option[Int] = config("dcv") + var offrate: Option[Int] = config("offrate") + var ftabchars: Option[Int] = config("ftabchars") + var localoffrate: Option[Int] = config("localoffrate") + var localftabchars: Option[Int] = config("localftabchars") + var seed: Option[Int] = config("seed") + + var largeIndex: Boolean = config("large_index", default = false) + var memoryFitting: Boolean = config("memory_fitting", default = false) + var nodc: Boolean = config("nodc", default = false) + var noref: Boolean = config("noref", default = false) + var justref: Boolean = config("justref", default = false) + var quiet: Boolean = config("quiet", default = false) + + override def 
beforeGraph(): Unit = { + super.beforeGraph() + val indexDir = new File(hisat2IndexBase).getParentFile + val indexName = new File(hisat2IndexBase).getName + jobOutputFile = new File(indexDir, s".$indexName.hisat2-build.out") + } + + def cmdLine = required(executable) + + optional("-p", threads) + + optional("--bmax", bmax) + + optional("--bmaxdivn", bmaxdivn) + + optional("--dcv", dcv) + + optional("--offrate", offrate) + + optional("--ftabchars", ftabchars) + + optional("--localoffrate", localoffrate) + + optional("--localftabchars", localftabchars) + + optional("--seed", seed) + + optional("--snp", snp) + + optional("--haplotype", haplotype) + + optional("--ss", ss) + + optional("--exon", exon) + + conditional(largeIndex, "--large-index") + + conditional(memoryFitting, "--memory-fitting") + + conditional(nodc, "--nodc") + + conditional(noref, "--noref") + + conditional(justref, "--justref") + + conditional(quiet, "--quiet") + + required(inputFasta) + + required(hisat2IndexBase) + +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectMultipleMetrics.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectMultipleMetrics.scala index 1dba561d044f5a66d8a762054861b1f3e8f16bd9..9ea2f61f23afef28d870cf80416e12fdb8df66d2 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectMultipleMetrics.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/CollectMultipleMetrics.scala @@ -87,30 +87,31 @@ class CollectMultipleMetrics(val root: Configurable) extends Picard with Summari repeat("PROGRAM=", program, spaceSeparated = false) override def addToQscriptSummary(qscript: SummaryQScript, name: String): Unit = { - program.foreach(p => { - val stats: Any = p match { - case _ if p == Programs.CollectAlignmentSummaryMetrics.toString => - Picard.getMetrics(new File(outputName + ".alignment_summary_metrics"), groupBy = Some("CATEGORY")) - case _ if p == 
Programs.CollectInsertSizeMetrics.toString => - Map( - "metrics" -> Picard.getMetrics(new File(outputName + ".insert_size_metrics")), - "histogram" -> Picard.getHistogram(new File(outputName + ".insert_size_metrics")) - ) - case _ if p == Programs.QualityScoreDistribution.toString => - Picard.getHistogram(new File(outputName + ".quality_distribution_metrics")) - case _ if p == Programs.MeanQualityByCycle.toString => - Picard.getHistogram(new File(outputName + ".quality_by_cycle_metrics")) - case _ if p == Programs.CollectBaseDistributionByCycle.toString => - Picard.getHistogram(new File(outputName + ".base_distribution_by_cycle_metrics"), tag = "METRICS CLASS") - case _ => None + program + .filterNot(_ == Programs.CollectInsertSizeMetrics.toString && !new File(outputName + ".insert_size_metrics").exists()) + .foreach { p => + val stats: Any = p match { + case _ if p == Programs.CollectAlignmentSummaryMetrics.toString => + Picard.getMetrics(new File(outputName + ".alignment_summary_metrics"), groupBy = Some("CATEGORY")) + case _ if p == Programs.CollectInsertSizeMetrics.toString => + Map( + "metrics" -> Picard.getMetrics(new File(outputName + ".insert_size_metrics")), + "histogram" -> Picard.getHistogram(new File(outputName + ".insert_size_metrics")) + ) + case _ if p == Programs.QualityScoreDistribution.toString => + Picard.getHistogram(new File(outputName + ".quality_distribution_metrics")) + case _ if p == Programs.MeanQualityByCycle.toString => + Picard.getHistogram(new File(outputName + ".quality_by_cycle_metrics")) + case _ if p == Programs.CollectBaseDistributionByCycle.toString => + Picard.getHistogram(new File(outputName + ".base_distribution_by_cycle_metrics"), tag = "METRICS CLASS") + case _ => None + } + val sum = new Summarizable { + override def summaryStats = stats + override def summaryFiles: Map[String, File] = Map() + } + qscript.addSummarizable(sum, p) } - val sum = new Summarizable { - override def summaryStats = stats - override def 
summaryFiles: Map[String, File] = Map() - } - qscript.addSummarizable(sum, p) - }) - } def summaryStats = Map() diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickClosedReferenceOtus.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickClosedReferenceOtus.scala index 91e88648574f8b828e2bedb0456c199e29d0dccb..565accc8d492550e1d8b16a616a67829a0dd0ec0 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickClosedReferenceOtus.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickClosedReferenceOtus.scala @@ -31,8 +31,8 @@ class PickClosedReferenceOtus(val root: Configurable) extends BiopetCommandLineF var outputDir: File = null - override def defaultThreads = 3 - override def defaultCoreMemory = 16.0 + override def defaultThreads = 1 + override def defaultCoreMemory = 20.0 def versionCommand = executable + " --version" def versionRegex = """Version: (.*)""".r @@ -51,7 +51,8 @@ class PickClosedReferenceOtus(val root: Configurable) extends BiopetCommandLineF var suppressTaxonomyAssignment: Boolean = config("suppress_taxonomy_assignment", default = false) def otuTable = new File(outputDir, "otu_table.biom") - def otuMap = new File(outputDir, "uclust_ref_picked_otus" + File.separator + "seqs_otus.txt") + def otuMap = new File(outputDir, "uclust_ref_picked_otus" + File.separator + + inputFasta.getName.stripSuffix(".fna").stripSuffix(".fasta").stripSuffix(".fa") + "_otus.txt") @Output private var outputFiles: List[File] = Nil diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickOpenReferenceOtus.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickOpenReferenceOtus.scala new file mode 100644 index 0000000000000000000000000000000000000000..9c2c367d5ec18a2e88e39f3684fa9e66ac1cfeee --- /dev/null +++ 
b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/qiime/PickOpenReferenceOtus.scala @@ -0,0 +1,88 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.qiime + +import java.io.File + +import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version } +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Created by pjvan_thof on 12/4/15. 
+ */ +class PickOpenReferenceOtus(val root: Configurable) extends BiopetCommandLineFunction with Version { + executable = config("exe", default = "pick_open_reference_otus.py") + + @Input(required = true) + var inputFasta: File = _ + + var outputDir: File = null + + override def defaultThreads = 1 + override def defaultCoreMemory = 20.0 + def versionCommand = executable + " --version" + def versionRegex = """Version: (.*)""".r + + @Input(required = false) + var parameterFp: Option[File] = config("parameter_fp") + + @Input(required = false) + var referenceFp: Option[File] = config("reference_fp") + + @Input(required = false) + var taxonomyFp: Option[File] = config("taxonomy_fp") + + var force: Boolean = config("force", default = false) + var printOnly: Boolean = config("print_only", default = false) + var suppressTaxonomyAssignment: Boolean = config("suppress_taxonomy_assignment", default = false) + var percentSubsample: Option[Double] = config("percent_subsample") + var prefilterPercentId: Option[Double] = config("prefilter_percent_id") + var suppressStep4: Boolean = config("suppress_step4", default = false) + var minOtuSize: Option[Int] = config("min_otu_size") + var suppressAlignAndTree: Boolean = config("suppress_align_and_tree", default = false) + + def otuTable = new File(outputDir, "otu_table_mc2_w_tax.biom") + def failedOtuTable = new File(outputDir, "otu_table_mc2_w_tax_no_pynast_failures.biom") + def otuMap = new File(outputDir, "final_otu_map.txt") + + @Output + private var outputFiles: List[File] = Nil + + override def beforeGraph(): Unit = { + super.beforeGraph() + jobOutputFile = new File(outputDir, ".std.out") + outputFiles ::= otuTable + outputFiles ::= failedOtuTable + outputFiles ::= otuMap + } + + def cmdLine = executable + required("-f") + + required("-i", inputFasta) + + required("-o", outputDir) + + optional("--reference_fp", referenceFp) + + optional("--parameter_fp", parameterFp) + + optional("--taxonomy_fp", taxonomyFp) + + 
conditional(force, "--force") + + conditional(printOnly, "--print_only") + + conditional(suppressTaxonomyAssignment, "--suppress_taxonomy_assignment") + + (if (threads > 1) required("-a") + required("-O", threads) else "") + + optional("--percent_subsample", percentSubsample) + + optional("--prefilter_percent_id", prefilterPercentId) + + conditional(suppressStep4, "--suppress_step4") + + optional("--min_otu_size", minOtuSize) + + conditional(suppressAlignAndTree, "--suppress_align_and_tree") + +} diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSample.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSample.scala new file mode 100644 index 0000000000000000000000000000000000000000..db7d32e61348e9cf5dcf7bbabcea7c51b9ef9f84 --- /dev/null +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/seqtk/SeqtkSample.scala @@ -0,0 +1,47 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.seqtk + +import java.io.File + +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Wrapper for the seqtk sample subcommand. + * Written based on seqtk version 1.0-r63-dirty. 
+ */ +class SeqtkSample(val root: Configurable) extends Seqtk { + + /** input file */ + @Input(doc = "Input file (FASTQ or FASTA)", required = true) + var input: File = _ + + /** output file */ + @Output(doc = "Output file", required = true) + var output: File = _ + + var s: Option[Int] = config("seed") + + var sample: Double = _ + + def cmdLine = required(executable) + + " sample " + + optional("-s", s) + + required(input) + + (if (sample > 1) required(sample.toInt) else required(sample)) + + (if (outputAsStsout) "" else " > " + required(output)) + +} diff --git a/biopet-extensions/src/test/resources/ref.fa.fai b/biopet-extensions/src/test/resources/ref.fa.fai new file mode 100644 index 0000000000000000000000000000000000000000..d7e459dd9559f020608cbb9144af1476240c247e --- /dev/null +++ b/biopet-extensions/src/test/resources/ref.fa.fai @@ -0,0 +1 @@ +chr1 9 6 9 10 \ No newline at end of file diff --git a/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/BedToolsTest.scala b/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/BedToolsTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..3096b818708c2648f0f3c3a5533344459d185b3c --- /dev/null +++ b/biopet-extensions/src/test/scala/nl/lumc/sasc/biopet/extensions/BedToolsTest.scala @@ -0,0 +1,35 @@ +package nl.lumc.sasc.biopet.extensions + +import java.io.File +import java.nio.file.Paths + +import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage +import nl.lumc.sasc.biopet.utils.config.{ Config, Configurable } +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +import scala.io.Source + +/** + * Created by Sander Bollen on 12-5-16. 
+ */ +class BedToolsTest extends TestNGSuite with Matchers { + + @Test + def testBedtoolsCoverageCreateGenomeFile() = { + val file = new File(Paths.get(this.getClass.getResource("/ref.fa.fai").toURI).toString) + val tmp = File.createTempFile("test", ".bed") + tmp.deleteOnExit() + class TestCov(override val root: Configurable) extends BedtoolsCoverage(root) { + jobTempDir = tmp + override def referenceFai = file + + def genome = BedtoolsCoverage.createGenomeFile(file, file.getParentFile) + } + val cov = new TestCov(null) + val genome = cov.genome + Source.fromFile(genome).getLines().mkString("\n") shouldBe "chr1\t9" + } + +} diff --git a/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala b/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala index 76ffcfc3c45cf88aca2fe2c36befb305e624ea99..25985115855087bd810116ea99ed1ce96cda02e2 100644 --- a/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala +++ b/biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala @@ -34,6 +34,7 @@ object BiopetExecutableMain extends BiopetExecutable { nl.lumc.sasc.biopet.pipelines.gears.GearsSingle, nl.lumc.sasc.biopet.pipelines.gears.Gears, nl.lumc.sasc.biopet.pipelines.gwastest.GwasTest, + nl.lumc.sasc.biopet.pipelines.gwastest.impute.Impute2Vcf, nl.lumc.sasc.biopet.pipelines.shiva.ShivaVariantcalling, nl.lumc.sasc.biopet.pipelines.basty.Basty, nl.lumc.sasc.biopet.pipelines.shiva.Shiva, diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/MergeOtuMaps.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/MergeOtuMaps.scala index d7abf1a8f04520febfbd122cb8883732d86586d9..7cdb85288d36e7124f83a0dc9d2518384d4381af 100644 --- a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/MergeOtuMaps.scala +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/MergeOtuMaps.scala @@ -31,9 +31,14 
@@ class MergeOtuMaps(val root: Configurable) extends ToolCommandFunction { @Output(doc = "Output", shortName = "output", required = true) var output: File = _ + var skipPrefix: List[String] = config("skip_prefix", default = Nil) + override def defaultCoreMemory = 6.0 - override def cmdLine = super.cmdLine + repeat("-I", input) + required("-o", output) + override def cmdLine = super.cmdLine + + repeat("-I", input) + + required("-o", output) + + repeat("-p", skipPrefix) } diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SeqStat.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SeqStat.scala index 5243327b367521e575025ac7b2d3093e872aa78a..a32216225f63ac2c8fbc4047d36716164de55a23 100644 --- a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SeqStat.scala +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SeqStat.scala @@ -36,7 +36,7 @@ class SeqStat(val root: Configurable) extends ToolCommandFunction with Summariza @Output(doc = "Output JSON", shortName = "output", required = true) var output: File = null - override def defaultCoreMemory = 2.5 + override def defaultCoreMemory = 4.0 override def cmdLine = super.cmdLine + required("-i", input) + required("-o", output) diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/ValidateFastq.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/ValidateFastq.scala index 37009ba2378398e441270603b53809bd3f01ef44..1ea41dd50ba72e833f166a0b51135f499d86316f 100644 --- a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/ValidateFastq.scala +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/ValidateFastq.scala @@ -29,6 +29,8 @@ class ValidateFastq(val root: Configurable) extends ToolCommandFunction { @Input(doc = "Input R1 fastq file", required = false) var r2Fastq: Option[File] = None 
+ override def defaultCoreMemory = 4.0 + override def cmdLine = super.cmdLine + required("-i", r1Fastq) + optional("-j", r2Fastq) diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfWithVcf.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfWithVcf.scala index f49fac4357ddd1cf2860e44a006c1554e8f44afd..138ab1acdcb68fac9f3be14502ad3fbc6bc44ef5 100644 --- a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfWithVcf.scala +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/VcfWithVcf.scala @@ -43,7 +43,7 @@ class VcfWithVcf(val root: Configurable) extends ToolCommandFunction with Refere var fields: List[(String, String, Option[String])] = List() - override def defaultCoreMemory = 2.0 + override def defaultCoreMemory = 4.0 override def beforeGraph() { super.beforeGraph() diff --git a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/WipeReads.scala b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/WipeReads.scala index 33fdcb06dfa04203d79bb8830e26968988f23beb..3b39efb21a4e517e5a014c45df06caa19087943a 100644 --- a/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/WipeReads.scala +++ b/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/WipeReads.scala @@ -18,9 +18,8 @@ import java.io.File import nl.lumc.sasc.biopet.core.ToolCommandFunction import nl.lumc.sasc.biopet.utils.config.Configurable -import org.broadinstitute.gatk.utils.commandline.{ Input, Output } +import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } -// TODO: finish implementation for usage in pipelines /** * WipeReads function class for usage in Biopet pipelines * @@ -36,9 +35,52 @@ class WipeReads(val root: Configurable) extends ToolCommandFunction { @Input(doc = "Interval file", shortName = "r", required = true) var intervalFile: File = null + 
@Argument(doc = "Minimum MAPQ of reads in target region to remove (default: 0)") + var minMapQ: Option[Int] = config("min_mapq") + + @Argument(doc = "Read group IDs to be removed (default: remove reads from all read groups)") + var readgroup: Set[String] = config("read_group", default = Nil) + + @Argument(doc = "Whether to remove multiple-mapped reads outside the target regions (default: yes)") + var limitRemoval: Boolean = config("limit_removal", default = false) + + @Argument(doc = "Whether to index output BAM file or not") + var noMakeIndex: Boolean = config("no_make_index", default = false) + + @Argument(doc = "GTF feature containing intervals (default: exon)") + var featureType: Option[String] = config("feature_type") + + @Argument(doc = "Expected maximum number of reads in target regions (default: 7e7)") + var bloomSize: Option[Long] = config("bloom_size") + + @Argument(doc = "False positive rate (default: 4e-7)") + var falsePositive: Option[Double] = config("false_positive") + @Output(doc = "Output BAM", shortName = "o", required = true) var outputBam: File = null + @Output(required = false) + private var outputIndex: Option[File] = None + @Output(doc = "BAM containing discarded reads", shortName = "f", required = false) - var discardedBam: File = null + var discardedBam: Option[File] = None + + override def beforeGraph() { + super.beforeGraph() + if (!noMakeIndex) outputIndex = Some(new File(outputBam.getPath.stripSuffix(".bam") + ".bai")) + } + + override def cmdLine = super.cmdLine + + required("-I", inputBam) + + required("-r", intervalFile) + + required("-o", outputBam) + + optional("-f", discardedBam) + + optional("-Q", minMapQ) + + repeat("-G", readgroup) + + conditional(limitRemoval, "--limit_removal") + + conditional(noMakeIndex, "--no_make_index") + + optional("-t", featureType) + + optional("--bloom_size", bloomSize) + + optional("--false_positive", falsePositive) + } diff --git a/biopet-tools/pom.xml b/biopet-tools/pom.xml index 
f2a5cfa6547ec9c95e732448b1d6be2f2c046a30..fc5f3d753583a3a097ce74dcfffad0f8111e623a 100644 --- a/biopet-tools/pom.xml +++ b/biopet-tools/pom.xml @@ -65,7 +65,7 @@ <dependency> <groupId>com.github.broadinstitute</groupId> <artifactId>picard</artifactId> - <version>1.141</version> + <version>2.4.1</version> </dependency> </dependencies> </project> \ No newline at end of file diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala index 4478b9f0ef2be289c6836f51e0ea87eff37c7816..c046226689d19262323650ec63925420dcac20b2 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala @@ -52,22 +52,22 @@ object AnnotateVcfWithBed extends ToolCommand { class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required () unbounded () valueName "<vcf file>" action { (x, c) => c.copy(inputFile = x) - } text "Input is a required file property" + } text "Input VCF file. Mandatory field" opt[File]('B', "bedFile") required () unbounded () valueName "<bed file>" action { (x, c) => c.copy(bedFile = x) - } text "Bedfile is a required file property" + } text "Input Bed file. Mandatory field" opt[File]('o', "output") required () unbounded () valueName "<vcf file>" action { (x, c) => c.copy(outputFile = x) - } text "out is a required file property" + } text "Output VCF file. 
Mandatory field" opt[String]('f', "fieldName") required () unbounded () valueName "<name of field in vcf file>" action { (x, c) => c.copy(fieldName = x) } text "Name of info field in new vcf file" - opt[String]('d', "fieldDescription") unbounded () valueName "<name of field in vcf file>" action { (x, c) => + opt[String]('d', "fieldDescription") unbounded () valueName "<description of field in vcf file>" action { (x, c) => c.copy(fieldDescription = x) } text "Description of field in new vcf file" - opt[String]('t', "fieldType") unbounded () valueName "<name of field in vcf file>" action { (x, c) => + opt[String]('t', "fieldType") unbounded () valueName "<type of field in vcf file>" action { (x, c) => c.copy(fieldType = x) - } text "Description of field in new vcf file" + } text "Type of field in new vcf file. Can be 'Integer', 'Flag', 'Character', 'Float'" } /** diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala index 8d3f7a0df21dd156d1e20da94b7a4e9541f2b80a..014718a9bdd62f15b7f9faf9a78864d3e3644c5b 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BaseCounter.scala @@ -38,13 +38,13 @@ object BaseCounter extends ToolCommand { class OptParser extends AbstractOptParser { opt[File]('r', "refFlat") required () valueName "<file>" action { (x, c) => c.copy(refFlat = x) - } + } text "refFlat file. Mandatory" opt[File]('o', "outputDir") required () valueName "<directory>" action { (x, c) => c.copy(outputDir = x) - } + } text "Output directory. Mandatory" opt[File]('b', "bam") required () valueName "<file>" action { (x, c) => c.copy(bamFile = x) - } + } text "Bam file. 
Mandatory" opt[String]('p', "prefix") valueName "<prefix>" action { (x, c) => c.copy(prefix = x) } diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala index a1180a38608911fc527c42aa36b71536a0a1d3ec..9cc8d76ba9bd45619795afde25147223861282b9 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala @@ -27,10 +27,10 @@ object BedtoolsCoverageToCounts extends ToolCommand { class OptParser extends AbstractOptParser { opt[File]('I', "input") required () valueName "<file>" action { (x, c) => c.copy(input = x) - } + } text "Coverage file produced with bedtools" opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) => c.copy(output = x) - } + } text "Output file name" } /** diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckAllelesVcfInBam.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckAllelesVcfInBam.scala index 68e548c12454ceb2ecbe8579c2540ba4fbfc0ead..5490de4cc70fc90384a61fc13601ff4af92c18d4 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckAllelesVcfInBam.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/CheckAllelesVcfInBam.scala @@ -47,19 +47,19 @@ object CheckAllelesVcfInBam extends ToolCommand { class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(inputFile = x) - } + } text "VCF file" opt[File]('o', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(outputFile = x) - } + } text "output VCF file name" opt[String]('s', "sample") unbounded () minOccurs 1 action { (x, c) => c.copy(samples = x :: c.samples) - } + } text "sample name" opt[File]('b', "bam") unbounded () minOccurs 1 
action { (x, c) => c.copy(bamFiles = x :: c.bamFiles) - } + } text "bam file, from which the variants (VCF files) were called" opt[Int]('m', "min_mapping_quality") maxOccurs 1 action { (x, c) => c.copy(minMapQual = c.minMapQual) - } + } text "minimum mapping quality score for a read to be taken into account" } private class CountReport( diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala index 2cbf52f44fbaf85495ea1af0bc58f28d17c1e0b3..13622ef41b3691b0959f43211ac232668d32f760 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala @@ -192,7 +192,7 @@ object ExtractAlignedFastq extends ToolCommand { opt[String]('r', "interval") required () unbounded () valueName "<interval>" action { (x, c) => // yes, we are appending and yes it's O(n) ~ preserving order is more important than speed here c.copy(intervals = c.intervals :+ x) - } text "Interval strings" + } text "Interval strings (e.g. chr1:1-100)" opt[File]('i', "in1") required () valueName "<fastq>" action { (x, c) => c.copy(inputFastq1 = x) @@ -220,7 +220,11 @@ object ExtractAlignedFastq extends ToolCommand { opt[Int]('s', "read_suffix_length") optional () action { (x, c) => c.copy(commonSuffixLength = x) - } text "Length of common suffix from each read pair (default: 0)" + } text + """Length of suffix mark from each read pair (default: 0). This is used for distinguishing read pairs with + different suffices. For example, if your FASTQ records end with `/1` for the first pair and `/2` for the + second pair, the value of `read_suffix_length` should be 2." 
+ """.stripMargin note( """ diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeOtuMaps.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeOtuMaps.scala index 989d7d9a8a6bd53cae624f5ab7eeff23cbae5db1..0a2a49c0030154d3e7e3263018ae07eae4650777 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeOtuMaps.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/MergeOtuMaps.scala @@ -24,7 +24,9 @@ import scala.io.Source * Created by pjvan_thof on 12/18/15. */ object MergeOtuMaps extends ToolCommand { - case class Args(inputFiles: List[File] = Nil, outputFile: File = null) extends AbstractArgs + case class Args(inputFiles: List[File] = Nil, + outputFile: File = null, + skipPrefix: List[String] = Nil) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "input") minOccurs 2 required () unbounded () valueName "<file>" action { (x, c) => @@ -33,6 +35,9 @@ object MergeOtuMaps extends ToolCommand { opt[File]('o', "output") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(outputFile = x) } + opt[String]('p', "skipPrefix") unbounded () valueName "<text>" action { (x, c) => + c.copy(skipPrefix = x :: c.skipPrefix) + } } /** @@ -40,23 +45,24 @@ object MergeOtuMaps extends ToolCommand { */ def main(args: Array[String]): Unit = { val argsParser = new OptParser - val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException) + val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException) - var map: Map[Long, String] = Map() + var map: Map[String, String] = Map() - for (inputFile <- commandArgs.inputFiles) { + for (inputFile <- cmdArgs.inputFiles) { logger.info(s"Start reading $inputFile") val reader = Source.fromFile(inputFile) reader.getLines().foreach { line => val values = line.split("\t", 2) - val key = values.head.toLong - map += key -> (line.stripPrefix(s"$key") + 
map.getOrElse(key, "")) + val key = values.head + if (!cmdArgs.skipPrefix.exists(key.startsWith)) + map += key -> (line.stripPrefix(s"$key") + map.getOrElse(key, "")) } reader.close() } - logger.info(s"Start writing to ${commandArgs.outputFile}") - val writer = new PrintWriter(commandArgs.outputFile) + logger.info(s"Start writing to ${cmdArgs.outputFile}") + val writer = new PrintWriter(cmdArgs.outputFile) map.foreach { case (key, list) => writer.println(key + list) } writer.close() } diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala index b22eb83e1e40b2096525d0c59f5a3fea9f30ff26..7a5ee564025d5fe90ed3f274a34669713f94ff0b 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala @@ -281,8 +281,17 @@ object VcfStats extends ToolCommand { val stats = createStats logger.info("Starting on: " + interval) + val query = reader.query(interval.getContig, interval.getStart, interval.getEnd) + if (!query.hasNext) { + mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags)) + for (sample <- samples) yield { + mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags)) + } + chunkCounter += 1 + } + for ( - record <- reader.query(interval.getContig, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd + record <- query if record.getStart <= interval.getEnd ) { mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags)) for (sample1 <- samples) yield { @@ -414,6 +423,59 @@ object VcfStats extends ToolCommand { } } + protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any, found: Boolean): Unit = { + val map = buffer.getOrElse(key, Map()) + if (found) buffer += key 
-> (map + (value -> (map.getOrElse(value, 0) + 1))) + else buffer += key -> (map + (value -> map.getOrElse(value, 0))) + } + + buffer += "QUAL" -> Map("not set" -> 1) + + addToBuffer("SampleDistribution-Het", "not set", found = false) + addToBuffer("SampleDistribution-HetNonRef", "not set", found = false) + addToBuffer("SampleDistribution-Hom", "not set", found = false) + addToBuffer("SampleDistribution-HomRef", "not set", found = false) + addToBuffer("SampleDistribution-HomVar", "not set", found = false) + addToBuffer("SampleDistribution-Mixed", "not set", found = false) + addToBuffer("SampleDistribution-NoCall", "not set", found = false) + addToBuffer("SampleDistribution-NonInformative", "not set", found = false) + addToBuffer("SampleDistribution-Available", "not set", found = false) + addToBuffer("SampleDistribution-Called", "not set", found = false) + addToBuffer("SampleDistribution-Filtered", "not set", found = false) + addToBuffer("SampleDistribution-Variant", "not set", found = false) + + addToBuffer("general", "Total", found = true) + addToBuffer("general", "Biallelic", false) + addToBuffer("general", "ComplexIndel", false) + addToBuffer("general", "Filtered", false) + addToBuffer("general", "FullyDecoded", false) + addToBuffer("general", "Indel", false) + addToBuffer("general", "Mixed", false) + addToBuffer("general", "MNP", false) + addToBuffer("general", "MonomorphicInSamples", false) + addToBuffer("general", "NotFiltered", false) + addToBuffer("general", "PointEvent", false) + addToBuffer("general", "PolymorphicInSamples", false) + addToBuffer("general", "SimpleDeletion", false) + addToBuffer("general", "SimpleInsertion", false) + addToBuffer("general", "SNP", false) + addToBuffer("general", "StructuralIndel", false) + addToBuffer("general", "Symbolic", false) + addToBuffer("general", "SymbolicOrSV", false) + addToBuffer("general", "Variant", false) + + val skipTags = List("QUAL", "general") + + for (tag <- additionalTags if !skipTags.contains(tag)) { + 
addToBuffer(tag, 0, found = false) + } + + Map("total" -> buffer.toMap) + } + /** Function to check all general stats, all info expect sample/genotype specific stats */ protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = { val buffer = mutable.Map[String, Map[Any, Int]]() @@ -470,6 +532,42 @@ object VcfStats extends ToolCommand { Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap) } + protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() + + def addToBuffer(key: String, value: Any, found: Boolean): Unit = { + val map = buffer.getOrElse(key, Map()) + if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + else buffer += key -> (map + (value -> map.getOrElse(value, 0))) + } + + buffer += "DP" -> Map("not set" -> 1) + buffer += "GQ" -> Map("not set" -> 1) + + addToBuffer("general", "Total", found = true) + addToBuffer("general", "Het", false) + addToBuffer("general", "HetNonRef", false) + addToBuffer("general", "Hom", false) + addToBuffer("general", "HomRef", false) + addToBuffer("general", "HomVar", false) + addToBuffer("general", "Mixed", false) + addToBuffer("general", "NoCall", false) + addToBuffer("general", "NonInformative", false) + addToBuffer("general", "Available", false) + addToBuffer("general", "Called", false) + addToBuffer("general", "Filtered", false) + addToBuffer("general", "Variant", false) + + val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general") + + for (tag <- additionalTags if !skipTags.contains(tag)) { + addToBuffer(tag, 0, found = false) + } + + Map("total" -> buffer.toMap) + + } + /** Function to check sample/genotype specific stats */ protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = { 
val buffer = mutable.Map[String, Map[Any, Int]]() diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala index 1c40d228d200623e56b618e279763f96304ceb99..01c17027db3821eb82d800b04c3267d37fd85214 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala @@ -34,28 +34,28 @@ object VcfToTsv extends ToolCommand { class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(inputFile = x) - } + } text "Input vcf file" opt[File]('o', "outputFile") maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(outputFile = x) } text "output file, default to stdout" opt[String]('f', "field") unbounded () action { (x, c) => c.copy(fields = x :: c.fields) - } + } text "Genotype field to use" valueName "Genotype field name" opt[String]('i', "info_field") unbounded () action { (x, c) => c.copy(infoFields = x :: c.infoFields) - } + } text "Info field to use" valueName "Info field name" opt[Unit]("all_info") unbounded () action { (x, c) => c.copy(allInfo = true) - } + } text "Use all info fields in the vcf header" opt[Unit]("all_format") unbounded () action { (x, c) => c.copy(allFormat = true) - } + } text "Use all genotype fields in the vcf header" opt[String]('s', "sample_field") unbounded () action { (x, c) => c.copy(sampleFields = x :: c.sampleFields) - } + } text "Genotype fields to use in the tsv file" opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) => c.copy(disableDefaults = true) - } + } text "Don't output the default columns from the vcf file" opt[String]("separator") maxOccurs 1 action { (x, c) => c.copy(separator = x) } text "Optional separator. 
Default is tab-delimited" diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala index e47551599e2b486a603d71707bce4c6eaaa26745..6b3ec58aea9852897f5ca39ad8d07dd3cf55951d 100644 --- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala +++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala @@ -177,10 +177,10 @@ object VcfWithVcf extends ToolCommand { def getSecondaryRecords(secondaryReader: VCFFileReader, record: VariantContext, matchAllele: Boolean): List[VariantContext] = { if (matchAllele) { - secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList. - filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele)) + secondaryReader.query(record.getContig, record.getStart, record.getEnd). + filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele)).toList } else { - secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList + secondaryReader.query(record.getContig, record.getStart, record.getEnd).toIterable.toList } } diff --git a/biopet-utils/pom.xml b/biopet-utils/pom.xml index 0ce26945050bd58a48420965b6e05b9e41267480..55bc64cc023bf4b34943bcc8db9bd25710f8167e 100644 --- a/biopet-utils/pom.xml +++ b/biopet-utils/pom.xml @@ -66,7 +66,7 @@ <dependency> <groupId>com.github.samtools</groupId> <artifactId>htsjdk</artifactId> - <version>1.141</version> + <version>2.4.1</version> </dependency> <dependency> <groupId>org.scala-lang</groupId> diff --git a/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/Logging.scala b/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/Logging.scala index d9497e96645d3d6b528f5014a7a9da13d87f73c5..c59d3ac906c5e80f7279adb6a0b6c5ce2d33aca0 100644 --- a/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/Logging.scala +++ b/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/Logging.scala @@ -39,6 +39,7 @@ object Logging { def 
addError(error: String, debug: String = null): Unit = { val msg = error + (if (debug != null && logger.isDebugEnabled) "; " + debug else "") + logger.error(msg) errors.append(new Exception(msg)) } diff --git a/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ToolCommand.scala b/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ToolCommand.scala index 903168917e2a644528e7eb5fd5cdcaed2e6ea9fc..d1bd6842f6b5cac0c8b1c19bc60d96d73228f88a 100644 --- a/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ToolCommand.scala +++ b/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/ToolCommand.scala @@ -35,7 +35,7 @@ trait ToolCommand extends MainCommand with Logging { case "error" => logger.setLevel(org.apache.log4j.Level.ERROR) case _ => } - } text "Log level" validate { + } text "Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'" validate { case "debug" | "info" | "warn" | "error" => success case _ => failure("Log level must be <debug/info/warn/error>") } diff --git a/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala b/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala index 14f1e381e9961bcb9d5cae54dd46f95b9faf124e..ecd8fb0c7625fae59ba42bacb81f525750ee7578 100644 --- a/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala +++ b/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala @@ -24,6 +24,7 @@ import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsView import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics import nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig import nl.lumc.sasc.biopet.pipelines.mapping.MultisampleMappingTrait +import nl.lumc.sasc.biopet.utils.Logging import nl.lumc.sasc.biopet.utils.config._ import org.broadinstitute.gatk.queue.QScript @@ -49,7 +50,8 @@ class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait "samtoolsview" -> Map( "h" -> true, "b" -> true - ) + ), + "macs2callpeak" -> Map("fileformat" -> "") ) def 
summaryFile = new File(outputDir, "Carp.summary.json") @@ -91,6 +93,7 @@ class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait macs2.treatment = preProcessBam.get macs2.name = Some(sampleId) macs2.outputdir = sampleDir + File.separator + "macs2" + File.separator + sampleId + File.separator + macs2.fileformat = if (paired) Some("BAMPE") else Some("BAM") add(macs2) } } @@ -102,6 +105,13 @@ class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait Some(carp) } + lazy val paired: Boolean = { + val notPaired = samples.forall(_._2.libraries.forall(_._2.inputR2.isEmpty)) + val p = samples.forall(_._2.libraries.forall(_._2.inputR2.isDefined)) + if (!notPaired && !p) Logging.addError("Combination of Paired-end and Single-end detected, this is not allowed in Carp") + p + } + override def init() = { super.init() // ensure that no samples are called 'control' since that is our reserved keyword @@ -119,6 +129,7 @@ class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait macs2.treatment = sample.preProcessBam.get macs2.control = samples(controlId).preProcessBam.get macs2.name = Some(sampleId + "_VS_" + controlId) + macs2.fileformat = if (paired) Some("BAMPE") else Some("BAM") macs2.outputdir = sample.sampleDir + File.separator + "macs2" + File.separator + macs2.name.get + File.separator add(macs2) } diff --git a/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala b/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala index 5d8bbd8ad77129961813b0e328e8df27bce28b00..3ba85bccab141d308714e0dd1ec109262f2bc834 100644 --- a/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala +++ b/carp/src/test/scala/nl/lumc/sasc/biopet/pipelines/carp/CarpTest.scala @@ -21,7 +21,7 @@ import nl.lumc.sasc.biopet.utils.config.Config import nl.lumc.sasc.biopet.extensions.bwa.BwaMem import nl.lumc.sasc.biopet.extensions.macs2.Macs2CallPeak import nl.lumc.sasc.biopet.extensions.picard.{ 
MergeSamFiles, SortSam } -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import org.apache.commons.io.FileUtils import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers @@ -65,13 +65,15 @@ class CarpTest extends TestNGSuite with Matchers { } if (!sample1 && !sample2 && !sample3 && !threatment && !control) { // When no samples - intercept[IllegalArgumentException] { + intercept[IllegalStateException] { initPipeline(map).script() } + Logging.errors.clear() } else if (threatment && !control) { // If control of a samples does not exist in samples intercept[IllegalStateException] { initPipeline(map).script() } + Logging.errors.clear() } else { // When samples are correct val carp = initPipeline(map) carp.script() diff --git a/docs/developer/code-style.md b/docs/developer/code-style.md index 8e20b7747d886b98b3ada34d13e28d355298f2cd..2b82f5139871a9be8237a04f10c85fc5866bcff1 100644 --- a/docs/developer/code-style.md +++ b/docs/developer/code-style.md @@ -22,7 +22,7 @@ class extractReads {} ``` -- Avoid using `null`, the Option `type` in Scala can be used instead +- Avoid using `null`; Scala's `Option` type should be used instead ```scala // correct: diff --git a/docs/developer/example-depends.md b/docs/developer/example-depends.md new file mode 100644 index 0000000000000000000000000000000000000000..da26d0a11b2a04672044f46f68fcd9203b6eb524 --- /dev/null +++ b/docs/developer/example-depends.md @@ -0,0 +1,186 @@ +# Developer - Using Biopet as a dependency for your own project + +You can use Biopet as a library for your own Scala project. +This can be useful if you want to make your own pipeline that you don't want to add back upstream. + +## Prerequisites + +At bare minimum you will need: + +* Java 8 +* Scala 2.10 or higher +* SBT or Maven + +We highly recommend you to use an IDE such as IntelliJ IDEA for development. 
+ +### Maven dependencies + +If you decide to use Maven, you should clone the [GATK public github repository](https://github.com/broadgsa/gatk). +You should use GATK 3.6. +After cloning GATK 3.6, run the following in a terminal + +`mvn clean install` + +You should perform the same steps for [Biopet](https://github.com/biopet/biopet). This document assumes you are working with Biopet 0.7 or higher. + + +### SBT dependencies + +You can develop biopet pipelines with SBT as well. However, since GATK uses Maven, you will still need to install GATK +into your local Maven repository with `mvn install`. + +After this, you can create a regular `build.sbt` file in the project root directory. In addition to the regular +SBT settings, you will also need to make SBT aware of the local GATK Maven installation you just did. This can be done +by adding a new resolver object: + +``` +resolvers += { + val repo = new IBiblioResolver + repo.setM2compatible(true) + repo.setName("localhost") + repo.setRoot(s"file://${Path.userHome.absolutePath}/.m2/repository") + repo.setCheckconsistency(false) + new RawRepository(repo) +} +``` + +Having set this, you can then add specific biopet modules as your library dependency. Here is one example that adds +the Flexiprep version 0.7.0 dependency: + +``` +libraryDependencies ++= Seq( + "nl.lumc.sasc" % "Flexiprep" % "0.7.0" +) +``` + +In some cases, there may be a conflict with the `org.reflections` package used (this is a transitive dependency of +GATK). If you encounter this, we recommend forcing the version to 0.9.9-RC1 like so: + +``` +libraryDependencies ++= Seq( + "org.reflections" % "reflections" % "0.9.9-RC1" force() +) +``` + +## Project structure + +You should follow typical Scala folder structure. Ideally your IDE will handles this for you. +An example structure looks like: + +``` +. 
+├── pom.xml +├── src +│  ├── main +│  │  ├── resources +│  │  │  └── path +│  │  │  └── to +│  │  │  └── your +│  │  │  └── myProject +│  │  │  └── a_resource.txt +│  │  └── scala +│  │  └── path +│  │  └── to +│  │  └── your +│  │  └── myProject +│  │  └── MyProject.scala +│  └── test +│  ├── resources +│  └── scala +│  └── path +│  └── to +│  └── your +│  └── MyProject +│  └── MyProjectTest.scala + +``` + +## POM + +(skip this section if using SBT) + +When using Biopet, your Maven pom.xml file should at minimum contain the following dependency: + +```xml + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetCore</artifactId> + <version>0.7.0</version> + </dependency> + </dependencies> +``` + +In case you want to use a specific pipeline you want to add this to your dependencies. E.g. + +```xml + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetCore</artifactId> + <version>0.7.0</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Shiva</artifactId> + <version>0.7.0</version> + </dependency> + </dependencies> +``` + +For a complete example pom.xml see [here](../examples/pom.xml). + + +## SBT build + +You can use SBT to build a fat JAR that contains all the required class files in a single JAR file. This can be done +using the [sbt-assembly plugin](https://github.com/sbt/sbt-assembly). Keep in mind that you have to explicitly define a specific merge strategy for conflicting +file names. 
In our experience, the merge strategy below works quite well: + +``` +assemblyMergeStrategy in assembly := { + case "git.properties" => MergeStrategy.first + // Discard the GATK's queueJobReport.R and use the one from Biopet + case PathList("org", "broadinstitute", "gatk", "queue", "util", x) if x.endsWith("queueJobReport.R") + => MergeStrategy.first + case "GATKText.properties" => MergeStrategy.first + case "dependency_list.txt" => MergeStrategy.discard + case other => MergeStrategy.defaultMergeStrategy(other) +} +``` + +## New pipeline + +To create a new pipeline in your project you need a class that extends from `Qscript` and `SummaryQScript`. + +E.g.: + +```scala +class MyProject(val root: Configurable) extends Qscript with SummaryQScript { + + def init(): Unit = {} + + def biopetScript(): Unit = {} // pipeline code here + + def summarySettings = Map() + def summaryFiles = Map() + def summaryStats = Map() + + def summaryFile: File = new File() + +} +``` + +To make your pipeline runnable from the command line, you need to add a one line object: + +```scala +object MyProject extends PipelineCommand +``` + +When you build your jar, you can then simply use: + +``` +java -jar MyProject.jar -config some_config.yml <other arguments> +``` + +This jar comes with all standard biopet arguments. 
\ No newline at end of file diff --git a/docs/examples/pom.xml b/docs/examples/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..e4823166ae37a9ea98f60ac2013dafc1606faae2 --- /dev/null +++ b/docs/examples/pom.xml @@ -0,0 +1,281 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>path.to.your</groupId> + <artifactId>MyProject</artifactId> + <version>YourVersion1.0</version> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <scoverage.plugin.version>1.0.4</scoverage.plugin.version> + <sting.shade.phase>package</sting.shade.phase> + <app.main.class>path.to.your.MyProject</app.main.class> + </properties> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetCore</artifactId> + <version>0.7.0</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>Shiva</artifactId> + <version>0.7.0</version> + </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.10</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.json4s</groupId> + <artifactId>json4s-native_2.10</artifactId> + <version>3.3.0</version> + </dependency> + </dependencies> + + <build> + <sourceDirectory>${basedir}/src/main/scala</sourceDirectory> + <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory> + <testResources> + <testResource> + <directory>${basedir}/src/test/resources</directory> + <includes> + <include>**/*</include> + </includes> + </testResource> + </testResources> + <resources> + 
<resource> + <directory>${basedir}/src/main/resources</directory> + <includes> + <include>**/*</include> + </includes> + </resource> + </resources> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.4.1</version> + <configuration> + <!--suppress MavenModelInspection --> + <finalName>Magpie-${project.version}-${git.commit.id.abbrev}</finalName> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <manifestEntries> + <Main-Class>${app.main.class}</Main-Class> + <!--suppress MavenModelInspection --> + <X-Compile-Source-JDK>${maven.compile.source}</X-Compile-Source-JDK> + <!--suppress MavenModelInspection --> + <X-Compile-Target-JDK>${maven.compile.target}</X-Compile-Target-JDK> + </manifestEntries> + </transformer> + </transformers> + <filters> + </filters> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.18.1</version> + <configuration> + <forkCount>1C</forkCount> + <workingDirectory>${project.build.directory}</workingDirectory> + </configuration> + </plugin> + <plugin> + <artifactId>maven-dependency-plugin</artifactId> + <version>2.10</version> + <executions> + <execution> + <id>copy-installed</id> + <phase>prepare-package</phase> + <goals> + <goal>list</goal> + </goals> + <configuration> + <outputFile>${project.build.outputDirectory}/dependency_list.txt</outputFile> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>3.2.0</version> + <executions> + <execution> + <id>scala-compile</id> + <goals> + <goal>compile</goal> + <goal>testCompile</goal> + </goals> + <configuration> + <args> + 
<arg>-dependencyfile</arg> + <arg>${project.build.directory}/.scala_dependencies</arg> + <arg>-deprecation</arg> + <arg>-feature</arg> + </args> + </configuration> + </execution> + </executions> + <!-- ... (see other usage or goals for details) ... --> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.5</version> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + <configuration> + <archive> + <manifest> + <addDefaultImplementationEntries>true</addDefaultImplementationEntries> + <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries> + </manifest> + </archive> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <showDeprecation>true</showDeprecation> + </configuration> + </plugin> + <plugin> + <groupId>org.scalariform</groupId> + <artifactId>scalariform-maven-plugin</artifactId> + <version>0.1.4</version> + <executions> + <execution> + <phase>process-sources</phase> + <goals> + <goal>format</goal> + </goals> + <configuration> + <rewriteArrowSymbols>false</rewriteArrowSymbols> + <alignParameters>true</alignParameters> + <alignSingleLineCaseStatements_maxArrowIndent>40 + </alignSingleLineCaseStatements_maxArrowIndent> + <alignSingleLineCaseStatements>true</alignSingleLineCaseStatements> + <compactStringConcatenation>false</compactStringConcatenation> + <compactControlReadability>false</compactControlReadability> + <doubleIndentClassDeclaration>false</doubleIndentClassDeclaration> + <formatXml>true</formatXml> + <indentLocalDefs>false</indentLocalDefs> + <indentPackageBlocks>true</indentPackageBlocks> + <indentSpaces>2</indentSpaces> + <placeScaladocAsterisksBeneathSecondAsterisk>false + </placeScaladocAsterisksBeneathSecondAsterisk> + 
<preserveDanglingCloseParenthesis>true</preserveDanglingCloseParenthesis> + <preserveSpaceBeforeArguments>false</preserveSpaceBeforeArguments> + <rewriteArrowSymbols>false</rewriteArrowSymbols> + <spaceBeforeColon>false</spaceBeforeColon> + <spaceInsideBrackets>false</spaceInsideBrackets> + <spaceInsideParentheses>false</spaceInsideParentheses> + <spacesWithinPatternBinders>true</spacesWithinPatternBinders> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>pl.project13.maven</groupId> + <artifactId>git-commit-id-plugin</artifactId> + <version>2.1.10</version> + <executions> + <execution> + <goals> + <goal>revision</goal> + </goals> + </execution> + </executions> + <configuration> + <prefix>git</prefix> + <dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat> + <verbose>false</verbose> + <dotGitDirectory>${basedir}/../../.git</dotGitDirectory> + <useNativeGit>true</useNativeGit> + <skipPoms>false</skipPoms> + <generateGitPropertiesFile>true</generateGitPropertiesFile> + <generateGitPropertiesFilename>src/main/resources/git.properties</generateGitPropertiesFilename> + <failOnNoGitDirectory>false</failOnNoGitDirectory> + <abbrevLength>8</abbrevLength> + <skip>false</skip> + <gitDescribe> + <skip>false</skip> + <always>false</always> + <abbrev>8</abbrev> + <dirty>-dirty</dirty> + <forceLongFormat>false</forceLongFormat> + </gitDescribe> + </configuration> + </plugin> + <plugin> + <groupId>com.mycila</groupId> + <artifactId>license-maven-plugin</artifactId> + <version>2.6</version> + <configuration> + <excludes> + <exclude>**/*git*</exclude> + <exclude>**/*.bam</exclude> + <exclude>**/*.bai</exclude> + <exclude>**/*.gtf</exclude> + <exclude>**/*.fq</exclude> + <exclude>**/*.sam</exclude> + <exclude>**/*.bed</exclude> + <exclude>**/*.refFlat</exclude> + <exclude>**/*.R</exclude> + <exclude>**/*.rscript</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.scoverage</groupId> + 
<artifactId>scoverage-maven-plugin</artifactId> + <version>${scoverage.plugin.version}</version> + <configuration> + <scalaVersion>2.10.2</scalaVersion> + <!-- other parameters --> + </configuration> + </plugin> + </plugins> + </build> + <reporting> + <plugins> + <plugin> + <groupId>org.scoverage</groupId> + <artifactId>scoverage-maven-plugin</artifactId> + <version>${scoverage.plugin.version}</version> + </plugin> + </plugins> + </reporting> +</project> \ No newline at end of file diff --git a/docs/general/config.md b/docs/general/config.md index e9b9d5e78b2140bcc3029ee9cc4fa76278127148..1e999dae200e59b4d73cd824379d335c65870ff5 100644 --- a/docs/general/config.md +++ b/docs/general/config.md @@ -138,4 +138,4 @@ During execution, biopet framework will resolve the value for each ConfigNamespa ### JSON validation To check if the created JSON file is correct their are several possibilities: the simplest way is using [this](http://jsonformatter.curiousconcept.com/) -website. It is also possible to use Python, Scala or any other programming languages for validating JSON files but this requires some more knowledge. \ No newline at end of file +website. It is also possible to use Python, Scala or any other programming languages for validating JSON files but this requires some more knowledge. diff --git a/docs/pipelines/gears.md b/docs/pipelines/gears.md index 2e4cffde83657aae033d299272420f5c4deda3f8..9d3d71f1db716594b2960986573d0d07228b5def 100644 --- a/docs/pipelines/gears.md +++ b/docs/pipelines/gears.md @@ -10,6 +10,7 @@ Pipeline analysis components include: - [Kraken, DerrickWood](https://github.com/DerrickWood/kraken) - [Qiime closed reference](http://qiime.org) + - [Qiime open reference](http://qiime.org) - [Qiime rtax](http://qiime.org) (**Experimental**) - SeqCount (**Experimental**) @@ -23,6 +24,7 @@ This pipeline is used to analyse a group of samples. 
This pipeline only accepts | --- | ---- | ------- | -------- | | gears_use_kraken | Boolean | true | Run fastq file with kraken | | gears_use_qiime_closed | Boolean | false | Run fastq files with qiime with the closed reference module | +| gears_use_qiime_open | Boolean | false | Run fastq files with qiime with the open reference module | | gears_use_qiime_rtax | Boolean | false | Run fastq files with qiime with the rtax module | | gears_use_seq_count | Boolean | false | Produces raw count files | @@ -75,6 +77,7 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the | --- | ---- | ------- | -------- | | gears_use_kraken | Boolean | true | Run fastq file with kraken | | gears_use_qiime_closed | Boolean | false | Run fastq files with qiime with the closed reference module | +| gears_use_qiime_open | Boolean | false | Run fastq files with qiime with the open reference module | | gears_use_qiime_rtax | Boolean | false | Run fastq files with qiime with the rtax module | | gears_use_seq_count | Boolean | false | Produces raw count files | diff --git a/docs/pipelines/gentrap.md b/docs/pipelines/gentrap.md index baceae70e90ee1c5441fae1b48844ab99b8f5959..cd29f9b95cf20ad378d3e2c04fb29618647229e2 100644 --- a/docs/pipelines/gentrap.md +++ b/docs/pipelines/gentrap.md @@ -9,6 +9,7 @@ At the moment, Gentrap supports the following aligners: 1. [GSNAP](http://research-pub.gene.com/gmap/) 2. [TopHat](http://ccb.jhu.edu/software/tophat/index.shtml) 3. [Star](https://github.com/alexdobin/STAR/releases) +4. [Hisat2](https://ccb.jhu.edu/software/hisat2/index.shtml) and the following quantification modes: @@ -31,48 +32,34 @@ To get help creating the appropriate [configs](../general/config.md) please refe Samples are single experimental units whose expression you want to measure. 
They usually consist of a single sequencing library, but in some cases (for example when the experiment demands each sample have a minimum library depth) a single sample may contain multiple sequencing libraries as well. All this is can be configured using the correct JSON nesting, with the following pattern: -~~~ json -{ - "samples": { - "sample_A": { - "libraries": { - "lib_01": { - "R1": "/absolute/path/to/first/read/pair.fq", - "R2": "/absolute/path/to/second/read/pair.fq" - } - } - } - } -} +~~~ yaml +--- + samples: + sample_A: + libraries: + lib_01: + R1: "/absolute/path/to/first/read/pair.fq" + R2: "/absolute/path/to/second/read/pair.fq" ~~~ In the example above, there is one sample (named `sample_A`) which contains one sequencing library (named `lib_01`). The library itself is paired end, with both `R1` and `R2` pointing to the location of the files in the file system. A more complicated example is the following: -~~~ json -{ - "samples": { - "sample_X": { - "libraries": { - "lib_one": { - "R1": "/absolute/path/to/first/read/pair.fq", - "R2": "/absolute/path/to/second/read/pair.fq" - } - } - }, - "sample_Y": { - "libraries": { - "lib_one": { - "R1": "/absolute/path/to/first/read/pair.fq", - "R2": "/absolute/path/to/second/read/pair.fq" - }, - "lib_two": { - "R1": "/absolute/path/to/first/read/pair.fq", - "R2": "/absolute/path/to/second/read/pair.fq" - } - } - } - } -} +~~~ yaml +--- + samples: + sample_X: + libraries: + lib_one: + R1: "/absolute/path/to/first/read/pair.fq" + R2: "/absolute/path/to/second/read/pair.fq" + sample_Y: + libraries: + lib_one: + R1: "/absolute/path/to/first/read/pair.fq" + R2: "/absolute/path/to/second/read/pair.fq" + lib_two: + R1: "/absolute/path/to/first/read/pair.fq" + R2: "/absolute/path/to/second/read/pair.fq" ~~~ In this case, we have two samples (`sample_X` and `sample_Y`) and `sample_Y` has two different libraries (`lib_one` and `lib_two`). 
Notice that the names of the samples and libraries may change, but several keys such as `samples`, `libraries`, `R1`, and `R2` remain the same. @@ -83,8 +70,8 @@ In this case, we have two samples (`sample_X` and `sample_Y`) and `sample_Y` has For the pipeline settings, there are some values that you need to specify while some are optional. Required settings are: 1. `output_dir`: path to output directory (if it does not exist, Gentrap will create it for you). -2. `aligner`: which aligner to use (`gsnap` or `tophat`) -3. `reference_fasta`: this must point to a reference FASTA file and in the same directory, there must be a `.dict` file of the FASTA file. +2. `aligner`: which aligner to use (`gsnap`, `tophat`, `hisat2`, `star` or `star-2pass`). `star-2pass` enables the 2-pass mapping option of STAR, for the most sensitive novel junction discovery. For more, please refer to [STAR user Manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf) +3. `reference_fasta`: this must point to a reference FASTA file and in the same directory, there must be a `.dict` file of the FASTA file. If the `.dict` file does not exist, you can create it using: ```` java -jar <picard jar> CreateSequenceDictionary R=<reference.fasta> O=<outputDict> ```` 4. `expression_measures`: this entry determines which expression measurement modes Gentrap will do. You can choose zero or more from the following: `fragments_per_gene`, `base_counts`, `cufflinks_strict`, `cufflinks_guided` and/or `cufflinks_blind`. If you only wish to align, you can set the value as an empty list (`[]`). 5. `strand_protocol`: this determines whether your library is prepared with a specific stranded protocol or not. There are two protocols currently supported now: `dutp` for dUTP-based protocols and `non_specific` for non-strand-specific protocols. 6. `annotation_refflat`: contains the path to an annotation refFlat file of the entire genome @@ -98,16 +85,29 @@ While optional settings are: 5. 
`call_variants`: whether to call variants on the RNA-seq data or not, defaults to `false`. Thus, an example settings configuration is as follows: +~~~ yaml +--- + output_dir: "/path/to/output/dir" + expression_measures: + - "fragments_per_gene" + - "bases_per_gene" + strand_protocol: "dutp" + reference_fasta: "/path/to/reference/fastafile" + annotation_gtf: "/path/to/gtf" + annotation_refflat: "/path/to/refflat" +~~~ -~~~ json -{ - "output_dir": "/path/to/output/dir", - "expression_measures": ["fragments_per_gene", "bases_per_gene"], - "strand_protocol": "dutp", - "reference_fasta": "/path/to/reference/fastafile", - "annotation_gtf": "/path/to/gtf", - "annotation_refflat": "/path/to/refflat", -} +#### Best practice example +If you are unsure of how to use the numerous options of gentrap, please refer to the following best practice configuration file example. +~~~ yaml +--- + output_dir: "/path/to/output/dir" + aligner: "gsnap" + reference_fasta: "/path/to/reference/fastafile" + expression_measures: + - "fragments_per_gene" + strand_protocol: "dutp" + annotation_refflat: "/path/to/refflat" ~~~ #### Example configurations @@ -125,7 +125,7 @@ biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -ru You can also use the `biopet` environment module (recommended) when you are running the pipeline in SHARK: ~~~ bash -$ module load biopet/v0.5.0 +$ module load biopet/v0.7.0 $ biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run ~~~ diff --git a/docs/pipelines/mapping.md b/docs/pipelines/mapping.md index 79d2a57494ca3fb24e369515dcfed47b883bf556..0e9efaac19c34ef974e54502eb9353b754bfeaad 100644 --- a/docs/pipelines/mapping.md +++ b/docs/pipelines/mapping.md @@ -16,6 +16,7 @@ After the QC, the pipeline simply maps the reads with the chosen aligner. 
The re * <a href="http://www.well.ox.ac.uk/project-stampy" target="_blank">Stampy</a> * <a href="http://research-pub.gene.com/gmap/" target="_blank">Gsnap</a> * <a href="https://ccb.jhu.edu/software/tophat" target="_blank">TopHat</a> + * <a href="https://ccb.jhu.edu/software/hisat2/index.shtml" target="_blank">Hisat2</a> * <a href="https://github.com/alexdobin/STAR" target="_blank">Star</a> * <a href="https://github.com/alexdobin/STAR" target="_blank">Star-2pass</a> * <a href="http://broadinstitute.github.io/picard/" target="_blank">Picard tool suite</a> @@ -48,7 +49,7 @@ All other values should be provided in the config. Specific config values toward | ---- | ---- | -------- | | output_dir | Path (**required**) | directory for output files | | reference_fasta | Path (**required**) | Path to indexed fasta file to be used as reference | -| aligner | String (optional) | Which aligner to use. Defaults to `bwa`. Choose from [`bwa`, `bwa-aln`, `bowtie`, `gsnap`, `tophat`, `stampy`, `star`, `star-2pass`] | +| aligner | String (optional) | Which aligner to use. Defaults to `bwa`. 
Choose from [`bwa`, `bwa-aln`, `bowtie`, `gsnap`, `tophat`, `stampy`, `star`, `star-2pass`, `hisat2`] | | skip_flexiprep | Boolean (optional) | Whether to skip the flexiprep QC step (default = False) | | skip_markduplicates | Boolean (optional) | Whether to skip the Picard Markduplicates step (default = False) | | skip_metrics | Boolean (optional) | Whether to skip the metrics gathering step (default = False) | diff --git a/docs/pipelines/tinycap.md b/docs/pipelines/tinycap.md index 9d9d374adb82aa77686bf540e0883ffdcecbaa63..8ba9a524b04598ee5b8a5e9a293f01495438c9ed 100644 --- a/docs/pipelines/tinycap.md +++ b/docs/pipelines/tinycap.md @@ -55,35 +55,23 @@ One can specify other options such as: `bowtie` (alignment) options, clipping an "chunkmbs": 256, # this is a performance option, keep it high (256) as many alternative alignments are possible "seedmms": 3, "seedlen": 25, - "k": 5, # take and report best 5 alignments - "best": true # sort by best hit + "k": 3, # take and report best 3 alignments + "best": true, # sort by best hit + "strata": true # select from best strata }, "sickle": { - "lengthThreshold": 8 # minimum length to keep after trimming + "lengthThreshold": 15 # minimum length to keep after trimming }, "cutadapt": { - "error_rate": 0.2, # recommended: 0.2, allow more mismatches in adapter to be clipped of (ratio) - "minimum_length": 8, # minimum length to keep after clipping, setting lower will cause multiple alignments afterwards + "error_rate": 0.1, # recommended: 0.1, allow more mismatches in adapter to be clipped of (ratio) + "minimum_length": 15, # minimum length to keep after clipping, setting lower will cause multiple alignments afterwards "q": 30, # minimum quality over the read after the clipping in order to keep and report the read - "default_clip_mode": "both", # clip from: front/end/both (5'/3'/both). Depending on the protocol. Setting `both` makes clipping take more time, but is safest to do on short sequences such as smallRNA.
- "times": 2 # in cases of chimera reads/adapters, how many times should cutadapt try to remove am adapter-sequence + "default_clip_mode": "3", # clip from: front/end/both (5'/3'/both). Depending on the protocol. + "times": 1, # in cases of chimera reads/adapters, how many times should cutadapt try to remove am adapter-sequence + "ignore_fastqc_adapters": true # by default ignore the detected adapters by FastQC. These tend to give false positive hits for smallRNA projects. } ``` -The settings above is quite strict and aggressive on the clipping with `cutadapt`. By default the option `sensitiveAdapterSearch` is turned on in the TinyCap pipeline: - -```json -"fastqc": { - "sensitiveAdapterSearch": true -} -``` - -This setting is turned on to enable aggressive adapter trimming. e.g. found (partial) adapters in `FastQC` -are all used in `Cutadapt`. Depending on the sequencing technique and sample preparation, e.g. short -sequences (76bp - 100bp). Turning of this option will still produce sensible results. 
- - - ## Examine results ### Result files diff --git a/docs/releasenotes/release_notes_0.7.0.md b/docs/releasenotes/release_notes_0.7.0.md new file mode 100644 index 0000000000000000000000000000000000000000..869b8cd417d001aaca0f4cd364ad51aba56fd8bf --- /dev/null +++ b/docs/releasenotes/release_notes_0.7.0.md @@ -0,0 +1,38 @@ +# Release notes Biopet version 0.7.0 + +## General Code changes + +* Switched to full public licence +* Upgraded to Queue 3.6 + * Upgraded to java 8 + * Upgraded to Picard / htsjdk 2.4.1 + +## Functionality + +* [Gears](../pipelines/gears.md): Added `pick_open_reference_otus` reference module of [Qiime](http://qiime.org/) +* Fixed default aligner in [Gentrap](../pipelines/gentrap.md) to gsnap +* Make `sample` and `library id` required in [Flexiprep](../pipelines/flexiprep.md) when started from the `CLI` +* [Core] Raised some default memory limits ([#356](https://git.lumc.nl/biopet/biopet/issues/356)) +* [Carp](../pipelines/carp.md): Our MACS2 wrapper now auto-detects whether a sample is single-end or paired-end +* Added a `sort by name` step when htseq in Gentrap is executed +* Fixed file name of bam files in Carp +* VcfWithVcf now checks if chromosomes are in the correct reference +* Added sync stats to flexiprep report +* Added check in BamMetrics to check whether contigs in a given bed file are defined in the used reference-genome. +* [TinyCap](../pipelines/tinycap.md) now has validated settings for miRNA runs. Some parameters changed for alignment. +* [Flexiprep](../pipelines/flexiprep.md) now has the option to provide custom adapter sequences and to ignore adapters found by `FastQC`. +* Utils - BamUtils is now estimating insert size by sampling the bam-file taking all parts of the available contigs. +* Fix in VCF filter [#370](https://git.lumc.nl/biopet/biopet/merge_requests/370) +* Fix Star wrapper - added full functionality in wrapper.
+ +## Reporting + +* Report now corrects for secondary reads in alignment stats +* ShivaSVcalling, fixed the headers in the output vcf-files for Clever, Breakdancer and Pindel. +* [Flexiprep](../pipelines/flexiprep.md) now reports the histogram of sizes from clipped adapters. (raw json output) +* Fix reported mapping percentage in case of allowing multimapping (in RNAseq aligners) + +## Backward incompatible changes + +* Changing `namespace` in config values. [!348](https://git.lumc.nl/biopet/biopet/merge_requests/348) + The nomenclature regarding namespaces in configuration files and options has now been harmonized. Whereas this was previously variously called "submodule", "level", or "name(space)", it is now called "namespace" everywhere. This means the config function used for accessing configuration values changed argument name submodule to namespace. Any projects depending on Biopet will therefore have to refactor their usage of this function. diff --git a/docs/tools/AnnotateVcfWithBed.md b/docs/tools/AnnotateVcfWithBed.md new file mode 100644 index 0000000000000000000000000000000000000000..4a7971085045d4103d32e98c90515264328c0ea2 --- /dev/null +++ b/docs/tools/AnnotateVcfWithBed.md @@ -0,0 +1,39 @@ +# AnnotateVcfWithBed + +## Introduction + This tool to annotates a vcf file using the input from a bed file + +## Example +To get the help menu: +~~~ +Usage: AnnotateVcfWithBed [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <vcf file> | --inputFile <vcf file> + Input is a required file property + -B <bed file> | --bedFile <bed file> + Bedfile is a required file property + -o <vcf file> | --output <vcf file> + out is a required file property + -f <name of field in vcf file> | --fieldName <name of field in vcf file> + Name of info field in new vcf file + -d <name of field in vcf file> | --fieldDescription <name of field in vcf file> + Description of field in new vcf file + -t <name of 
field in vcf file> | --fieldType <name of field in vcf file> + Description of field in new vcf file +~~~ + +To run the tool use: +~~~ +biopet tool AnnotateVcfWithBed -I myVcf.vcf -B myBed.bed -o myannotatedVcf.vcf +~~~ + + +## Results +The result of this tool will be a vcf file with an extra field with annotation + diff --git a/docs/tools/BaseCounter.md b/docs/tools/BaseCounter.md new file mode 100644 index 0000000000000000000000000000000000000000..7bceca3364a1023c1bf40cdf65b5857c5f8819ff --- /dev/null +++ b/docs/tools/BaseCounter.md @@ -0,0 +1,26 @@ +# BaseCounter + +## Introduction +This tool will generate Base count based on a bam file and a refflat file + +## Example +Help menu +~~~~ +Usage: BaseCounter [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -r <file> | --refFlat <file> + refFlat file. Mandatory + -o <directory> | --outputDir <directory> + Output directory. Mandatory + -b <file> | --bam <file> + Bam file. Mandatory + -p <prefix> | --prefix + + ~~~~ + diff --git a/docs/tools/CheckAllelesVcfInBam.md b/docs/tools/CheckAllelesVcfInBam.md index 476f8a60c6df23b959a14af8323e11439940595f..f3b633d11e9b05da3f7e1434bc7c0f3114853ade 100644 --- a/docs/tools/CheckAllelesVcfInBam.md +++ b/docs/tools/CheckAllelesVcfInBam.md @@ -1,7 +1,7 @@ # CheckAllelesVcfInBam ## Introduction -This tool has been written to check the allele frequency in BAM files. +This tool has been written to check the allele frequency in BAM files. 
This is meant for comparison with the allele frequency reported at the VCF file ## Example To get the help menu: @@ -16,14 +16,15 @@ Usage: CheckAllelesVcfInBam [options] -v | --version Print version -I <file> | --inputFile <file> - + VCF file -o <file> | --outputFile <file> - + output VCF file name -s <value> | --sample <value> - + sample name -b <value> | --bam <value> - + bam file, from which the variants (VCF files) were called -m <value> | --min_mapping_quality <value> + minimum mapping quality score for a read to be taken into account ~~~ To run the tool: diff --git a/docs/tools/ExtractAlignedFastq.md b/docs/tools/ExtractAlignedFastq.md index c1f069e6bab80ae5a1d3cfe760121e778289e3c4..412fda6a65df71fb3a1a6358815890eb453bba47 100644 --- a/docs/tools/ExtractAlignedFastq.md +++ b/docs/tools/ExtractAlignedFastq.md @@ -23,7 +23,7 @@ Usage: ExtractAlignedFastq [options] -I <bam> | --input_file <bam> Input BAM file -r <interval> | --interval <interval> - Interval strings + Interval strings (e.g. chr1:1-100) -i <fastq> | --in1 <fastq> Input FASTQ file 1 -j <fastq> | --in2 <fastq> @@ -35,15 +35,20 @@ Usage: ExtractAlignedFastq [options] -Q <value> | --min_mapq <value> Minimum MAPQ of reads in target region to remove (default: 0) -s <value> | --read_suffix_length <value> - Length of common suffix from each read pair (default: 0) - -This tool creates FASTQ file(s) containing reads mapped to the given alignment intervals. + Length of suffix mark from each read pair (default: 0). This is used for distinguishing read pairs with + different suffices. For example, if your FASTQ records end with `/1` for the first pair and `/2` for the + second pair, the value of `read_suffix_length` should be 2. + +This tool creates FASTQ file(s) containing reads mapped to the given alignment intervals. A set of FASTQ files that was +used in creating the BAM file is also required since this is used for retrieving full sequences of FASTQ records which +map to the given region. 
This is useful since some of the records may have undergone modifications such as quality +trimming before alignment. In this case, retrieving the aligned SAM records will only give the modified sequence. ~~~ To run the tool: ~~~ biopet tool ExtractAlignedFastq \ ---input_file myBam.bam --in1 myFastq_R1.fastq --out1 myOutFastq_R1.fastq --interval myTarget.bed +--input_file myBam.bam --in1 myFastq_R1.fastq --out1 myOutFastq_R1.fastq --interval chr5:100-200 ~~~ * Note that this tool works for single end and paired end data. The above example can be easily extended for paired end data. The only thing one should add is: `--in2 myFastq_R2.fastq --out2 myOutFastq_R2.fastq` diff --git a/docs/tools/FastqSplitter.md b/docs/tools/FastqSplitter.md index 6f89bb766ea66ee85825500dc492854e02faa949..c5cae711ba0b738add5f36c7ae111c0a30f5ef17 100644 --- a/docs/tools/FastqSplitter.md +++ b/docs/tools/FastqSplitter.md @@ -1,10 +1,8 @@ # FastqSplitter ## Introduction -This tool splits a fastq files based on the number of output files specified. So if one specifies 5 output files it will split the fastq -into 5 files. This can be very usefull if one wants to use chunking option in one of our pipelines, we can generate the exact amount of fastqs -needed for the number of chunks specified. Note that this will be automatically done inside the pipelines. - +This tool divides a fastq file into smaller fastq files, based on the number of output files specified. For example, if one specifies 5 output files it will split the fastq +into 5 files of equal size. This can be very useful if one wants to use chunking option in one of our pipelines: FastqSplitter can generate the exact number of fastq files (chunks) as needed.
This tool is used internally in our pipelines as required ## Example To get the help menu: @@ -29,7 +27,7 @@ biopet tool FastqSplitter --inputFile myFastq.fastq \ --output mySplittedFastq_1.fastq --output mySplittedFastq_2.fastq \ --output mySplittedFastq_3.fastq ~~~ -The above invocation will split the input in 3 equally divided fastq files. +The above invocation will split the input file into 3 fastq files of equal size. ## Output Multiple fastq files based on the number of outputFiles specified. \ No newline at end of file diff --git a/docs/tools/FindRepeatsPacBio.md b/docs/tools/FindRepeatsPacBio.md index 02351d091c22a6c30e59465d6ce99e3e6f14d3ea..6aebc65c2b4d9eabd1faf105690cbce42a1e0e5f 100644 --- a/docs/tools/FindRepeatsPacBio.md +++ b/docs/tools/FindRepeatsPacBio.md @@ -1,9 +1,10 @@ # FindRepeatsPacBio ## Introduction -This tool looks and annotates repeat regions inside a BAM file. It extracts the regions of interest from a bed file and then intersects -those regions with the BAM file. On those extracted regions the tool will perform a - Mpileup and counts all insertions/deletions etc. etc. for that specific location on a per read basis. +This tool searches for and annotates repeat regions inside a BAM file. +It intersect the regions provided in the bed file with the BAM file and extracts them. +On the extracted regions *samtools mpileup* will be run and all insertions, deletions or substitutions will be counted on a per read basis + ## Example diff --git a/docs/tools/MpileupToVcf.md b/docs/tools/MpileupToVcf.md index d1b3200e4f14878f22cd48168ad0f22eb9f999ed..c1666209ed4a7eeac69f1375512d0f94796a69d7 100644 --- a/docs/tools/MpileupToVcf.md +++ b/docs/tools/MpileupToVcf.md @@ -1,10 +1,10 @@ # MpileupToVcf ## Introduction -This tool enables a user to extract a VCF file out a mpileup file generated from the BAM file. -The tool can also stream through STDin and STDout so that the mpileup file is not stored on disk. 
-Mpileup files tend to be very large since they describe each covered base position in the genome on a per read basis, -so usually one does not want to safe these files. +This tool enables a user to extract a VCF file out a mpileup file generated from the BAM file using *samtools mpileup*, for instance. +The tool can also stream through STDin and STDout so that it is not necessary to store the mpileup file on disk. +Mpileup files can to be very large because they describe each covered base position in the genome on a per read basis, +so it is not desired to store them. ---- diff --git a/docs/tools/SamplesTsvToJson.md b/docs/tools/SamplesTsvToJson.md index 322b6aeddb13eebe28d7c561bb5aeb1948796bd0..b54bda47ad9f8504c74150b96cc5e239eb8d2448 100644 --- a/docs/tools/SamplesTsvToJson.md +++ b/docs/tools/SamplesTsvToJson.md @@ -1,7 +1,7 @@ # SamplesTsvToJson -This tool enables a user to create a full sample sheet in JSON format suitable for all our Queue pipelines. -The tool can be started as follows: +This tool enables a user to create a full sample sheet in JSON format, suitable for all our Queue pipelines, from TSV file(s). +The tool can be called as follows: ~~~ bash biopet tool SamplesTsvToJson @@ -27,11 +27,11 @@ Usage: SamplesTsvToJson [options] ~~~ -The tool is designed in such a way that a user can provide a TAB seperated file (TSV) with sample specific properties and even those will be parsed by the tool. -For example: a user wants to have certain properties e.g. which treatment a sample got than the user should provide a extra columns called treatment and then the -JSON file is parsed with those properties inside it as well. The order of columns does not matter. +A user provides a TAB separated file (TSV) with sample specific properties which are parsed into JSON format by the tool. +For example, a user wants to add certain properties to the description of a sample, such as the treatment a sample received. 
Then a TSV file with an extra column called treatment is provided. +The resulting JSON file will have the 'treatment' property in it as well. The order of the columns is not relevant to the end result -The tag files works the same only the value are prefixed in the key `tags`. +The tag files works the same only the value is prefixed in the key `tags`. #### Example diff --git a/docs/tools/VcfFilter.md b/docs/tools/VcfFilter.md index c9b3f064700c74181489097aaebfc6085dc8497e..1335e9b94a9f06790bf807b11a8e971ae8363fb3 100644 --- a/docs/tools/VcfFilter.md +++ b/docs/tools/VcfFilter.md @@ -1,6 +1,11 @@ # VcfFilter ## Introduction +This tool filters VCF files on a number values. For example, it can filter on sample depth and/or total depth. +It can also filter out the reference calls and/or minimum number of sample passes. +For more on filtering options and how to set them, please refer to the help menu. + + This tool enables a user to filter VCF files. For example on sample depth and/or total depth. It can also be used to filter out the reference calls and/or minimum number of sample passes. There is a wide set of options which one can use to change the filter settings. 
@@ -32,19 +37,19 @@ Usage: VcfFilter [options] Min number of samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth --minBamAlternateDepth <int> --denovoInSample <sample> - Only show variants that contain unique alleles in compete set for given sample + Only keep variants that contain unique alleles in complete set for the given sample --mustHaveVariant <sample> - Given sample must have 1 alternative allele + Only keep variants that for the given sample have an alternative allele --diffGenotype <sample:sample> - Given samples must have a different genotype - --filterHetVarToHomVar <sample:sample> - If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered + Only keep variants that for the given samples have a different genotype + --filterHetVarToHomVar <sample1:sample2> + Filter out variants that are heterozygous in sample1 and homozygous in sample2 --filterRefCalls - Filter when there are only ref calls + Filter out ref calls --filterNoCalls - Filter when there are only no calls + Filter out no calls --minQualScore <value> - Min qual score + Filter out variants with Min qual score below threshold ~~~ To run the tool: @@ -54,4 +59,4 @@ biopet tool VcfFilter --inputVcf myInput.vcf \ ~~~ ## Output -The output is a vcf file containing the filters specified values. \ No newline at end of file +The output is a vcf file containing the values that pass the user-defined filtering options diff --git a/docs/tools/VcfToTsv.md b/docs/tools/VcfToTsv.md index 59d42da464bc146cb4e76c2864fc44e507df0f03..586363fdbb24d2027bf7ad6fed38465d25b37230 100644 --- a/docs/tools/VcfToTsv.md +++ b/docs/tools/VcfToTsv.md @@ -1,10 +1,10 @@ # VcfToTsv ## Introduction -This tool enables a user to convert a vcf file to a tab delimited file (TSV). -This can be very usefull since some programs only accept a TSV for downstream analysis. -It gets rid of the vcf header and parses all data columns in a nice TSV file.
-There is also a possibility to only select some specific fields from you vcf and only parse those fields to a TSV. +Tool converts a vcf file to a Tab Separated Values (TSV) file. For every key in the INFO column of the VCF file, a separate column will be created with the corresponding values. +User can select the keys that will be parsed into the output TSV file. +This can be useful in the case a program only accepts a TSV file for downstream analysis. + ## Example To open the help menu: @@ -14,26 +14,33 @@ biopet tool VcfToTsv -h Usage: VcfToTsv [options] -l <value> | --log_level <value> - Log level + Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error' -h | --help Print usage -v | --version Print version -I <file> | --inputFile <file> - + Input vcf file -o <file> | --outputFile <file> output file, default to stdout - -f <value> | --field <value> - - -i <value> | --info_field <value> - + -f Genotype field name | --field Genotype field name + Genotype field to use + -i Info field name | --info_field Info field name + Info field to use --all_info - + Use all info fields in the vcf header --all_format - + Use all genotype fields in the vcf header -s <value> | --sample_field <value> - + Genotype fields to use in the tsv file -d | --disable_defaults + Don't output the default columns from the vcf file + --separator <value> + Optional separator. Default is tab-delimited + --list_separator <value> + Optional list separator. By default, lists are separated by a comma + --max_decimals <value> + Number of decimal places for numbers. Default is 2 ~~~ To run the tool: diff --git a/docs/tools/VepNormalizer.md b/docs/tools/VepNormalizer.md index a7cc5912596a10bfddce6462349013681baa3f9c..e5624e9a5667787073cf36a1441f37bc90514044 100644 --- a/docs/tools/VepNormalizer.md +++ b/docs/tools/VepNormalizer.md @@ -3,11 +3,12 @@ VepNormalizer Introduction ------------ -This tool normalizes a VCF file annotated with the Variant Effect Predictor (VEP). 
+This tool modifies a VCF file annotated with the Variant Effect Predictor (VEP). Since the VEP does not use INFO fields to annotate, but rather puts all its annotations in one big string inside a "CSQ" INFO tag it is necessary to normalize it. -This normalizer will use the information in the CSQ header to create INFO fields for each annotation field. -It has two modes: `standard` and `explode`. The `standard` mode will produce a VCF according to the VCF specification. +Tool will parse the information in the CSQ header to create INFO fields for each annotation field. Tool has two modes: `standard` and `explode`. + +The `standard` mode will produce a VCF according to the VCF specification. This means that every VEP INFO tag will consist of the comma-separated list of values for each transcript. In case the value is empty, the VEP INFO tag will not be shown for that specific record @@ -20,6 +21,7 @@ The CSQ tag is by default removed from the output VCF file. If one wishes to ret Example --------- +Help menu: ~~~ bash biopet tool VepNormalizer -h diff --git a/docs/tools/WipeReads.md b/docs/tools/WipeReads.md index 0263e1b7fa14c8f6884a8e6f601025137cad4e29..6c26fe2eed5757a1969c80899e61244e9a7967d2 100644 --- a/docs/tools/WipeReads.md +++ b/docs/tools/WipeReads.md @@ -1,10 +1,9 @@ # WipeReads ## Introduction -WipeReads is a tool for removing reads from indexed BAM files. -It respects pairing information and can be set to remove reads whose duplicate -maps outside of the target region. The main use case is to remove reads mapping -to known ribosomal RNA regions (using a supplied BED file containing intervals for these regions). +WipeReads is a tool for removing reads from indexed BAM files that are inside a user defined region. +It takes pairing information into account and can be set to remove reads if one of the pairs maps outside of the target region. 
+An application example is to remove reads mapping to known ribosomal RNA regions (using a supplied BED file containing intervals for these regions). ## Example To open the help menu: @@ -62,5 +61,5 @@ biopet tool WipeReads --input_file myBam.bam \ ~~~ ## Output -This tool outputs a bam file containing all the reads not inside a ribosomal region. -And optionally a bam file with only the ribosomal reads +This tool outputs a bam file containing all the reads not inside the ribosomal region. +It can optionally output a bam file with only the reads inside the ribosomal region diff --git a/docs/tools/bedtoolscoveragetocounts.md b/docs/tools/bedtoolscoveragetocounts.md index c733b818536d691dd5ea84d1b8c8081a28943688..d48af0c6ef3720da4c8cc1829d8f5f4103296dad 100644 --- a/docs/tools/bedtoolscoveragetocounts.md +++ b/docs/tools/bedtoolscoveragetocounts.md @@ -17,12 +17,13 @@ Usage: BedtoolsCoverageToCounts [options] -v | --version Print version -I <file> | --input <file> - + coverage file produced with bedtools -o <file> | --output <file> + output file name ~~~ -input: coverage file produced with bedtools -output: a count file with the counts from the the values inside the coverage file. Where values could be almost everything, e.g. +input: A coverage file produced with bedtools +output: A count file with the counts from the the values inside the coverage file. Where values could be almost everything, e.g. genes, ensemblIDs etc. etc. 
To run the tool: diff --git a/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp b/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp index fc8b8ce55816d6b45b9937157ca6ae048e8f7a3c..a79af000e41eae8b9871b5dfc40ed2a08a508e66 100644 --- a/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp +++ b/flexiprep/src/main/resources/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp @@ -19,6 +19,10 @@ val trimCount = summary.getLibraryValues("flexiprep", "settings", "skip_trim").count(_._2 == Some(false)) val clipCount = summary.getLibraryValues("flexiprep", "settings", "skip_clip").count(_._2 == Some(false)) val librariesCount = summary.samples.foldLeft(0)(_ + summary.libraries(_).size) + + val paired: Boolean = if (sampleId.isDefined && libId.isDefined) + summary.getLibraryValue(sampleId.get, libId.get, "flexiprep", "settings", "paired").get.asInstanceOf[Boolean] + else summary.getLibraryValues("flexiprep", "settings", "paired").values.exists(_ == Some(true)) }# #if (showIntro) @@ -62,9 +66,6 @@ #if (showPlot) #{ - val paired: Boolean = if (sampleId.isDefined && libId.isDefined) - summary.getLibraryValue(sampleId.get, libId.get, "flexiprep", "settings", "paired").get.asInstanceOf[Boolean] - else summary.getLibraryValues("flexiprep", "settings", "paired").values.exists(_ == Some(true)) FlexiprepReport.readSummaryPlot(outputDir, "QC_Reads_R1","R1", summary, sampleId = sampleId) if (paired) FlexiprepReport.readSummaryPlot(outputDir, "QC_Reads_R2","R2", summary, sampleId = sampleId) }# @@ -105,6 +106,7 @@ <th>Before QC</th> <th>Clipping</th> <th>Trimming</th> + #if (paired == true) <th>Out of Sync</th> #end <th>After QC</th> </tr></thead> <tbody> @@ -140,8 +142,8 @@ #for (read <- reads) #if (read == "R2") </tr><tr> #end #{ - val beforeTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read, "reads", 
"num_total") - val afterTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read + "_qc", "reads", "num_total") + val beforeTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read, "reads", "num_total").getOrElse(0).toString.toLong + val afterTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read + "_qc", "reads", "num_total").getOrElse(0).toString.toLong val clippingDiscardedToShort = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_short").getOrElse(0).toString.toLong val clippingDiscardedToLong = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_long").getOrElse(0).toString.toLong var trimmingDiscarded = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "trimming_" + read, "num_reads_discarded_total").getOrElse(0).toString.toLong @@ -150,6 +152,7 @@ <td>${beforeTotal}</td> <td>${clippingDiscardedToShort + clippingDiscardedToLong}</td> <td>${trimmingDiscarded}</td> + #if (paired == true) <td>${beforeTotal - clippingDiscardedToShort - clippingDiscardedToLong - trimmingDiscarded - afterTotal}</td> #end <td>${afterTotal}</td> #end </tr> diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala index cad37a02ac69ac0ae743aa05985a77705e35589e..14738eb16e7649272ad4783c7b1a89df1ada42b6 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala @@ -29,7 +29,15 @@ import nl.lumc.sasc.biopet.utils.config.Configurable class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) { val ignoreFastqcAdapters: Boolean = config("ignore_fastqc_adapters", default = false) - val 
customAdaptersConfig: Map[String, Any] = config("custom_adapters", default = Map.empty) + + val customAdaptersEnd: Map[String, Any] = config("custom_adapters_end", default = Map()) + adapter ++= customAdaptersEnd.values.map(_.toString) + + val customAdaptersFront: Map[String, Any] = config("custom_adapters_front", default = Map()) + front ++= customAdaptersFront.values.map(_.toString) + + val customAdaptersAny: Map[String, Any] = config("custom_adapters_any", default = Map()) + anywhere ++= customAdaptersAny.values.map(_.toString) /** Clipped adapter names from FastQC */ protected def seqToName: Map[String, String] = { @@ -42,7 +50,7 @@ class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.e } def customAdapters: Set[AdapterSequence] = { - customAdaptersConfig.flatMap(adapter => { + (customAdaptersEnd ++ customAdaptersFront ++ customAdaptersAny).flatMap(adapter => { adapter match { case (adapterName: String, sequence: String) => Some(AdapterSequence(adapterName, sequence)) diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index 1b767e903837159089fcc8eb727d004b74d86e9e..522a08bc19244ca632466bec35396573c9b84814 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -16,11 +16,11 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, SampleLibraryTag } -import nl.lumc.sasc.biopet.extensions.{ Zcat, Gzip } +import nl.lumc.sasc.biopet.extensions.{ Gzip, Zcat } import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.IoUtils._ -import nl.lumc.sasc.biopet.extensions.tools.{ ValidateFastq, SeqStat, FastqSync } - +import 
nl.lumc.sasc.biopet.extensions.tools.{ FastqSync, SeqStat, ValidateFastq } +import nl.lumc.sasc.biopet.utils.Logging import org.broadinstitute.gatk.queue.QScript class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag { @@ -74,33 +74,37 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with /** Function that's need to be executed before the script is accessed */ def init() { - require(outputDir != null, "Missing output directory on flexiprep module") - require(inputR1 != null, "Missing input R1 on flexiprep module") - require(sampleId != null, "Missing sample ID on flexiprep module") - require(libId != null, "Missing library ID on flexiprep module") + if (inputR1 == null) Logging.addError("Missing input R1 on flexiprep module") + if (sampleId == null || sampleId == None) Logging.addError("Missing sample ID on flexiprep module") + if (libId == null || libId == None) Logging.addError("Missing library ID on flexiprep module") - paired = inputR2.isDefined + if (inputR1 == null) Logging.addError("Missing input R1 on flexiprep module") + else { + paired = inputR2.isDefined - inputFiles :+= new InputFile(inputR1) - inputR2.foreach(inputFiles :+= new InputFile(_)) + inputFiles :+= new InputFile(inputR1) + inputR2.foreach(inputFiles :+= new InputFile(_)) - R1Name = getUncompressedFileName(inputR1) - inputR2.foreach { fileR2 => - paired = true - R2Name = getUncompressedFileName(fileR2) + R1Name = getUncompressedFileName(inputR1) + inputR2.foreach { fileR2 => + paired = true + R2Name = getUncompressedFileName(fileR2) + } } } /** Script to add jobs */ def biopetScript() { - runInitialJobs() + if (inputR1 != null) { + runInitialJobs() - if (paired) runTrimClip(inputR1, inputR2, outputDir) - else runTrimClip(inputR1, outputDir) + if (paired) runTrimClip(inputR1, inputR2, outputDir) + else runTrimClip(inputR1, outputDir) - val R1Files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v - val 
R2Files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v - runFinalize(R1Files.toList, R2Files.toList) + val R1Files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v + val R2Files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v + runFinalize(R1Files.toList, R2Files.toList) + } } /** Add init non chunkable jobs */ diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/QcCommand.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/QcCommand.scala index 1aaca2eb4302e9e959e88687e8659b268bb532e4..ced36e88c2cef488f05b6d876c556f69bd7c348f 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/QcCommand.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/QcCommand.scala @@ -103,13 +103,11 @@ class QcCommand(val root: Configurable, val fastqc: Fastqc) extends BiopetComman clip = if (!flexiprep.skipClip) { val cutadapt = clip.getOrElse(new Cutadapt(root, fastqc)) - val foundAdapters = if (!cutadapt.ignoreFastqcAdapters) { - fastqc.foundAdapters.map(_.seq) ++ cutadapt.customAdapters.map(_.seq) - } else { - cutadapt.customAdapters.map(_.seq) - } + val foundAdapters: Set[String] = if (!cutadapt.ignoreFastqcAdapters) { + fastqc.foundAdapters.map(_.seq) + } else Set() - if (foundAdapters.nonEmpty) { + if (foundAdapters.nonEmpty || cutadapt.adapter.nonEmpty || cutadapt.front.nonEmpty || cutadapt.anywhere.nonEmpty) { cutadapt.fastqInput = seqtk.output cutadapt.fastqOutput = new File(output.getParentFile, input.getName + ".cutadapt.fq") cutadapt.statsOutput = new File(flexiprep.outputDir, s"${flexiprep.sampleId.getOrElse("x")}-${flexiprep.libId.getOrElse("x")}.$read.clip.stats") diff --git a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala index 
c553d8075e0e8be6f86a465fcfcc7db4ad269003..276f7832206994defc61a3b0f2f5a2ee326b858a 100644 --- a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala +++ b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepTest.scala @@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import java.io.File import com.google.common.io.Files -import nl.lumc.sasc.biopet.extensions.tools.{ ValidateFastq, SeqStat } +import nl.lumc.sasc.biopet.extensions.tools.{ SeqStat, ValidateFastq } import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.config.Config import org.apache.commons.io.FileUtils @@ -83,6 +83,18 @@ class FlexiprepTest extends TestNGSuite with Matchers { } + @Test + def testNoSample: Unit = { + val map = ConfigUtils.mergeMaps(Map( + "output_dir" -> FlexiprepTest.outputDir + ), Map(FlexiprepTest.executables.toSeq: _*)) + val flexiprep: Flexiprep = initPipeline(map) + + intercept[IllegalStateException] { + flexiprep.script() + } + } + // remove temporary run directory all tests in the class have been run @AfterClass def removeTempOutputDir() = { FileUtils.deleteDirectory(FlexiprepTest.outputDir) diff --git a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala index 7e9362ffc3fc41933ed51e4f1705c9b3d44aee90..17bd86c56518d279a2139a0fc38177ee529b84a6 100644 --- a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala +++ b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala @@ -15,10 +15,11 @@ package nl.lumc.sasc.biopet.pipelines.gears import nl.lumc.sasc.biopet.core.BiopetQScript.InputFile -import nl.lumc.sasc.biopet.core.{ PipelineCommand, MultiSampleQScript } +import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand } import nl.lumc.sasc.biopet.extensions.tools.MergeOtuMaps -import nl.lumc.sasc.biopet.extensions.{ Gzip, Zcat, Ln } +import 
nl.lumc.sasc.biopet.extensions.{ Gzip, Ln, Zcat } import nl.lumc.sasc.biopet.extensions.qiime.MergeOtuTables +import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSample import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript @@ -36,6 +37,8 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs Some(gearsReport) } + override def defaults = Map("mergeotumaps" -> Map("skip_prefix" -> "New.")) + override def fixedValues = Map("gearssingle" -> Map("skip_flexiprep" -> true)) /** Init for pipeline */ @@ -52,22 +55,30 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs } def qiimeClosedDir: Option[File] = { - if (samples.values.flatMap(_.gs.qiimeClosed).nonEmpty) { + if (samples.values.flatMap(_.gearsSingle.qiimeClosed).nonEmpty) { Some(new File(outputDir, "qiime_closed_reference")) } else None + } + def qiimeOpenDir: Option[File] = { + if (samples.values.flatMap(_.gearsSingle.qiimeOpen).nonEmpty) { + Some(new File(outputDir, "qiime_open_reference")) + } else None } def qiimeClosedOtuTable: Option[File] = qiimeClosedDir.map(new File(_, "otu_table.biom")) def qiimeClosedOtuMap: Option[File] = qiimeClosedDir.map(new File(_, "otu_map.txt")) + def qiimeOpenOtuTable: Option[File] = qiimeOpenDir.map(new File(_, "otu_table.biom")) + def qiimeOpenOtuMap: Option[File] = qiimeOpenDir.map(new File(_, "otu_map.txt")) + /** * Method where the multisample jobs should be added, this will be executed only when running the -sample argument is not given. 
*/ def addMultiSampleJobs(): Unit = { - val gss = samples.values.flatMap(_.gs.qiimeClosed).toList - val closedOtuTables = gss.map(_.otuTable) - val closedOtuMaps = gss.map(_.otuMap) + val qiimeCloseds = samples.values.flatMap(_.gearsSingle.qiimeClosed).toList + val closedOtuTables = qiimeCloseds.map(_.otuTable) + val closedOtuMaps = qiimeCloseds.map(_.otuMap) require(closedOtuTables.size == closedOtuMaps.size) if (closedOtuTables.nonEmpty) { if (closedOtuTables.size > 1) { @@ -85,14 +96,35 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs add(Ln(qscript, closedOtuMaps.head, qiimeClosedOtuMap.get)) add(Ln(qscript, closedOtuTables.head, qiimeClosedOtuTable.get)) } + } + + val qiimeOpens = samples.values.flatMap(_.gearsSingle.qiimeOpen).toList + val openOtuTables = qiimeOpens.map(_.otuTable) + val openOtuMaps = qiimeOpens.map(_.otuMap) + require(openOtuTables.size == openOtuMaps.size) + if (openOtuTables.nonEmpty) { + if (openOtuTables.size > 1) { + val mergeTables = new MergeOtuTables(qscript) + mergeTables.input = openOtuTables + mergeTables.outputFile = qiimeOpenOtuTable.get + add(mergeTables) + + val mergeMaps = new MergeOtuMaps(qscript) + mergeMaps.input = openOtuMaps + mergeMaps.output = qiimeOpenOtuMap.get + add(mergeMaps) - //TODO: Plots + } else { + add(Ln(qscript, openOtuMaps.head, qiimeOpenOtuMap.get)) + add(Ln(qscript, openOtuTables.head, qiimeOpenOtuTable.get)) + } } } /** * Factory method for Sample class + * * @param id SampleId * @return Sample class */ @@ -101,6 +133,7 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs class Sample(sampleId: String) extends AbstractSample(sampleId) { /** * Factory method for Library class + * * @param id SampleId * @return Sample class */ @@ -115,10 +148,9 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs flexiprep.inputR2 = config("R2") flexiprep.outputDir = new File(libDir, "flexiprep") - lazy val gs = new 
GearsSingle(qscript) - gs.sampleId = Some(sampleId) - gs.libId = Some(libId) - gs.outputDir = libDir + val libraryGears: Boolean = config("library_gears", default = false) + + lazy val gearsSingle = if (libraryGears) Some(new GearsSingle(qscript)) else None /** Function that add library jobs */ protected def addJobs(): Unit = { @@ -126,9 +158,15 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs flexiprep.inputR2.foreach(inputFiles :+= InputFile(_, config("R2_md5"))) add(flexiprep) - gs.fastqR1 = Some(flexiprep.fastqR1Qc) - gs.fastqR2 = flexiprep.fastqR2Qc - add(gs) + gearsSingle.foreach { gs => + gs.sampleId = Some(sampleId) + gs.libId = Some(libId) + gs.outputDir = libDir + + gs.fastqR1 = Some(addDownsample(flexiprep.fastqR1Qc, gs.outputDir)) + gs.fastqR2 = flexiprep.fastqR2Qc.map(addDownsample(_, gs.outputDir)) + add(gs) + } } /** Must return files to store into summary */ @@ -138,9 +176,9 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs def summaryStats = Map() } - lazy val gs = new GearsSingle(qscript) - gs.sampleId = Some(sampleId) - gs.outputDir = sampleDir + lazy val gearsSingle = new GearsSingle(qscript) + gearsSingle.sampleId = Some(sampleId) + gearsSingle.outputDir = sampleDir /** Function to add sample jobs */ protected def addJobs(): Unit = { @@ -156,9 +194,9 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs add(Zcat(qscript, flexipreps.flatMap(_.fastqR2Qc)) | new Gzip(qscript) > file) } - gs.fastqR1 = Some(mergeR1) - gs.fastqR2 = mergeR2 - add(gs) + gearsSingle.fastqR1 = Some(addDownsample(mergeR1, gearsSingle.outputDir)) + gearsSingle.fastqR2 = mergeR2.map(addDownsample(_, gearsSingle.outputDir)) + add(gearsSingle) } /** Must return files to store into summary */ @@ -168,11 +206,28 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs def summaryStats: Any = Map() } + val downSample: Option[Double] = 
config("gears_downsample") + + def addDownsample(input: File, dir: File): File = { + downSample match { + case Some(x) => + val output = new File(dir, input.getName + ".fq.gz") + val seqtk = new SeqtkSample(this) + seqtk.input = input + seqtk.sample = x + add(seqtk | new Gzip(this) > output) + output + case _ => input + } + } + /** Must return a map with used settings for this pipeline */ - def summarySettings: Map[String, Any] = Map() + def summarySettings: Map[String, Any] = Map("gears_downsample" -> downSample) /** File to put in the summary for thie pipeline */ def summaryFiles: Map[String, File] = ( + qiimeOpenOtuTable.map("qiime_open_otu_table" -> _) ++ + qiimeOpenOtuMap.map("qiime_open_otu_map" -> _) ++ qiimeClosedOtuTable.map("qiime_closed_otu_table" -> _) ++ qiimeClosedOtuMap.map("qiime_closed_otu_map" -> _) ).toMap diff --git a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosed.scala b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosed.scala index da7a357b2cd239bf721bf52826a98ae332f1f34a..a57a6ee7622d0e37be33ac4b108c8b2bab83b973 100644 --- a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosed.scala +++ b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosed.scala @@ -18,15 +18,15 @@ import java.io.{ File, PrintWriter } import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.core.SampleLibraryTag -import nl.lumc.sasc.biopet.extensions.Flash import nl.lumc.sasc.biopet.extensions.qiime._ +import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSample import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript import scala.collection.mutable import scala.collection.mutable.ListBuffer -import scala.xml.{ PrettyPrinter, Elem } +import scala.xml.{ Elem, PrettyPrinter } /** * Created by pjvan_thof on 12/4/15. 
@@ -43,6 +43,7 @@ class GearsQiimeClosed(val root: Configurable) extends QScript with SummaryQScri def init() = { require(fastqInput != null) + require(sampleId.isDefined) } private var _otuMap: File = _ @@ -60,7 +61,7 @@ class GearsQiimeClosed(val root: Configurable) extends QScript with SummaryQScri add(splitLib) val closedReference = new PickClosedReferenceOtus(this) - closedReference.inputFasta = splitLib.outputSeqs + closedReference.inputFasta = addDownsample(splitLib.outputSeqs, new File(splitLib.outputDir, s"${sampleId.get}.downsample.fna")) closedReference.outputDir = new File(outputDir, "pick_closed_reference_otus") add(closedReference) _otuMap = closedReference.otuMap @@ -77,6 +78,21 @@ class GearsQiimeClosed(val root: Configurable) extends QScript with SummaryQScri /** Name of summary output file */ def summaryFile: File = new File(outputDir, "summary.closed_reference.json") + + val downSample: Option[Double] = config("downsample") + + def addDownsample(input: File, output: File): File = { + downSample match { + case Some(x) => + val seqtk = new SeqtkSample(this) + seqtk.input = input + seqtk.sample = x + seqtk.output = output + add(seqtk) + output + case _ => input + } + } } object GearsQiimeClosed { @@ -111,9 +127,8 @@ object GearsQiimeClosed { taxonomy.foldLeft(root) { (a, b) => val n = b.split("__", 2) val level = n(0) - val name = n(1) - val bla = a.childs.find(_ == TaxNode(name, level)) - bla match { + val name = if (level == "Unassigned") "Unassigned" else n(1) + a.childs.find(_ == TaxNode(name, level)) match { case Some(node) => node case _ => val node = TaxNode(name, level) diff --git a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeOpen.scala b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeOpen.scala new file mode 100644 index 0000000000000000000000000000000000000000..edfac0a1a41bf790145c3188358fe50ef1b068b7 --- /dev/null +++ b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeOpen.scala 
@@ -0,0 +1,90 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.gears + +import nl.lumc.sasc.biopet.core.SampleLibraryTag +import nl.lumc.sasc.biopet.core.summary.SummaryQScript +import nl.lumc.sasc.biopet.extensions.qiime._ +import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSample +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +/** + * Created by pjvan_thof on 12/4/15. 
+ */ +class GearsQiimeOpen(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag { + + var fastqInput: File = _ + + override def defaults = Map( + "splitlibrariesfastq" -> Map( + "barcode_type" -> "not-barcoded" + ) + ) + + def init() = { + require(fastqInput != null) + require(sampleId.isDefined) + } + + private var _otuMap: File = _ + def otuMap = _otuMap + + private var _otuTable: File = _ + def otuTable = _otuTable + + def biopetScript() = { + + val splitLib = new SplitLibrariesFastq(this) + splitLib.input :+= fastqInput + splitLib.outputDir = new File(outputDir, "split_libraries_fastq") + sampleId.foreach(splitLib.sampleIds :+= _.replaceAll("_", "-")) + add(splitLib) + + val openReference = new PickOpenReferenceOtus(this) + openReference.inputFasta = addDownsample(splitLib.outputSeqs, new File(splitLib.outputDir, s"${sampleId.get}.downsample.fna")) + openReference.outputDir = new File(outputDir, "pick_open_reference_otus") + add(openReference) + _otuMap = openReference.otuMap + _otuTable = openReference.otuTable + + addSummaryJobs() + } + + /** Must return a map with used settings for this pipeline */ + def summarySettings: Map[String, Any] = Map() + + /** File to put in the summary for the pipeline */ + def summaryFiles: Map[String, File] = Map("otu_table" -> otuTable, "otu_map" -> otuMap) + + /** Name of summary output file */ + def summaryFile: File = new File(outputDir, "summary.open_reference.json") + + val downSample: Option[Double] = config("downsample") + + def addDownsample(input: File, output: File): File = { + downSample match { + case Some(x) => + val seqtk = new SeqtkSample(this) + seqtk.input = input + seqtk.sample = x + seqtk.output = output + add(seqtk) + output + case _ => input + } + } +} + diff --git a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsReport.scala b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsReport.scala index 
21167a04aeea81fa537c62449a24e326a53fdb88..d28668c484e60ce3aaf781fb974fffd16651477e 100644 --- a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsReport.scala +++ b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsReport.scala @@ -40,6 +40,8 @@ object GearsReport extends MultisampleReportBuilder { val krakenExecuted = summary.getSampleValues("gearskraken", "stats", "krakenreport").values.forall(_.isDefined) val qiimeClosesOtuTable = summary.getValue("gears", "files", "pipeline", "qiime_closed_otu_table", "path") .map(x => new File(x.toString)) + val qiimeOpenOtuTable = summary.getValue("gears", "files", "pipeline", "qiime_open_otu_table", "path") + .map(x => new File(x.toString)) ReportPage( (if (krakenExecuted) List("Kraken analysis" -> ReportPage(List(), List( @@ -48,6 +50,9 @@ object GearsReport extends MultisampleReportBuilder { else Nil) ::: (if (qiimeClosesOtuTable.isDefined) List("Qiime closed reference analysis" -> ReportPage(List(), List( "Krona plot" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" )), Map("biomFile" -> qiimeClosesOtuTable.get))) + else Nil) ::: (if (qiimeOpenOtuTable.isDefined) List("Qiime open reference analysis" -> ReportPage(List(), List( + "Krona plot" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" + )), Map("biomFile" -> qiimeOpenOtuTable.get))) else Nil) ::: List("Samples" -> generateSamplesPage(pageArgs)) ++ Map( "Versions" -> ReportPage(List(), List( @@ -71,6 +76,8 @@ object GearsReport extends MultisampleReportBuilder { val krakenExecuted = summary.getValue(Some(sampleId), None, "gearskraken", "stats", "krakenreport").isDefined val qiimeClosesOtuTable = summary.getValue(Some(sampleId), None, "gearsqiimeclosed", "files", "pipeline", "otu_table", "path") .map(x => new File(x.toString)) + val qiimeOpenOtuTable = summary.getValue(Some(sampleId), None, "gearsqiimeopen", "files", "pipeline", "otu_table", "path") + .map(x => new File(x.toString)) ReportPage((if 
(krakenExecuted) List("Kraken" -> ReportPage(List(), List( "Kraken analysis" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/krakenKrona.ssp" @@ -78,6 +85,9 @@ object GearsReport extends MultisampleReportBuilder { else Nil) ::: (if (qiimeClosesOtuTable.isDefined) List("Qiime closed reference analysis" -> ReportPage(List(), List( "Krona plot" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" )), Map("biomFile" -> qiimeClosesOtuTable.get))) + else Nil) ::: (if (qiimeOpenOtuTable.isDefined) List("Qiime open reference analysis" -> ReportPage(List(), List( + "Krona plot" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" + )), Map("biomFile" -> qiimeOpenOtuTable.get))) else Nil) ::: List( "Libraries" -> generateLibraryPage(args) ), List("QC reads" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp"), @@ -91,6 +101,8 @@ object GearsReport extends MultisampleReportBuilder { val krakenExecuted = summary.getValue(Some(sampleId), Some(libId), "gearskraken", "stats", "krakenreport").isDefined val qiimeClosesOtuTable = summary.getValue(Some(sampleId), Some(libId), "gearsqiimeclosed", "files", "pipeline", "otu_table", "path") .map(x => new File(x.toString)) + val qiimeOpenOtuTable = summary.getValue(Some(sampleId), Some(libId), "gearsqiimeopen", "files", "pipeline", "otu_table", "path") + .map(x => new File(x.toString)) ReportPage( (if (flexiprepExecuted) List("QC" -> FlexiprepReport.flexiprepPage) else Nil @@ -100,6 +112,9 @@ object GearsReport extends MultisampleReportBuilder { else Nil) ::: (if (qiimeClosesOtuTable.isDefined) List("Qiime closed reference analysis" -> ReportPage(List(), List( "Krona plot" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" )), Map("biomFile" -> qiimeClosesOtuTable.get))) + else Nil) ::: (if (qiimeOpenOtuTable.isDefined) List("Qiime open reference analysis" -> ReportPage(List(), List( + "Krona plot" -> 
ReportSection("/nl/lumc/sasc/biopet/pipelines/gears/qiimeKrona.ssp" + )), Map("biomFile" -> qiimeOpenOtuTable.get))) else Nil), List( "QC reads" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepReadSummary.ssp"), "QC bases" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepBaseSummary.ssp") diff --git a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingle.scala b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingle.scala index beba80ad632fa135374a8e23d21c55f007dc7545..60e72489ca37dbfbb5d5d3ffbaca10a1c9c3f313 100644 --- a/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingle.scala +++ b/gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingle.scala @@ -18,6 +18,7 @@ import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.core.BiopetQScript.InputFile import nl.lumc.sasc.biopet.core.{ PipelineCommand, SampleLibraryTag } import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep +import nl.lumc.sasc.biopet.utils.Logging import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript @@ -42,12 +43,14 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi lazy val krakenScript = if (config("gears_use_kraken", default = true)) Some(new GearsKraken(this)) else None lazy val qiimeRatx = if (config("gears_use_qiime_rtax", default = false)) Some(new GearsQiimeRtax(this)) else None lazy val qiimeClosed = if (config("gears_use_qiime_closed", default = false)) Some(new GearsQiimeClosed(this)) else None + lazy val qiimeOpen = if (config("gears_use_qiime_open", default = false)) Some(new GearsQiimeOpen(this)) else None lazy val seqCount = if (config("gears_use_seq_count", default = false)) Some(new GearsSeqCount(this)) else None /** Executed before running the script */ def init(): Unit = { - require(fastqR1.isDefined || bamFile.isDefined, "Please specify fastq-file(s) or bam file") - 
require(fastqR1.isDefined != bamFile.isDefined, "Provide either a bam file or a R1/R2 file") + if (!fastqR1.isDefined && !bamFile.isDefined) Logging.addError("Please specify fastq-file(s) or bam file") + if (fastqR1.isDefined == bamFile.isDefined) Logging.addError("Provide either a bam file or a R1/R2 file") + if (sampleId == null || sampleId == None) Logging.addError("Missing sample ID on GearsSingle module") if (outputName == null) { if (fastqR1.isDefined) outputName = fastqR1.map(_.getName @@ -61,7 +64,7 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi if (fastqR1.isDefined) { fastqR1.foreach(inputFiles :+= InputFile(_)) fastqR2.foreach(inputFiles :+= InputFile(_)) - } else inputFiles :+= InputFile(bamFile.get) + } else bamFile.foreach(inputFiles :+= InputFile(_)) } override def reportClass = { @@ -80,6 +83,8 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi val flexiprep = new Flexiprep(this) flexiprep.inputR1 = r1 flexiprep.inputR2 = r2 + flexiprep.sampleId = if (sampleId.isEmpty) Some("noSampleName") else sampleId + flexiprep.libId = if (libId.isEmpty) Some("noLibName") else libId flexiprep.outputDir = new File(outputDir, "flexiprep") add(flexiprep) (flexiprep.fastqR1Qc, flexiprep.fastqR2Qc) @@ -134,6 +139,12 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi add(qiimeClosed) } + qiimeOpen foreach { qiimeOpen => + qiimeOpen.outputDir = new File(outputDir, "qiime_open") + qiimeOpen.fastqInput = combinedFastq + add(qiimeOpen) + } + seqCount.foreach { seqCount => seqCount.fastqInput = combinedFastq seqCount.outputDir = new File(outputDir, "seq_count") @@ -151,7 +162,8 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi "skip_flexiprep" -> skipFlexiprep, "gears_use_kraken" -> krakenScript.isDefined, "gear_use_qiime_rtax" -> qiimeRatx.isDefined, - "gear_use_qiime_closed" -> qiimeClosed.isDefined + "gear_use_qiime_closed" -> 
qiimeClosed.isDefined, + "gear_use_qiime_open" -> qiimeOpen.isDefined ) /** Statistics shown in the summary file */ diff --git a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsKrakenTest.scala b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsKrakenTest.scala index 4b3c0b05d8866306bddccea27c903ca7f22b787e..9f6e98c617ab40ff1eaa241f29f17f50c36b355f 100644 --- a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsKrakenTest.scala +++ b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsKrakenTest.scala @@ -24,7 +24,7 @@ import org.testng.annotations.Test /** * Created by pjvan_thof on 2/5/16. */ -class GearsKrakenTest extends TestNGSuite with Matchers { +class KrakenToKronaTest extends TestNGSuite with Matchers { private def resourcePath(p: String): String = { Paths.get(getClass.getResource(p).toURI).toString } diff --git a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosedTest.scala b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosedTest.scala index 4bf714302a4ee584288093ecd708f68c40489e9c..568abfbe0f6e3e17af96febea71a6efb5ccf2bce 100644 --- a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosedTest.scala +++ b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsQiimeClosedTest.scala @@ -24,7 +24,7 @@ import org.testng.annotations.Test /** * Created by pjvan_thof on 2/5/16. 
*/ -class GearsQiimeClosedTest extends TestNGSuite with Matchers { +class BiomToKronaTest extends TestNGSuite with Matchers { private def resourcePath(p: String): String = { Paths.get(getClass.getResource(p).toURI).toString } diff --git a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala index 3a7f27c64e21cec810317736446a27e77d4f331b..31ffcab22c517a62264166c95564081aa431b6cc 100644 --- a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala +++ b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsSingleTest.scala @@ -21,9 +21,8 @@ import nl.lumc.sasc.biopet.extensions.kraken.{ Kraken, KrakenReport } import nl.lumc.sasc.biopet.extensions.picard.SamToFastq import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsView import nl.lumc.sasc.biopet.extensions.tools.KrakenReportToJson -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Config -import org.apache.commons.io.FileUtils import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite @@ -35,7 +34,7 @@ import org.testng.annotations._ * Created by wyleung on 10/22/15. 
*/ -class GearsSingleTest extends TestNGSuite with Matchers { +abstract class TestGearsSingle extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): GearsSingle = { new GearsSingle { override def configNamespace = "gearssingle" @@ -47,80 +46,160 @@ class GearsSingleTest extends TestNGSuite with Matchers { } } - @DataProvider(name = "gearsOptions") - def gearsOptions = { - val bool = Array(true, false) - - for ( - fromBam <- bool; - pair <- bool; - hasOutputName <- bool; - kraken <- bool; - qiimeClosed <- bool; - qiimeRtax <- bool; - seqCount <- bool - ) yield Array("", fromBam, pair, hasOutputName, kraken, qiimeClosed, qiimeRtax, seqCount) - } + def paired: Boolean = false + def hasOutputName: Boolean = false + def kraken: Option[Boolean] = None + def qiimeClosed: Boolean = false + def qiimeOpen: Boolean = false + def qiimeRtax: Boolean = false + def seqCount: Boolean = false + def downsample: Option[Int] = None + + def inputMode: Option[String] = Some("fastq") - @Test(dataProvider = "gearsOptions") - def testGears(dummy: String, - fromBam: Boolean, - paired: Boolean, - hasOutputName: Boolean, - kraken: Boolean, - qiimeClosed: Boolean, - qiimeRtax: Boolean, - seqCount: Boolean) = { + @Test + def testGears(): Unit = { val map = ConfigUtils.mergeMaps(Map( - "gears_use_kraken" -> kraken, "gears_use_qiime_rtax" -> qiimeRtax, "gears_use_qiime_closed" -> qiimeClosed, + "gears_use_qiime_open" -> qiimeOpen, "gears_use_seq_count" -> seqCount, - "output_dir" -> GearsSingleTest.outputDir - ), Map(GearsSingleTest.executables.toSeq: _*)) + "output_dir" -> TestGearsSingle.outputDir + ) ++ + kraken.map("gears_use_kraken" -> _) ++ + downsample.map("downsample" -> _), + Map(TestGearsSingle.executables.toSeq: _*)) val gears: GearsSingle = initPipeline(map) - - if (fromBam) { - gears.bamFile = Some(GearsSingleTest.bam) - } else { - gears.fastqR1 = Some(GearsSingleTest.r1) - gears.fastqR2 = if (paired) Some(GearsSingleTest.r2) else None + gears.sampleId = 
Some("sampleName") + gears.libId = Some("libName") + + inputMode match { + case Some("fastq") => + gears.fastqR1 = Some(TestGearsSingle.r1) + gears.fastqR2 = if (paired) Some(TestGearsSingle.r2) else None + case Some("bam") => gears.bamFile = Some(TestGearsSingle.bam) + case None => + case _ => new IllegalStateException(s"$inputMode not allowed as inputMode") } + if (hasOutputName) gears.outputName = "test" - gears.script() - - if (hasOutputName) { - gears.outputName shouldBe "test" + if (inputMode.isEmpty) { + intercept[IllegalArgumentException] { + gears.script() + } + Logging.errors.clear() } else { - // in the following cases the filename should have been determined by the filename - gears.outputName shouldBe (if (fromBam) "bamfile" else "R1") + + gears.script() + + if (hasOutputName) { + gears.outputName shouldBe "test" + } else { + // in the following cases the filename should have been determined by the filename + gears.outputName shouldBe (if (inputMode == Some("bam")) "bamfile" else "R1") + } + + gears.summarySettings("gears_use_kraken") shouldBe kraken.getOrElse(true) + gears.summarySettings("gear_use_qiime_rtax") shouldBe qiimeRtax + gears.summarySettings("gear_use_qiime_closed") shouldBe qiimeClosed + gears.summarySettings("gear_use_qiime_open") shouldBe qiimeOpen + + gears.krakenScript.isDefined shouldBe kraken.getOrElse(true) + gears.qiimeClosed.isDefined shouldBe qiimeClosed + gears.qiimeOpen.isDefined shouldBe qiimeOpen + gears.qiimeRatx.isDefined shouldBe qiimeRtax + gears.seqCount.isDefined shouldBe seqCount + + // SamToFastq should have started if it was started from bam + gears.functions.count(_.isInstanceOf[SamtoolsView]) shouldBe (if (inputMode == Some("bam")) 1 else 0) + gears.functions.count(_.isInstanceOf[SamToFastq]) shouldBe (if (inputMode == Some("bam")) 1 else 0) + + gears.functions.count(_.isInstanceOf[Kraken]) shouldBe (if (kraken.getOrElse(true)) 1 else 0) + gears.functions.count(_.isInstanceOf[KrakenReport]) shouldBe (if 
(kraken.getOrElse(true)) 1 else 0) + gears.functions.count(_.isInstanceOf[KrakenReportToJson]) shouldBe (if (kraken.getOrElse(true)) 1 else 0) } + } +} + +class GearsSingleNoInputTest extends TestGearsSingle { + override def inputMode = None +} + +class GearsSingleDefaultTest extends TestGearsSingle +class GearsSingleKrakenTest extends TestGearsSingle { + override def kraken = Some(true) +} +class GearsSingleQiimeClosedTest extends TestGearsSingle { + override def qiimeClosed = true +} +class GearsSingleQiimeOpenTest extends TestGearsSingle { + override def qiimeOpen = true +} +class GearsSingleQiimeRtaxTest extends TestGearsSingle { + override def qiimeRtax = true +} +class GearsSingleseqCountTest extends TestGearsSingle { + override def seqCount = true +} - gears.krakenScript.isDefined shouldBe kraken - gears.qiimeClosed.isDefined shouldBe qiimeClosed - gears.qiimeRatx.isDefined shouldBe qiimeRtax - gears.seqCount.isDefined shouldBe seqCount +class GearsSingleKrakenPairedTest extends TestGearsSingle { + override def paired = true + override def kraken = Some(true) +} +class GearsSingleQiimeClosedPairedTest extends TestGearsSingle { + override def paired = true + override def qiimeClosed = true +} +class GearsSingleQiimeOpenPairedTest extends TestGearsSingle { + override def paired = true + override def qiimeOpen = true +} +class GearsSingleQiimeRtaxPairedTest extends TestGearsSingle { + override def paired = true + override def qiimeRtax = true +} +class GearsSingleseqCountPairedTest extends TestGearsSingle { + override def paired = true + override def seqCount = true +} - // SamToFastq should have started if it was started from bam - gears.functions.count(_.isInstanceOf[SamtoolsView]) shouldBe (if (fromBam) 1 else 0) - gears.functions.count(_.isInstanceOf[SamToFastq]) shouldBe (if (fromBam) 1 else 0) +class GearsSingleAllTest extends TestGearsSingle { + override def kraken = Some(true) + override def qiimeClosed = true + override def qiimeOpen = true + override 
def qiimeRtax = true + override def seqCount = true +} +class GearsSingleAllPairedTest extends TestGearsSingle { + override def kraken = Some(true) + override def qiimeClosed = true + override def qiimeOpen = true + override def qiimeRtax = true + override def seqCount = true + override def paired = true +} - gears.functions.count(_.isInstanceOf[Kraken]) shouldBe (if (kraken) 1 else 0) - gears.functions.count(_.isInstanceOf[KrakenReport]) shouldBe (if (kraken) 1 else 0) - gears.functions.count(_.isInstanceOf[KrakenReportToJson]) shouldBe (if (kraken) 1 else 0) - } +class GearsSingleBamTest extends TestGearsSingle { + override def inputMode = Some("bam") +} - // remove temporary run directory all tests in the class have been run - @AfterClass def removeTempOutputDir() = { - FileUtils.deleteDirectory(GearsSingleTest.outputDir) - } +class GearsSingleQiimeClosedDownsampleTest extends TestGearsSingle { + override def paired = true + override def qiimeClosed = true + override def downsample = Some(10000) +} +class GearsSingleQiimeOpenDownsampleTest extends TestGearsSingle { + override def paired = true + override def qiimeOpen = true + override def downsample = Some(10000) } -object GearsSingleTest { +object TestGearsSingle { val outputDir = Files.createTempDir() + outputDir.deleteOnExit() new File(outputDir, "input").mkdirs() val r1 = new File(outputDir, "input" + File.separator + "R1.fq") @@ -141,6 +220,7 @@ object GearsSingleTest { "md5sum" -> Map("exe" -> "test"), "assigntaxonomy" -> Map("exe" -> "test"), "pickclosedreferenceotus" -> Map("exe" -> "test"), + "pickopenreferenceotus" -> Map("exe" -> "test"), "pickotus" -> Map("exe" -> "test"), "pickrepset" -> Map("exe" -> "test"), "splitlibrariesfastq" -> Map("exe" -> "test"), diff --git a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala index 
9a62e439a082f38e002a5db25e445fae2a9d3af1..01332f4c3f8e94a9b8d9b27d7b0246ba2fa5b731 100644 --- a/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala +++ b/gears/src/test/scala/nl/lumc/sasc/biopet/pipelines/gears/GearsTest.scala @@ -17,18 +17,17 @@ package nl.lumc.sasc.biopet.pipelines.gears import java.io.File import com.google.common.io.Files -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Config -import org.apache.commons.io.FileUtils import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite -import org.testng.annotations.{ DataProvider, Test, AfterClass } +import org.testng.annotations.{ DataProvider, Test } /** * Created by pjvanthof on 04/02/16. */ -class GearsTest extends TestNGSuite with Matchers { +abstract class GearsTest extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Gears = { new Gears { override def configNamespace = "gears" @@ -40,45 +39,71 @@ class GearsTest extends TestNGSuite with Matchers { } } + def kraken: Option[Boolean] = None + def qiimeClosed: Boolean = false + def qiimeOpen: Boolean = false + def qiimeRtax: Boolean = false + def seqCount: Boolean = false + def libraryGears: Boolean = false + @DataProvider(name = "gearsOptions") def gearsOptions = { val bool = Array(true, false) for ( - s1 <- bool; s2 <- bool; qiimeClosed <- bool - ) yield Array("", s1, s2, qiimeClosed) + s1 <- bool; s2 <- bool + ) yield Array("", s1, s2) } @Test(dataProvider = "gearsOptions") - def testGears(dummy: String, sample1: Boolean, sample2: Boolean, qiimeCLosed: Boolean): Unit = { + def testGears(dummy: String, sample1: Boolean, sample2: Boolean): Unit = { val map = { var m: Map[String, Any] = GearsTest.config if (sample1) m = ConfigUtils.mergeMaps(GearsTest.sample1, m) if (sample2) m = ConfigUtils.mergeMaps(GearsTest.sample2, m) - 
ConfigUtils.mergeMaps(Map("gear_use_qiime_closed" -> qiimeCLosed), m) + ConfigUtils.mergeMaps(Map( + "gears_use_qiime_rtax" -> qiimeRtax, + "gears_use_qiime_closed" -> qiimeClosed, + "gears_use_qiime_open" -> qiimeOpen, + "gears_use_seq_count" -> seqCount, + "library_gears" -> libraryGears, + "output_dir" -> TestGearsSingle.outputDir + ) ++ kraken.map("gears_use_kraken" -> _), m) } if (!sample1 && !sample2) { // When no samples - intercept[IllegalArgumentException] { + intercept[IllegalStateException] { initPipeline(map).script() } + Logging.errors.clear() } else { val pipeline = initPipeline(map) pipeline.script() } } +} - // remove temporary run directory all tests in the class have been run - @AfterClass def removeTempOutputDir() = { - FileUtils.deleteDirectory(GearsTest.outputDir) - } - +class GearsDefaultTest extends GearsTest +class GearsKrakenTest extends GearsTest { + override def kraken = Some(true) +} +class GearsQiimeClosedTest extends GearsTest { + override def kraken = Some(false) + override def qiimeClosed = true +} +class GearsQiimeOpenTest extends GearsTest { + override def kraken = Some(false) + override def qiimeOpen = true +} +class GearsLibraryTest extends GearsTest { + override def libraryGears = true } object GearsTest { val outputDir = Files.createTempDir() new File(outputDir, "input").mkdirs() + outputDir.deleteOnExit() val r1 = new File(outputDir, "input" + File.separator + "R1.fq") Files.touch(r1) @@ -104,7 +129,8 @@ object GearsTest { "fastqc" -> Map("exe" -> "test"), "seqtk" -> Map("exe" -> "test"), "sickle" -> Map("exe" -> "test"), - "cutadapt" -> Map("exe" -> "test") + "cutadapt" -> Map("exe" -> "test"), + "pickopenreferenceotus" -> Map("exe" -> "test") ) val sample1 = Map( diff --git a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala index 
da0de36e440a8f2a79b25c6ca5735d59027b9fca..234c0961729a8ee8884b1113fe05cc7081dc987b 100644 --- a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala +++ b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala @@ -24,6 +24,7 @@ import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2Build, BowtieBuild } import nl.lumc.sasc.biopet.extensions.bwa.BwaIndex import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants import nl.lumc.sasc.biopet.extensions.gmap.GmapBuild +import nl.lumc.sasc.biopet.extensions.hisat.Hisat2Build import nl.lumc.sasc.biopet.extensions.picard.CreateSequenceDictionary import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFaidx import nl.lumc.sasc.biopet.utils.ConfigUtils @@ -267,6 +268,17 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript "bowtie_index" -> bowtie2Index.reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") ) + // Hisat2 index + val hisat2Index = new Hisat2Build(this) + hisat2Index.inputFasta = createLinks(new File(genomeDir, "hisat2")) + hisat2Index.hisat2IndexBase = new File(new File(genomeDir, "hisat2"), "reference").getAbsolutePath + add(hisat2Index) + configDeps :+= hisat2Index.jobOutputFile + outputConfig += "hisat2" -> Map( + "reference_fasta" -> hisat2Index.inputFasta.getAbsolutePath, + "hisat_index" -> hisat2Index.inputFasta.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") + ) + val writeConfig = new WriteConfig writeConfig.deps = configDeps writeConfig.out = new File(genomeDir, s"$speciesName-$genomeName.json") diff --git a/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala b/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala index c26f690b3c1c8e8d41bfb82dc551ed14d92775da..5931c7e0e827be453141f191d4f05005e27931c5 100644 --- 
a/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala +++ b/generate-indexes/src/test/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexesTest.scala @@ -102,6 +102,7 @@ object GenerateIndexesTest { val config = Map("output_dir" -> outputDir, "bwa" -> Map("exe" -> "test"), "star" -> Map("exe" -> "test"), + "hisat2build" -> Map("exe" -> "test"), "bowtiebuild" -> Map("exe" -> "test"), "bowtie2build" -> Map("exe" -> "test"), "gmapbuild" -> Map("exe" -> "test"), diff --git a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala index 767f0a72a146f0252b3427ef09c3936bff79c9f6..2582192a319ff36ded25fb359671d2d6a61112f9 100644 --- a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala +++ b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala @@ -117,16 +117,18 @@ class Gentrap(val root: Configurable) extends QScript ), // disable markduplicates since it may not play well with all aligners (this can still be overriden via config) "mapping" -> Map( + "aligner" -> "gsnap", "skip_markduplicates" -> true + ), + "wipereads" -> Map( + "limit_removal" -> true, + "no_make_index" -> false ) ) lazy val fragmentsPerGene = if (expMeasures().contains(ExpMeasures.FragmentsPerGene)) Some(new FragmentsPerGene(this)) else None - lazy val fragmentsPerExon = if (expMeasures().contains(ExpMeasures.FragmentsPerExon)) - Some(new FragmentsPerExon(this)) else None - lazy val baseCounts = if (expMeasures().contains(ExpMeasures.BaseCounts)) Some(new BaseCounts(this)) else None @@ -139,7 +141,7 @@ class Gentrap(val root: Configurable) extends QScript lazy val cufflinksStrict = if (expMeasures().contains(ExpMeasures.CufflinksStrict)) Some(new CufflinksStrict(this)) else None - def executedMeasures = (fragmentsPerGene :: fragmentsPerExon :: baseCounts :: cufflinksBlind :: + def executedMeasures = 
(fragmentsPerGene :: baseCounts :: cufflinksBlind :: cufflinksGuided :: cufflinksStrict :: Nil).flatten /** Whether to do simple variant calling on RNA or not */ @@ -208,7 +210,7 @@ class Gentrap(val root: Configurable) extends QScript job.inputBam = bamFile.get ribosomalRefFlat().foreach(job.intervalFile = _) job.outputBam = createFile("cleaned.bam") - job.discardedBam = createFile("rrna.bam") + job.discardedBam = Some(createFile("rrna.bam")) add(job) Some(job.outputBam) } else bamFile @@ -248,7 +250,7 @@ object Gentrap extends PipelineCommand { } /** Converts string with underscores into camel-case strings */ - private def camelize(ustring: String): String = ustring + private[gentrap] def camelize(ustring: String): String = ustring .split("_") .map(_.toLowerCase.capitalize) .mkString("") diff --git a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/FragmentsPerGene.scala b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/FragmentsPerGene.scala index 40cf4ca6798bc5106a81cd92687bef2d49a2f5e4..f06a3b8e857e3b1766d92554c954a6276d880dcb 100644 --- a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/FragmentsPerGene.scala +++ b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/FragmentsPerGene.scala @@ -16,6 +16,7 @@ package nl.lumc.sasc.biopet.pipelines.gentrap.measures import nl.lumc.sasc.biopet.core.annotations.AnnotationGtf import nl.lumc.sasc.biopet.extensions.HtseqCount +import nl.lumc.sasc.biopet.extensions.picard.SortSam import nl.lumc.sasc.biopet.utils.config.Configurable import org.broadinstitute.gatk.queue.QScript @@ -25,19 +26,31 @@ import org.broadinstitute.gatk.queue.QScript class FragmentsPerGene(val root: Configurable) extends QScript with Measurement with AnnotationGtf { def mergeArgs = MergeArgs(idCols = List(1), valCol = 2, numHeaderLines = 0, fallback = "0") - override def fixedValues: Map[String, Any] = Map("htseqcount" -> Map("order" -> "pos")) + override def 
fixedValues: Map[String, Any] = Map("htseqcount" -> Map("order" -> "")) + + lazy val sortOnId: Boolean = config("sort_on_id", default = true) /** Pipeline itself */ def biopetScript(): Unit = { val jobs = bamFiles.map { case (id, file) => - //TODO: ID sorting job + + val bamFile = if (sortOnId) { + val sortSam = new SortSam(this) + sortSam.input = file + sortSam.output = swapExt(outputDir, file, ".bam", ".idsorted.bam") + sortSam.sortOrder = "queryname" + sortSam.isIntermediate = true + add(sortSam) + sortSam.output + } else file val job = new HtseqCount(this) job.inputAnnotation = annotationGtf - job.inputAlignment = file + job.inputAlignment = bamFile job.output = new File(outputDir, s"$id.$name.counts") job.format = Option("bam") + job.order = if (sortOnId) Some("name") else Some("pos") add(job) id -> job } diff --git a/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala b/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala index a981f4e48f85b440e8091eeac8f3687cc88b49ea..4f2ae6ea2121b6427a3086e28ba9e46085a8bf27 100644 --- a/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala +++ b/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala @@ -17,16 +17,19 @@ package nl.lumc.sasc.biopet.pipelines.gentrap import java.io.{ File, FileOutputStream } import com.google.common.io.Files +import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, BiopetPipe } import nl.lumc.sasc.biopet.extensions._ -import nl.lumc.sasc.biopet.extensions.tools.BaseCounter -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.extensions.gmap.Gsnap +import nl.lumc.sasc.biopet.extensions.hisat.Hisat2 +import nl.lumc.sasc.biopet.extensions.tools.{ BaseCounter, WipeReads } +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Config import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers import 
org.scalatest.testng.TestNGSuite import org.testng.annotations.{ DataProvider, Test } -abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNGSuite with Matchers { +abstract class GentrapTestAbstract(val expressionMeasures: List[String]) extends TestNGSuite with Matchers { def initPipeline(map: Map[String, Any]): Gentrap = { new Gentrap() { @@ -39,116 +42,127 @@ abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNG } } - /** Convenience method for making library config */ - private def makeLibConfig(idx: Int, paired: Boolean = true) = { - val files = Map("R1" -> GentrapTest.inputTouch("test_R1.fq")) - if (paired) (s"lib_$idx", files ++ Map("R2" -> GentrapTest.inputTouch("test_R2.fq"))) - else (s"lib_$idx", files) - } - - /** Convenience type for sample config */ - private type SamplesConfig = Map[String, Map[String, Map[String, Map[String, Map[String, String]]]]] - - /** Convenience method for making a single sample config */ - private def makeSampleConfig(sampleIdx: Int, numLibs: Int, paired: Boolean) = - (s"sample_$sampleIdx", - Map("libraries" -> - (1 to numLibs) - .map(n => makeLibConfig(n, paired)) - .toMap - ) - ) - - /** Convenience method for making all samples config */ - private def makeSamplesConfig(numSamples: Int, numLibsEachSample: Int, pairMode: String): SamplesConfig = - Map("samples" -> - (1 to numSamples) - // if paired == "mixed", alternate paired/not paired between each number - .map(n => makeSampleConfig(n, numLibsEachSample, if (pairMode == "mixed") n % 2 == 0 else pairMode == "paired")) - .toMap - ) + def strandProtocols = Array("non_specific", "dutp") - val validExpressionMeasures = Set( - "fragments_per_gene", "fragments_per_exon", "base_counts", - "cufflinks_strict", "cufflinks_guided", "cufflinks_blind") + def aligner: Option[String] = None + def removeRiboReads: Option[Boolean] = Some(false) + def sample1: Boolean = true + def sample2: Boolean = true + def callVariants: Option[Boolean] 
= None @DataProvider(name = "expMeasuresstrandProtocol") def expMeasuresStrandProtocolProvider = { - - //val sampleConfigs = Array(pairedOneSampleOneLib, pairedOneSampleTwoLib, pairedOneSampleThreeLib) - val sampleConfigs = for { - (sampleNum, libNum) <- Seq( - // check multiple libs for single run only ~ to trim down less-informative tests - // need to check 2 and 3 samples since multi-sample plotting differs when sample is 1 or 2 and 3 - (1, 1), (1, 2), (2, 1), (3, 1) - ) - libType <- Seq("paired", "single", "mixed") - } yield makeSamplesConfig(sampleNum, libNum, libType) - - val strandProtocols = Array("non_specific", "dutp") - // get all possible combinations of expression measures - val expressionMeasures = validExpressionMeasures - //.subsets - //.map(_.toList) - .toArray - for { - sampleConfig <- sampleConfigs.toArray - //expressionMeasure <- expressionMeasures strandProtocol <- strandProtocols - } yield Array(sampleConfig, List(expressionMeasure), strandProtocol) + } yield Array(strandProtocol) } @Test(dataProvider = "expMeasuresstrandProtocol") - def testGentrap(sampleConfig: SamplesConfig, expMeasures: List[String], strandProtocol: String) = { + def testGentrap(strandProtocol: String) = { val settings = Map( "output_dir" -> GentrapTest.outputDir, "gsnap" -> Map("db" -> "test", "dir" -> "test"), - "aligner" -> "gsnap", - "expression_measures" -> expMeasures, + "expression_measures" -> expressionMeasures, "strand_protocol" -> strandProtocol - ) - val config = ConfigUtils.mergeMaps(settings ++ sampleConfig, Map(GentrapTest.executables.toSeq: _*)) + ) ++ + aligner.map("aligner" -> _) ++ + removeRiboReads.map("remove_ribosomal_reads" -> _) ++ + callVariants.map("call_variants" -> _) + val configs: List[Option[Map[String, Any]]] = List(Some(settings), (if (sample1) Some(GentrapTest.sample1) else None), (if (sample2) Some(GentrapTest.sample2) else None)) + val config = configs.flatten.foldLeft(GentrapTest.executables)((a, b) => ConfigUtils.mergeMaps(a, b)) val 
gentrap: Gentrap = initPipeline(config) - gentrap.script() - val functions = gentrap.functions.groupBy(_.getClass) - val numSamples = sampleConfig("samples").size - - if (expMeasures.contains("fragments_per_gene")) - assert(gentrap.functions.exists(_.isInstanceOf[HtseqCount])) - - if (expMeasures.contains("fragments_per_exon")) - assert(gentrap.functions.exists(_.isInstanceOf[HtseqCount])) - - if (expMeasures.contains("base_counts")) - gentrap.functions.count(_.isInstanceOf[BaseCounter]) shouldBe numSamples - - if (expMeasures.contains("cufflinks_strict")) { - assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) - assert(gentrap.functions.exists(_.isInstanceOf[Ln])) + val numSamples = (sample1, sample2) match { + case (true, true) => 2 + case (_, true) => 1 + case (true, _) => 1 + case _ => 0 } - if (expMeasures.contains("cufflinks_guided")) { - assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) - assert(gentrap.functions.exists(_.isInstanceOf[Ln])) - } + if (numSamples == 0) { + intercept[IllegalArgumentException] { + gentrap.script() + } + Logging.errors.clear() + } else { + gentrap.script() + + val functions = gentrap.functions.flatMap { + case f: BiopetFifoPipe => f.pipesJobs + case f: BiopetPipe => f.pipesJobs + case f => List(f) + }.groupBy(_.getClass) + + gentrap.shivaVariantcalling.isDefined shouldBe callVariants.getOrElse(false) + + gentrap.summarySettings.getOrElse("expression_measures", List()).asInstanceOf[List[String]].sorted shouldBe + expressionMeasures.map(Gentrap.camelize(_)).sorted + gentrap.summarySettings.get("call_variants") shouldBe Some(callVariants.getOrElse(false)) + gentrap.summarySettings.get("remove_ribosomal_reads") shouldBe Some(removeRiboReads.getOrElse(false)) + gentrap.summarySettings.get("strand_protocol") shouldBe Some(Gentrap.camelize(strandProtocol)) + + if (expressionMeasures.contains("fragments_per_gene")) + assert(gentrap.functions.exists(_.isInstanceOf[HtseqCount])) + + if 
(expressionMeasures.contains("fragments_per_exon")) + assert(gentrap.functions.exists(_.isInstanceOf[HtseqCount])) + + if (expressionMeasures.contains("base_counts")) + gentrap.functions.count(_.isInstanceOf[BaseCounter]) shouldBe numSamples + + if (expressionMeasures.contains("cufflinks_strict")) { + assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) + assert(gentrap.functions.exists(_.isInstanceOf[Ln])) + } + + if (expressionMeasures.contains("cufflinks_guided")) { + assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) + assert(gentrap.functions.exists(_.isInstanceOf[Ln])) + } + + if (expressionMeasures.contains("cufflinks_blind")) { + assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) + assert(gentrap.functions.exists(_.isInstanceOf[Ln])) + } + + gentrap.removeRibosomalReads shouldBe removeRiboReads.getOrElse(false) + gentrap.functions.exists(_.isInstanceOf[WipeReads]) shouldBe removeRiboReads.getOrElse(false) + + val classMap = Map( + "gsnap" -> classOf[Gsnap], + "tophat" -> classOf[Tophat], + "star" -> classOf[Star], + "star-2pass" -> classOf[Star], + "hisat2" -> classOf[Hisat2] + ) - if (expMeasures.contains("cufflinks_blind")) { - assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks])) - assert(gentrap.functions.exists(_.isInstanceOf[Ln])) + val alignerClass = classMap.get(aligner.getOrElse("gsnap")) + + alignerClass.foreach(c => assert(functions.keys.exists(_ == c))) + classMap.values.filterNot(Some(_) == alignerClass).foreach(x => assert(!functions.keys.exists(_ == x))) } } } -class GentrapFragmentsPerGeneTest extends GentrapTestAbstract("fragments_per_gene") +class GentrapFragmentsPerGeneTest extends GentrapTestAbstract(List("fragments_per_gene")) //class GentrapFragmentsPerExonTest extends GentrapTestAbstract("fragments_per_exon") -class GentrapBaseCountsTest extends GentrapTestAbstract("base_counts") -class GentrapCufflinksStrictTest extends GentrapTestAbstract("cufflinks_strict") -class GentrapCufflinksGuidedTest extends 
GentrapTestAbstract("cufflinks_guided") -class GentrapCufflinksBlindTest extends GentrapTestAbstract("cufflinks_blind") +class GentrapBaseCountsTest extends GentrapTestAbstract(List("base_counts")) +class GentrapCufflinksStrictTest extends GentrapTestAbstract(List("cufflinks_strict")) +class GentrapCufflinksGuidedTest extends GentrapTestAbstract(List("cufflinks_guided")) +class GentrapCufflinksBlindTest extends GentrapTestAbstract(List("cufflinks_blind")) +class GentrapAllTest extends GentrapTestAbstract(List("fragments_per_gene", "base_counts", "cufflinks_strict", "cufflinks_guided", "cufflinks_blind")) +class GentrapNoSamplesTest extends GentrapTestAbstract(List("fragments_per_gene")) { + override def sample1 = false + override def sample2 = false +} +class GentrapRemoveRibosomeTest extends GentrapTestAbstract(List("fragments_per_gene")) { + override def removeRiboReads = Some(true) +} +class GentrapCallVariantsTest extends GentrapTestAbstract(List("fragments_per_gene")) { + override def callVariants = Some(true) +} object GentrapTest { val outputDir = Files.createTempDir() @@ -171,14 +185,16 @@ object GentrapTest { copyFile("ref.dict") copyFile("ref.fa.fai") - val executables = Map( + val executables: Map[String, Any] = Map( "reference_fasta" -> (outputDir + File.separator + "ref.fa"), "refFlat" -> (outputDir + File.separator + "ref.fa"), "annotation_gtf" -> (outputDir + File.separator + "ref.fa"), "annotation_bed" -> (outputDir + File.separator + "ref.fa"), "annotation_refflat" -> (outputDir + File.separator + "ref.fa"), + "ribosome_refflat" -> (outputDir + File.separator + "ref.fa"), "varscan_jar" -> "test", - "rscript" -> Map("exe" -> "test") + "rscript" -> Map("exe" -> "test"), + "gatk_jar" -> "test" ) ++ Seq( // fastqc executables "fastqc", "seqtk", "sickle", "cutadapt", @@ -189,4 +205,26 @@ object GentrapTest { // bam2wig executables "igvtools", "wigtobigwig" ).map { case exe => exe -> Map("exe" -> "test") }.toMap + + val sample1: Map[String, Any] = Map( + 
"samples" -> Map("sample1" -> Map("libraries" -> Map( + "lib1" -> Map( + "R1" -> inputTouch("1_1_R1.fq"), + "R2" -> inputTouch("1_1_R2.fq") + ) + ) + ))) + + val sample2: Map[String, Any] = Map( + "samples" -> Map("sample3" -> Map("libraries" -> Map( + "lib1" -> Map( + "R1" -> inputTouch("2_1_R1.fq"), + "R2" -> inputTouch("2_1_R2.fq") + ), + "lib2" -> Map( + "R1" -> inputTouch("2_2_R1.fq"), + "R2" -> inputTouch("2_2_R2.fq") + ) + ) + ))) } diff --git a/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala index 89c788abedca5c9961614063c7f024a5f203a2f1..5acfbde9183609323100cb8a7cca405bc7c78a1a 100644 --- a/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala +++ b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTest.scala @@ -15,97 +15,35 @@ package nl.lumc.sasc.biopet.pipelines.gwastest import java.io.File -import java.util -import htsjdk.samtools.reference.FastaSequenceFile import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, Reference } import nl.lumc.sasc.biopet.extensions.Snptest import nl.lumc.sasc.biopet.extensions.gatk.{ CatVariants, SelectVariants } -import nl.lumc.sasc.biopet.extensions.tools.{ GensToVcf, SnptestToVcf } -import nl.lumc.sasc.biopet.pipelines.gwastest.impute.ImputeOutput -import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.extensions.tools.SnptestToVcf import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.intervals.BedRecordList import org.broadinstitute.gatk.queue.QScript -import scala.collection.JavaConversions._ - /** * Created by pjvanthof on 16/03/16. 
*/ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with Reference { def this() = this(null) - import GwasTest._ - - val inputVcf: Option[File] = config("input_vcf") + val inputVcf: File = config("input_vcf") val phenotypeFile: File = config("phenotype_file") - val specsFile: Option[File] = config("imute_specs_file") - - val inputGens: List[GensInput] = if (inputVcf.isDefined) List[GensInput]() - else config("input_gens", default = Nil).asList.map { - case value: Map[String, Any] => - GensInput(new File(value("genotypes").toString), - value.get("info").map(x => new File(x.toString)), - value("contig").toString) - case value: util.LinkedHashMap[String, _] => - GensInput(new File(value.get("genotypes").toString), - value.toMap.get("info").map(x => new File(x.toString)), - value.get("contig").toString) - case _ => throw new IllegalArgumentException - } ++ (specsFile match { - case Some(file) => imputeSpecsToGensInput(file, config("validate_specs", default = true)) - case _ => Nil - }) - override def dictRequired = true override def defaults = Map("snptest" -> Map("genotype_field" -> "GP")) /** Init for pipeline */ def init(): Unit = { - inputGens.foreach { g => - val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary - if (referenceDict.getSequenceIndex(g.contig) == -1) - Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}") - } } /** Pipeline itself */ def biopetScript(): Unit = { - val (vcfFile, chrVcfFiles): (File, Map[String, File]) = inputVcf.map((_, Map[String, File]())).getOrElse { - require(inputGens.nonEmpty, "No vcf file or gens files defined in config") - val outputDirGens = new File(outputDir, "gens_to_vcf") - val cvTotal = new CatVariants(this) - cvTotal.assumeSorted = true - cvTotal.outputFile = new File(outputDirGens, "merge.gens.vcf.gz") - val chrGens = inputGens.groupBy(_.contig).map { - case (contig, gens) => - val cvChr = new CatVariants(this) - 
cvChr.assumeSorted = true - //cvChr.isIntermediate = true - cvChr.outputFile = new File(outputDirGens, s"$contig.merge.gens.vcf.gz") - gens.zipWithIndex.foreach { gen => - val gensToVcf = new GensToVcf(this) - gensToVcf.inputGens = gen._1.genotypes - gensToVcf.inputInfo = gen._1.info - gensToVcf.contig = gen._1.contig - gensToVcf.samplesFile = phenotypeFile - gensToVcf.outputVcf = new File(outputDirGens, gen._1.genotypes.getName + s".${gen._2}.vcf.gz") - gensToVcf.isIntermediate = true - add(gensToVcf) - cvChr.variant :+= gensToVcf.outputVcf - } - add(cvChr) - cvTotal.variant :+= cvChr.outputFile - contig -> cvChr.outputFile - } - add(cvTotal) - (cvTotal.outputFile, chrGens) - } - val snpTests = BedRecordList.fromReference(referenceFasta()) .scatter(config("bin_size", default = 1000000)) .allRecords.map { region => @@ -119,7 +57,7 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R bedFile.deleteOnExit() val sv = new SelectVariants(this) - sv.variant = chrVcfFiles.getOrElse(region.chr, vcfFile) + sv.variant = inputVcf sv.out = new File(regionDir, s"$name.vcf.gz") sv.intervals :+= bedFile sv.isIntermediate = true @@ -147,11 +85,4 @@ class GwasTest(val root: Configurable) extends QScript with BiopetQScript with R } } -object GwasTest extends PipelineCommand { - case class GensInput(genotypes: File, info: Option[File], contig: String) - - def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = { - ImputeOutput.readSpecsFile(specsFile, validate) - .map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome)) - } -} \ No newline at end of file +object GwasTest extends PipelineCommand \ No newline at end of file diff --git a/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2Vcf.scala b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2Vcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..7e47d69cedcc7a644b2d70057d80b2cc0dd06515 
--- /dev/null +++ b/gwas-test/src/main/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2Vcf.scala @@ -0,0 +1,89 @@ +package nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import java.io.File +import java.util + +import htsjdk.samtools.reference.FastaSequenceFile +import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand, Reference } +import nl.lumc.sasc.biopet.extensions.gatk.CatVariants +import nl.lumc.sasc.biopet.extensions.tools.GensToVcf +import nl.lumc.sasc.biopet.pipelines.gwastest.impute.Impute2Vcf.GensInput +import nl.lumc.sasc.biopet.utils.Logging +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +import scala.collection.JavaConversions._ + +/** + * Created by pjvan_thof on 27-5-16. + */ +class Impute2Vcf(val root: Configurable) extends QScript with BiopetQScript with Reference { + def this() = this(null) + + val specsFile: Option[File] = config("imute_specs_file") + + val phenotypeFile: File = config("phenotype_file") + + val inputGens: List[GensInput] = config("input_gens", default = Nil).asList.map { + case value: Map[String, Any] => + GensInput(new File(value("genotypes").toString), + value.get("info").map(x => new File(x.toString)), + value("contig").toString) + case value: util.LinkedHashMap[String, _] => + GensInput(new File(value.get("genotypes").toString), + value.toMap.get("info").map(x => new File(x.toString)), + value.get("contig").toString) + case _ => throw new IllegalArgumentException + } ++ (specsFile match { + case Some(file) => Impute2Vcf.imputeSpecsToGensInput(file, config("validate_specs", default = true)) + case _ => Nil + }) + + /** Init for pipeline */ + def init(): Unit = { + inputGens.foreach { g => + val referenceDict = new FastaSequenceFile(referenceFasta(), true).getSequenceDictionary + if (referenceDict.getSequenceIndex(g.contig) == -1) + Logging.addError(s"Contig '${g.contig}' does not exist on reference: ${referenceFasta()}") + } + } + + /** Pipeline 
itself */ + def biopetScript(): Unit = { + require(inputGens.nonEmpty, "No vcf file or gens files defined in config") + val cvTotal = new CatVariants(this) + cvTotal.assumeSorted = true + cvTotal.outputFile = new File(outputDir, "merge.gens.vcf.gz") + val chrGens = inputGens.groupBy(_.contig).map { + case (contig, gens) => + val cvChr = new CatVariants(this) + cvChr.assumeSorted = true + //cvChr.isIntermediate = true + cvChr.outputFile = new File(outputDir, s"$contig.merge.gens.vcf.gz") + gens.zipWithIndex.foreach { gen => + val gensToVcf = new GensToVcf(this) + gensToVcf.inputGens = gen._1.genotypes + gensToVcf.inputInfo = gen._1.info + gensToVcf.contig = gen._1.contig + gensToVcf.samplesFile = phenotypeFile + gensToVcf.outputVcf = new File(outputDir, gen._1.genotypes.getName + s".${gen._2}.vcf.gz") + gensToVcf.isIntermediate = true + add(gensToVcf) + cvChr.variant :+= gensToVcf.outputVcf + } + add(cvChr) + cvTotal.variant :+= cvChr.outputFile + contig -> cvChr.outputFile + } + add(cvTotal) + } +} + +object Impute2Vcf extends PipelineCommand { + case class GensInput(genotypes: File, info: Option[File], contig: String) + + def imputeSpecsToGensInput(specsFile: File, validate: Boolean = true): List[GensInput] = { + ImputeOutput.readSpecsFile(specsFile, validate) + .map(x => GensInput(x.gens, Some(x.gensInfo), x.chromosome)) + } +} \ No newline at end of file diff --git a/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala b/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala index af3ef492794ecba966b206fc978170f380c73543..52c7e1870dbc7bc472a493d2773448cd76f898b2 100644 --- a/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala +++ b/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/GwasTestTest.scala @@ -18,6 +18,7 @@ import java.io.File import java.nio.file.Paths import com.google.common.io.Files +import nl.lumc.sasc.biopet.utils.Logging import 
nl.lumc.sasc.biopet.utils.config.Config import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers @@ -39,6 +40,7 @@ class GwasTestTest extends TestNGSuite with Matchers { @Test def testFromVcf: Unit = { + Logging.errors.clear() val pipeline = initPipeline(GwasTestTest.config ++ Map("input_vcf" -> GwasTestTest.vcfFile.toString ) @@ -46,34 +48,6 @@ class GwasTestTest extends TestNGSuite with Matchers { pipeline.script() } - @Test - def testFromGens: Unit = { - val pipeline = initPipeline(GwasTestTest.config ++ - Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrQ")) - ) - ) - pipeline.script() - } - - @Test - def testWrongContig: Unit = { - val pipeline = initPipeline(GwasTestTest.config ++ - Map("input_gens" -> List(Map("genotypes" -> GwasTestTest.vcfFile, "contig" -> "chrBla")) - ) - ) - intercept[IllegalStateException] { - pipeline.script() - } - } - - @Test - def testFromSpecs: Unit = { - val pipeline = initPipeline(GwasTestTest.config ++ - Map("imute_specs_file" -> GwasTestTest.resourcePath("/specs/files.specs")) - ) - pipeline.script() - } - @Test def testEmpty: Unit = { val pipeline = initPipeline(GwasTestTest.config) @@ -101,8 +75,8 @@ object GwasTestTest { } val config = Map( - "reference_fasta" -> GwasTestTest.reference.toString, - "phenotype_file" -> GwasTestTest.phenotypeFile.toString, + "reference_fasta" -> reference.toString, + "phenotype_file" -> phenotypeFile.toString, "output_dir" -> outputDir, "snptest" -> Map("exe" -> "test"), "md5sum" -> Map("exe" -> "test"), diff --git a/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2VcfTest.scala b/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2VcfTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..ca42456ac55f2768b48ee54bc1bd1b5e99383559 --- /dev/null +++ b/gwas-test/src/test/scala/nl/lumc/sasc/biopet/pipelines/gwastest/impute/Impute2VcfTest.scala @@ -0,0 +1,89 @@ +package 
nl.lumc.sasc.biopet.pipelines.gwastest.impute + +import java.io.File +import java.nio.file.Paths + +import com.google.common.io.Files +import nl.lumc.sasc.biopet.utils.config.Config +import org.broadinstitute.gatk.queue.QSettings +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvan_thof on 31-5-16. + */ +class Impute2VcfTest extends TestNGSuite with Matchers { + def initPipeline(map: Map[String, Any]): Impute2Vcf = { + new Impute2Vcf { + override def configNamespace = "impute2vcf" + override def globalConfig = new Config(map) + qSettings = new QSettings + qSettings.runName = "test" + } + } + + @Test + def testFromGens: Unit = { + val pipeline = initPipeline(Impute2VcfTest.config ++ + Map("input_gens" -> List(Map("genotypes" -> Impute2VcfTest.vcfFile, "contig" -> "chrQ")) + ) + ) + pipeline.script() + } + + @Test + def testWrongContig: Unit = { + val pipeline = initPipeline(Impute2VcfTest.config ++ + Map("input_gens" -> List(Map("genotypes" -> Impute2VcfTest.vcfFile, "contig" -> "chrBla")) + ) + ) + intercept[IllegalStateException] { + pipeline.script() + } + } + + @Test + def testFromSpecs: Unit = { + val pipeline = initPipeline(Impute2VcfTest.config ++ + Map("imute_specs_file" -> Impute2VcfTest.resourcePath("/specs/files.specs")) + ) + pipeline.script() + } + + @Test + def testEmpty: Unit = { + val pipeline = initPipeline(Impute2VcfTest.config) + intercept[IllegalArgumentException] { + pipeline.script() + } + } + +} + +object Impute2VcfTest { + val vcfFile = File.createTempFile("gwas.", ".vcf") + Files.touch(vcfFile) + vcfFile.deleteOnExit() + + val phenotypeFile = File.createTempFile("gwas.", ".txt") + phenotypeFile.deleteOnExit() + + val outputDir = Files.createTempDir() + outputDir.deleteOnExit() + + val reference = new File(resourcePath("/fake_chrQ.fa")) + + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } + + val config = 
Map( + "reference_fasta" -> reference.toString, + "phenotype_file" -> phenotypeFile.toString, + "output_dir" -> outputDir, + "md5sum" -> Map("exe" -> "test"), + "gatk_jar" -> "test" + ) + +} diff --git a/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 08909d2fa499be636393efee4901e1e01712d0f2..b0a435ff2f9084e46e5b526427be7682b819381e 100644 --- a/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -22,6 +22,7 @@ import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.extensions.bowtie.{ Bowtie2, Bowtie } import nl.lumc.sasc.biopet.extensions.bwa.{ BwaAln, BwaMem, BwaSampe, BwaSamse } import nl.lumc.sasc.biopet.extensions.gmap.Gsnap +import nl.lumc.sasc.biopet.extensions.hisat.Hisat2 import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, ReorderSam, SortSam } import nl.lumc.sasc.biopet.extensions.tools.FastqSplitter import nl.lumc.sasc.biopet.extensions._ @@ -98,7 +99,10 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S /** location of summary file */ def summaryFile = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".summary.json") - override def defaults = Map("gsnap" -> Map("batch" -> 4)) + override def defaults = Map( + "gsnap" -> Map("batch" -> 4), + "star" -> Map("outsamunmapped" -> "Within") + ) override def fixedValues = Map( "gsnap" -> Map("format" -> "sam"), @@ -124,7 +128,7 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S "skip_markduplicates" -> skipMarkduplicates, "aligner" -> aligner, "chunking" -> chunking, - "numberChunks" -> (if (chunking) numberChunks.getOrElse(1) else None) + "number_of_chunks" -> (if (chunking) numberChunks.getOrElse(1) else None) ) ++ (if (root == 
null) Map("reference" -> referenceSummary) else Map()) override def reportClass = { @@ -229,6 +233,7 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S case "bowtie" => addBowtie(R1, R2, outputBam) case "bowtie2" => addBowtie2(R1, R2, outputBam) case "gsnap" => addGsnap(R1, R2, outputBam) + case "hisat2" => addHisat2(R1, R2, outputBam) // TODO: make TopHat here accept multiple input files case "tophat" => addTophat(R1, R2, outputBam) case "stampy" => addStampy(R1, R2, outputBam) @@ -360,6 +365,32 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S ar._2 } + def addHisat2(R1: File, R2: Option[File], output: File): File = { + val hisat2 = new Hisat2(this) + hisat2.R1 = R1 + hisat2.R2 = R2 + hisat2.rgId = Some(readgroupId) + hisat2.rg +:= s"PL:$platform" + hisat2.rg +:= s"PU:$platformUnit" + libId match { + case Some(id) => hisat2.rg +:= s"LB:$id" + case otherwise => ; + } + sampleId match { + case Some(id) => hisat2.rg +:= s"SM:$id" + case otherwise => ; + } + + val sortSam = new SortSam(this) + sortSam.output = output + val pipe = hisat2 | sortSam + pipe.isIntermediate = chunking || !skipMarkduplicates + pipe.threadsCorrection = 1 + add(pipe) + + output + } + def addTophat(R1: File, R2: Option[File], output: File): File = { // TODO: merge mapped and unmapped BAM ~ also dealing with validation errors in the unmapped BAM val tophat = new Tophat(this) diff --git a/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala b/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala index 2c7f805a8b1a52c563484742e4203f37b4f3514c..3a621de8f9f2ac42caa15af6cdf0c3400efab307 100644 --- a/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala +++ b/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala @@ -98,6 +98,7 @@ abstract class AbstractTestMapping(val aligner: String) extends TestNGSuite with val r2 = new 
File(outputDir, "input" + File.separator + "R2.fq") val r1Zipped = new File(outputDir, "input" + File.separator + "R1.fq.gz") val r2Zipped = new File(outputDir, "input" + File.separator + "R2.fq.gz") + val hisat2Index = new File(outputDir, "ref.1.ht2") @BeforeClass def createTempFiles: Unit = { @@ -105,6 +106,7 @@ abstract class AbstractTestMapping(val aligner: String) extends TestNGSuite with Files.touch(r2) Files.touch(r1Zipped) Files.touch(r2Zipped) + Files.touch(hisat2Index) copyFile("ref.fa") copyFile("ref.dict") @@ -125,6 +127,7 @@ abstract class AbstractTestMapping(val aligner: String) extends TestNGSuite with "reference_fasta" -> (outputDir + File.separator + "ref.fa"), "db" -> "test", "bowtie_index" -> (outputDir + File.separator + "ref"), + "hisat2_index" -> (outputDir + File.separator + "ref"), "fastqc" -> Map("exe" -> "test"), "seqtk" -> Map("exe" -> "test"), "gsnap" -> Map("exe" -> "test"), @@ -135,6 +138,7 @@ abstract class AbstractTestMapping(val aligner: String) extends TestNGSuite with "star" -> Map("exe" -> "test"), "bowtie" -> Map("exe" -> "test"), "bowtie2" -> Map("exe" -> "test"), + "hisat2" -> Map("exe" -> "test"), "stampy" -> Map("exe" -> "test", "genome" -> "test", "hash" -> "test"), "samtools" -> Map("exe" -> "test"), "kraken" -> Map("exe" -> "test", "db" -> "test"), @@ -154,6 +158,7 @@ class MappingStarTest extends AbstractTestMapping("star") class MappingStar2PassTest extends AbstractTestMapping("star-2pass") class MappingBowtieTest extends AbstractTestMapping("bowtie") class MappingBowtie2Test extends AbstractTestMapping("bowtie2") +class MappingHisat2Test extends AbstractTestMapping("hisat2") class MappingStampyTest extends AbstractTestMapping("stampy") class MappingGsnapTest extends AbstractTestMapping("gsnap") class MappingTophatTest extends AbstractTestMapping("tophat") diff --git a/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMappingTest.scala 
b/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMappingTest.scala index 525005fdde51037050ea079cff46b5313f3c0e15..f036640ebb8700232b0b6f057cc45f48b64e3a36 100644 --- a/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMappingTest.scala +++ b/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMappingTest.scala @@ -19,7 +19,7 @@ import java.io.{ File, FileOutputStream } import com.google.common.io.Files import nl.lumc.sasc.biopet.extensions.kraken.Kraken import nl.lumc.sasc.biopet.extensions.picard.{ MarkDuplicates, MergeSamFiles } -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Config import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers @@ -72,9 +72,10 @@ trait MultisampleMappingTestTrait extends TestNGSuite with Matchers { } if (!sample1 && !sample2 && !sample3 && !sample4) { // When no samples - intercept[IllegalArgumentException] { + intercept[IllegalStateException] { initPipeline(map).script() } + Logging.errors.clear() } else if (sample4 && !bamToFastq && !correctReadgroups) { intercept[IllegalStateException] { initPipeline(map).script() diff --git a/mkdocs.yml b/mkdocs.yml index e6475c8369d1d85ab155307bd672fc40e68ff9c6..3771882ece0bc2c8e34f5d7ee7ad15404c6460ec 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,7 @@ pages: - TinyCap (smallRNA): 'pipelines/tinycap.md' - Toucan (Annotation): 'pipelines/toucan.md' - Tools: + - AnnotateVcfWithBed: 'tools/AnnotateVcfWithBed.md' - SamplesTsvToJson: 'tools/SamplesTsvToJson.md' - BedToInterval: 'tools/bedtointerval.md' - BastyGenerateFasta: 'tools/BastyGenerateFasta.md' @@ -36,6 +37,7 @@ pages: - VcfFilter: 'tools/VcfFilter.md' - VepNormalizer: 'tools/VepNormalizer.md' - Release notes: + - 0.7.0: 'releasenotes/release_notes_0.7.0.md' - 0.6.0: 'releasenotes/release_notes_0.6.0.md' - 0.5.0: 'releasenotes/release_notes_0.5.0.md' - 
0.4.0: 'releasenotes/release_notes_0.4.0.md' @@ -45,9 +47,10 @@ pages: - Developer: - Getting Started: 'developer/getting-started.md' - Code Style: 'developer/code-style.md' - - Example pipeline: 'developer/example-pipeline.md' - - Example tool: 'developer/example-tool.md' - - Example pipeable: 'developer/example-pipeable.md' + - Example Dependencies: 'developer/example-depends.md' + - Example Pipeline: 'developer/example-pipeline.md' + - Example Tool: 'developer/example-tool.md' + - Example Pipeable: 'developer/example-pipeable.md' - Scala docs: 'developer/scaladocs.md' #- ['developing/Setup.md', 'Developing', 'Setting up your local development environment'] #theme: readthedocs diff --git a/pom.xml b/pom.xml index 0ed2d1a87482b85067f704a61d70330605016fa2..aaeccd4a8bd94e6c9c1afa17d6fe4b88572fed86 100644 --- a/pom.xml +++ b/pom.xml @@ -81,6 +81,7 @@ <artifactId>maven-surefire-plugin</artifactId> <version>2.18.1</version> <configuration> + <reuseForks>false</reuseForks> <forkCount>1C</forkCount> <argLine>-Xmx300m</argLine> <workingDirectory>${project.build.directory}</workingDirectory> diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala index 5d384bb997587e168349bdef612ecae009119223..6077e6821fa30d776a910dd4914ec131355fbf87 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerAllele.scala @@ -20,6 +20,7 @@ package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers import nl.lumc.sasc.biopet.extensions.gatk +import nl.lumc.sasc.biopet.utils.Logging import nl.lumc.sasc.biopet.utils.config.Configurable /** Allele mode for Haplotypecaller */ @@ -27,9 +28,11 @@ class HaplotypeCallerAllele(val root: Configurable) extends Variantcaller { val name =
"haplotypecaller_allele" protected def defaultPrio = 5 + lazy val alleles: File = config("input_alleles") + def biopetScript() { val hc = gatk.HaplotypeCaller(this, inputBams.values.toList, outputFile) - hc.alleles = config("input_alleles") + hc.alleles = Some(alleles) hc.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES") add(hc) } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala index 6568c09d4d44e968b2f232a095cefa8591e4eba1..2e4d5c8222787566334d6a2298d23a74ed9cb43a 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaplotypeCallerGvcf.scala @@ -34,14 +34,21 @@ class HaplotypeCallerGvcf(val root: Configurable) extends Variantcaller { def getGvcfs = gVcfFiles + override def fixedValues = Map("haplotypecaller" -> Map("emitRefConfidence" -> "GVCF")) + + override def defaults = Map("haplotypecaller" -> Map( + "variant_index_type" -> "LINEAR", + "variant_index_parameter" -> 128000) + ) + def biopetScript() { - val gvcfFiles = for ((sample, inputBam) <- inputBams) yield { - val hc = gatk.HaplotypeCaller.gvcf(this, inputBam, new File(outputDir, sample + ".gvcf.vcf.gz")) + gVcfFiles = for ((sample, inputBam) <- inputBams) yield { + val hc = gatk.HaplotypeCaller(this, List(inputBam), new File(outputDir, sample + ".gvcf.vcf.gz")) add(hc) sample -> hc.out } - val genotypeGVCFs = gatk.GenotypeGVCFs(this, gvcfFiles.values.toList, outputFile) + val genotypeGVCFs = gatk.GenotypeGVCFs(this, gVcfFiles.values.toList, outputFile) add(genotypeGVCFs) } } diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala index 
a1086ad855746959a53ad4c74b27251fea602d76..949c5f392133baca88c55ccf5ee11f48f86c765e 100644 --- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala +++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/UnifiedGenotyperAllele.scala @@ -27,9 +27,11 @@ class UnifiedGenotyperAllele(val root: Configurable) extends Variantcaller { val name = "unifiedgenotyper_allele" protected def defaultPrio = 9 + lazy val alleles: File = config("input_alleles") + def biopetScript() { val ug = gatk.UnifiedGenotyper(this, inputBams.values.toList, outputFile) - ug.alleles = config("input_alleles") + ug.alleles = Some(alleles) ug.genotyping_mode = Some("GENOTYPE_GIVEN_ALLELES") add(ug) } diff --git a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala index 56881f2dd5f2d48ef5cda84b87216106c57ce1ba..ef62700ca687b0153c21fa75510c867e06b7c0b4 100644 --- a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala +++ b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTest.scala @@ -20,7 +20,7 @@ import com.google.common.io.Files import nl.lumc.sasc.biopet.extensions.gatk.{ BaseRecalibrator, IndelRealigner, PrintReads, RealignerTargetCreator } import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates import nl.lumc.sasc.biopet.extensions.tools.VcfStats -import nl.lumc.sasc.biopet.utils.ConfigUtils +import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging } import nl.lumc.sasc.biopet.utils.config.Config import org.broadinstitute.gatk.queue.QSettings import org.scalatest.Matchers @@ -86,6 +86,7 @@ trait ShivaTestTrait extends TestNGSuite with Matchers { intercept[IllegalArgumentException] { initPipeline(map).script() } + Logging.errors.clear() } else { val pipeline = initPipeline(map) pipeline.script() diff --git 
a/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaploTypeCallerGvcfTest.scala b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaploTypeCallerGvcfTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..f07376157878f0c0807f13fca1d1410886721890 --- /dev/null +++ b/shiva/src/test/scala/nl/lumc/sasc/biopet/pipelines/shiva/variantcallers/HaploTypeCallerGvcfTest.scala @@ -0,0 +1,33 @@ +package nl.lumc.sasc.biopet.pipelines.shiva.variantcallers + +import java.io.File + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by Sander Bollen on 13-7-16. + */ +class HaploTypeCallerGvcfTest extends TestNGSuite with Matchers { + + @Test + def testGvcfFiles = { + val samples = List("sample01", "sample02", "sample03") + val hc = new HaplotypeCallerGvcf(null) + hc.inputBams = createInputMap(samples) + hc.biopetScript() + + hc.getGvcfs.size shouldBe 3 + hc.getGvcfs.keys.toList.sorted shouldEqual samples.sorted + } + + def createInputMap(samples: List[String]): Map[String, File] = { + samples map { x => + val file = File.createTempFile(x, ".bam") + file.deleteOnExit() + x -> file + } toMap + } + +} diff --git a/tinycap/src/main/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCap.scala b/tinycap/src/main/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCap.scala index 1b4f80ee47dba29419792a9673673bf224bbfdbe..5911b88d77d2bce63372949e2efd805d03b88a39 100644 --- a/tinycap/src/main/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCap.scala +++ b/tinycap/src/main/scala/nl/lumc/sasc/biopet/pipelines/tinycap/TinyCap.scala @@ -64,22 +64,24 @@ class TinyCap(val root: Configurable) extends QScript "bowtie" -> Map( "chunkmbs" -> 256, "seedmms" -> 3, - "seedlen" -> 25, - "k" -> 5, - "best" -> true + "seedlen" -> 28, + "k" -> 3, + "best" -> true, + "strata" -> true ), "sickle" -> Map( "lengthThreshold" -> 15 ), "fastqc" -> Map( - 
"sensitiveAdapterSearch" -> true + "sensitiveAdapterSearch" -> false ), "cutadapt" -> Map( - "error_rate" -> 0.2, + "error_rate" -> 0.1, "minimum_length" -> 15, "q" -> 30, - "default_clip_mode" -> "both", - "times" -> 2 + "default_clip_mode" -> "3", + "times" -> 1, + "ignore_fastqc_adapters" -> true ) )