Commit 86e4be83 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'develop' into feature-small_rna

parents eff61f75 09da7b13
......@@ -47,16 +47,24 @@ For BAM files as input one should use a config like this:
``` yaml
samples:
Sample_ID_1:
tags:
gender: male
father: sampleNameFather
mother: sampleNameMother
libraries:
Lib_ID_1:
tags:
key: value
bam: MyFirst.bam
Lib_ID_2:
bam: MySecond.bam
```
Note that there is a tool called [SamplesTsvToJson](../tools/SamplesTsvToJson.md); it enables a user to generate the sample config without any chance of creating a wrongly formatted JSON file.
#### Tags
In the `tags` key inside a sample or library, users can supply tags that belong to that sample or library. These tags will be automatically parsed into the summary of a pipeline.
### The settings config
The settings config enables a user to alter almost all settings available in the tools used for a given pipeline.
......
......@@ -21,12 +21,18 @@ Usage: SamplesTsvToJson [options]
Print version
-i <file> | --inputFiles <file>
Input must be a tsv file, first line is seen as header and must at least have a 'sample' column, 'library' column is optional, multiple files allowed
-t <file> | --tagFiles <file>
-o <file> | --outputFile <file>
~~~
The tool is designed in such a way that a user can provide a TAB separated file (TSV) with sample specific properties, and those properties will be parsed by the tool as well.
For example: if a user wants to record certain properties, e.g. which treatment a sample got, then the user should provide an extra column called treatment, and the
resulting JSON file will contain those properties as well. The order of columns does not matter.
Tag files work the same way, except that the values are placed under the key `tags`.
#### Example
~~~ json
......
......@@ -19,8 +19,8 @@ import java.io.File
import htsjdk.samtools.reference.IndexedFastaSequenceFile
import nl.lumc.sasc.biopet.core.summary.{ SummaryQScript, Summarizable }
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, Logging }
import nl.lumc.sasc.biopet.utils.config.{ Config, Configurable }
import scala.collection.JavaConversions._
......@@ -69,16 +69,40 @@ trait Reference extends Configurable {
/** Returns the fasta file */
def referenceFasta(): File = {
val file: File = config("reference_fasta")
checkFasta(file)
val dict = new File(file.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta").stripSuffix(".fna") + ".dict")
val fai = new File(file.getAbsolutePath + ".fai")
this match {
case c: BiopetCommandLineFunction => c.deps :::= dict :: fai :: Nil
case _ =>
if (config.contains("reference_fasta")) {
checkFasta(file)
val dict = new File(file.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta").stripSuffix(".fna") + ".dict")
val fai = new File(file.getAbsolutePath + ".fai")
this match {
case c: BiopetCommandLineFunction => c.deps :::= dict :: fai :: Nil
case _ =>
}
} else {
val defaults = ConfigUtils.mergeMaps(this.defaults, this.internalDefaults)
def getReferences(map: Map[String, Any]): Set[(String, String)] = (for (
(species, species_content: Map[String, Any]) <- map.getOrElse("references", Map[String, Any]()).asInstanceOf[Map[String, Any]].toList;
(reference_name, _) <- species_content.toList
) yield (species, reference_name)).toSet
val references = getReferences(defaults) ++ getReferences(Config.global.map)
if (!references.contains((referenceSpecies, referenceName))) {
val buffer = new StringBuilder()
if (references.exists(_._1 == referenceSpecies)) {
buffer.append(s"Reference: '$referenceName' does not exist in config for species: '$referenceSpecies'")
// Header line for the list of reference names known for this species.
buffer.append(s"\nReferences found for species '$referenceSpecies':")
references.filter(_._1 == referenceSpecies).foreach(x => buffer.append("\n - " + x._2))
} else {
buffer.append(s"Species: '$referenceSpecies' does not exist in config")
if (references.nonEmpty) buffer.append("\n References available in config (species -> reference_name):")
else buffer.append("\n No references found in user or global config")
references.toList.sorted.foreach(x => buffer.append(s"\n - ${x._1} -> ${x._2}"))
}
Logging.addError(buffer.toString)
}
}
file
}
......@@ -117,6 +141,7 @@ object Reference {
/**
* Raise an exception when given fasta file has no fai file
*
* @param fastaFile Fasta file
*/
def requireFai(fastaFile: File): Unit = {
......@@ -132,6 +157,7 @@ object Reference {
/**
* Raise an exception when given fasta file has no dict file
*
* @param fastaFile Fasta file
*/
def requireDict(fastaFile: File): Unit = {
......
......@@ -24,7 +24,7 @@ trait PythonCommandLineFunction extends BiopetCommandLineFunction {
@Input(doc = "Python script", required = false)
var python_script: File = _
executable = config("exe", default = "python", submodule = "python")
executable = config("exe", default = "python", submodule = "python", freeVar = false)
protected var python_script_name: String = _
......
......@@ -112,11 +112,15 @@ class WriteSummary(val root: Configurable) extends InProcessFunction with Config
Map("samples" -> q.samples.map {
case (sampleName, sample) =>
sampleName -> Map(
qscript.summaryName -> Map("settings" -> sample.summarySettings),
qscript.summaryName -> Map(
"settings" -> sample.summarySettings,
"tags" -> sample.sampleTags),
"libraries" -> sample.libraries.map {
case (libName, lib) =>
libName -> Map(
qscript.summaryName -> Map("settings" -> lib.summarySettings)
qscript.summaryName -> Map(
"settings" -> lib.summarySettings,
"tags" -> lib.libTags)
)
}
)
......
......@@ -197,7 +197,7 @@ class WriteSummaryTest extends TestNGSuite with Matchers {
object WriteSummaryTest {
def makeWriter(root: Configurable, c: Map[String, Any] = Map()) = new WriteSummary(root) {
override def globalConfig = new Config(c)
override def globalConfig = new Config(c + ("exe" -> "test"))
override def outputs = Seq()
override def inputs = Seq()
qSettings = new QSettings {
......@@ -238,7 +238,7 @@ object WriteSummaryTest {
libId = l
summaryName = "test"
outputDir = new File(".").getAbsoluteFile
override def globalConfig = new Config(c)
override def globalConfig = new Config(c + ("exe" -> "test"))
def summarySettings: Map[String, Any] = settings
def summaryFiles: Map[String, File] = files
val tempFile = File.createTempFile("summary", ".json")
......@@ -256,7 +256,7 @@ object WriteSummaryTest {
new MultiSampleQScript with QScript {
summaryName = "test"
outputDir = new File(".").getAbsoluteFile
override def globalConfig = new Config(c)
override def globalConfig = new Config(c + ("exe" -> "test"))
def summarySettings: Map[String, Any] = settings
def summaryFiles: Map[String, File] = files
val tempFile = File.createTempFile("summary", ".json")
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
from __future__ import print_function
__author__="Peter van 't Hof"
import sys
import re
# IUPAC ambiguity codes (upper case only); each one is replaced by "N".
upacPatern = re.compile(r'[RYKMSWBDHV]')


def fix_line(line):
    """Return one tab-separated line with IUPAC codes in the third column masked.

    The line is stripped of surrounding whitespace and split on tabs. If it has
    at least three columns, every ambiguity code in the third column (index 2,
    the reference base in samtools pileup output) is replaced by "N". Shorter
    lines are passed through unchanged. The result has no trailing newline.
    """
    l = line.strip().split("\t")
    # The guard and the index must agree: column index 2 needs len >= 3.
    if len(l) >= 3:
        l[2] = upacPatern.sub("N", l[2])
    return "\t".join(l)


if __name__ == "__main__":
    # Filter mode: read lines from stdin, write the fixed lines to stdout.
    for line in sys.stdin:
        print(fix_line(line))
......@@ -33,7 +33,9 @@ if __name__ == "__main__":
"""
for line in sys.stdin:
l = line.strip().split("\t")
if l[3] == "0":
l[2] = upacPatern.sub("N", l[2])
if len(l) < 4 or l[3] == "0":
# no alignment to this position
print("\t".join(map(str, l)))
continue
......@@ -49,5 +51,4 @@ if __name__ == "__main__":
if new_size == 0:
l[5] = ""
l[2] = upacPatern.sub("N", l[2])
print("\t".join(map(str, l)))
......@@ -114,7 +114,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu
var fasta: Option[String] = config("fasta")
var sift: Option[String] = config("sift")
var polyphen: Option[String] = config("polyphen")
var custom: Option[String] = config("custom")
var custom: List[String] = config("custom", default = Nil)
var plugin: List[String] = config("plugin", default = Nil)
var individual: Option[String] = config("individual")
var fields: Option[String] = config("fields")
......@@ -227,7 +227,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu
optional("--fasta", fasta) +
optional("--sift", sift) +
optional("--polyphen", polyphen) +
optional("--custom", custom) +
repeat("--custom", custom) +
repeat("--plugin", plugin) +
optional("--individual", individual) +
optional("--fields", fields) +
......
......@@ -69,8 +69,8 @@ class Kraken(val root: Configurable) extends BiopetCommandLineFunction with Vers
optional("--threads", nCoresRequest) +
conditional(quick, "--quick") +
optional("--min_hits", minHits) +
optional("--unclassified-out ", unclassified_out.get) +
optional("--classified-out ", classified_out.get) +
optional("--unclassified-out ", unclassified_out) +
optional("--classified-out ", classified_out) +
required("--output", output) +
conditional(preLoad, "--preload") +
conditional(paired, "--paired") +
......
......@@ -65,8 +65,10 @@ class AddOrReplaceReadGroups(val root: Configurable) extends Picard {
/** Returns command to execute */
override def cmdLine = super.cmdLine +
required("INPUT=", input, spaceSeparated = false) +
required("OUTPUT=", output, spaceSeparated = false) +
(if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false)
else required("INPUT=", input, spaceSeparated = false)) +
(if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false)
else required("OUTPUT=", output, spaceSeparated = false)) +
required("SORT_ORDER=", sortOrder, spaceSeparated = false) +
required("RGID=", RGID, spaceSeparated = false) +
required("RGLB=", RGLB, spaceSeparated = false) +
......
......@@ -34,6 +34,9 @@ class ReorderSam(val root: Configurable) extends Picard with Reference {
@Output(doc = "Output SAM or BAM file", required = true)
var output: File = null
// Index path derived from the output path: a trailing ".bam" (if present) is stripped and ".bai" is appended.
// Lazy so that `output` can be assigned after construction and before first use.
@Output(doc = "Index (.bai) file of the output bam file", required = true)
lazy val outputIndex: File = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
@Argument(doc = "Allow incomplete dict concordance", required = false)
var allowIncompleteDictConcordance: Boolean = config("allow_incomplete_dict_concordance", default = false)
......@@ -49,6 +52,8 @@ class ReorderSam(val root: Configurable) extends Picard with Reference {
conditional(allowIncompleteDictConcordance, "ALLOW_INCOMPLETE_DICT_CONCORDANCE=TRUE") +
conditional(allowContigLengthDiscordance, "ALLOW_CONTIG_LENGTH_DISCORDANCE=TRUE") +
required("REFERENCE=", reference, spaceSeparated = false) +
required("INPUT=", input, spaceSeparated = false) +
required("OUTPUT=", output, spaceSeparated = false)
(if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false)
else required("INPUT=", input, spaceSeparated = false)) +
(if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false)
else required("OUTPUT=", output, spaceSeparated = false))
}
package nl.lumc.sasc.biopet.extensions.samtools
import nl.lumc.sasc.biopet.core.extensions.PythonCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
/**
 * Command-line wrapper around the Python script `fix_iupac_mpileup.py`.
 *
 * The script is registered by file name together with the path
 * `/nl/lumc/sasc/biopet/extensions/samtools/`; the command line is whatever
 * `getPythonCommand` returns, with nothing appended by this class.
 * NOTE(review): presumably the path is a classpath resource directory and the script is
 * meant to be piped after samtools mpileup — confirm against PythonCommandLineFunction and callers.
 *
 * Created by sajvanderzeeuw on 19-1-16.
 *
 * @param root the Configurable handed to the constructor
 */
class FixMpileup(val root: Configurable) extends PythonCommandLineFunction {
// Register which script this function runs and where it is looked up.
setPythonScript("fix_iupac_mpileup.py", "/nl/lumc/sasc/biopet/extensions/samtools/")
// The full command is delegated to the inherited python command builder.
def cmdLine = getPythonCommand
}
......@@ -37,6 +37,7 @@ class SamtoolsMpileup(val root: Configurable) extends Samtools with Reference {
var disableBaq: Boolean = config("disable_baq", default = false)
var u: Boolean = config("u", default = false)
// Controls the "-v" flag in cmdLine; read from its own config key "v", independent of "u".
var v: Boolean = config("v", default = false)
var minMapQuality: Option[Int] = config("min_map_quality")
var minBaseQuality: Option[Int] = config("min_base_quality")
var depth: Option[Int] = config("depth")
......@@ -57,6 +58,7 @@ class SamtoolsMpileup(val root: Configurable) extends Samtools with Reference {
conditional(outputMappingQuality, "-s") +
conditional(disableBaq, "-B") +
conditional(u, "-u") +
conditional(v, "-v") +
(if (outputAsStsout) "" else required("-o", output)) +
(if (inputAsStdin) "-" else repeat(input))
}
......
......@@ -44,6 +44,24 @@
<artifactId>BiopetTools</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.9.5</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.10</artifactId>
<version>2.2.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -39,6 +39,20 @@ class VcfFilter(val root: Configurable) extends ToolCommandFunction {
var minSamplesPass: Option[Int] = config("min_samples_pass")
var minGenomeQuality: Option[Int] = config("min_genome_quality")
var filterRefCalls: Boolean = config("filter_ref_calls", default = false)
// The options below are not read from the config: they start unset (None / Nil) and
// have to be assigned by the caller. Each one is passed to the matching
// "--<name>" argument in cmdLine.
// NOTE(review): presumably unset values produce no argument at all — confirm against optional/repeat.

// Optional file passed as --invertedOutputVcf.
var invertedOutputVcf: Option[File] = None
// Single-valued string options, one command-line argument each.
var resToDom: Option[String] = None
var trioCompound: Option[String] = None
var deNovoInSample: Option[String] = None
var deNovoTrio: Option[String] = None
var trioLossOfHet: Option[String] = None
// Multi-valued options; every list element is passed with its own argument.
var mustHaveVariant: List[String] = Nil
var calledIn: List[String] = Nil
var mustHaveGenotype: List[String] = Nil
var diffGenotype: List[String] = Nil
var filterHetVarToHomVar: List[String] = Nil
// Optional numeric value passed as --minQualScore.
var minQualScore: Option[Double] = None
// IDs given inline (--id, repeatable) or through a file (--idFile).
var id: List[String] = Nil
var idFile: Option[File] = None
override def defaultCoreMemory = 3.0
......@@ -55,5 +69,19 @@ class VcfFilter(val root: Configurable) extends ToolCommandFunction {
optional("--minAlternateDepth", minAlternateDepth) +
optional("--minSamplesPass", minSamplesPass) +
optional("--minGenomeQuality", minGenomeQuality) +
conditional(filterRefCalls, "--filterRefCalls")
conditional(filterRefCalls, "--filterRefCalls") +
optional("--invertedOutputVcf", invertedOutputVcf) +
optional("--resToDom", resToDom) +
optional("--trioCompound", trioCompound) +
optional("--deNovoInSample", deNovoInSample) +
optional("--deNovoTrio", deNovoTrio) +
optional("--trioLossOfHet", trioLossOfHet) +
repeat("--mustHaveVariant", mustHaveVariant) +
repeat("--calledIn", calledIn) +
repeat("--mustHaveGenotype", mustHaveGenotype) +
repeat("--diffGenotype", diffGenotype) +
repeat("--filterHetVarToHomVar", filterHetVarToHomVar) +
optional("--minQualScore", minQualScore) +
repeat("--id", id) +
optional("--idFile", idFile)
}
import java.io.File
import nl.lumc.sasc.biopet.extensions.tools.VcfFilter
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.{ DataProvider, Test }
/**
* Created by ahbbollen on 2-3-16.
*/
class VcfFilterTest extends TestNGSuite with Matchers {
/**
 * Normalises a generated command line so it can be compared with plain strings.
 *
 * Every single quote is removed, each run of spaces is collapsed to one space,
 * and leading/trailing whitespace is trimmed.
 *
 * @param s raw command line
 * @return the normalised command line
 */
def cmd(s: String): String = {
  // replaceAll takes a regex: " +" matches one or more consecutive spaces.
  s.replace("'", "").replaceAll(" +", " ").trim
}
/** After `beforeGraph()` the output index path is expected to be the output VCF path plus ".tbi". */
@Test
def testBeforeGraph() = {
  val filter = new VcfFilter(null)

  // Two throw-away files that are removed when the JVM exits.
  val inputFile = File.createTempFile("vcfFilter", ".vcf.gz")
  val outputFile = File.createTempFile("vcfFilter", ".vcf.gz")
  Seq(inputFile, outputFile).foreach(_.deleteOnExit())

  filter.inputVcf = inputFile
  filter.outputVcf = outputFile
  filter.beforeGraph()

  val expectedIndexPath = outputFile.getAbsolutePath + ".tbi"
  filter.outputVcfIndex.getAbsolutePath shouldBe expectedIndexPath
}
/**
 * TestNG data provider named "functions".
 *
 * Each entry is a zero-argument closure that calls `testCommand` with exactly one
 * option set and all others left at their defaults. The closures are not run here,
 * so temp files are only created when a closure is invoked.
 */
@DataProvider(name = "functions")
def functions = {
Array(
() => testCommand(minSampleDepth = Some(2)),
() => testCommand(minTotalDepth = Some(2)),
() => testCommand(minAlternateDepth = Some(2)),
() => testCommand(minSamplesPass = Some(2)),
() => testCommand(minGenomeQuality = Some(50)),
() => testCommand(filterRefCalls = true),
() => testCommand(invertedOutputVcf = Some(File.createTempFile("vcfFilter", ".vcf"))),
() => testCommand(resToDom = Some("dummy")),
() => testCommand(trioCompound = Some("dummy")),
() => testCommand(deNovoInSample = Some("dummy")),
() => testCommand(deNovoTrio = Some("dummy")),
() => testCommand(trioLossOfHet = Some("dummy")),
() => testCommand(mustHaveVariant = List("sample1", "sample2")),
() => testCommand(calledIn = List("sample1", "sample2")),
() => testCommand(mustHaveGenotype = List("sample1:HET", "sample2:HET")),
() => testCommand(diffGenotype = List("sample1:sample2", "sample2:sample3")),
() => testCommand(minQualScore = Some(50.0)),
() => testCommand(filterHetVarToHomVar = List("dummy")),
() => testCommand(id = List("rs01", "rs02")),
() => testCommand(idFile = Some(File.createTempFile("vcfFilter", ".txt")))
// Wrap every closure in its own one-element array: one row per closure.
).map(Array(_))
}
/** Runs a single closure handed in by the "functions" data provider. */
@Test(dataProvider = "functions")
def executer(function0: Function0[Unit]): Unit = {
  function0.apply()
}
protected def testCommand(minSampleDepth: Option[Int] = None,
minTotalDepth: Option[Int] = None,
minAlternateDepth: Option[Int] = None,
minSamplesPass: Option[Int] = None,
minGenomeQuality: Option[Int] = None,
filterRefCalls: Boolean = false,
invertedOutputVcf: Option[File] = None,
resToDom: Option[String] = None,
trioCompound: Option[String] = None,
deNovoInSample: Option[String] = None,
deNovoTrio: Option[String] = None,
trioLossOfHet: Option[String] = None,
mustHaveVariant: List[String] = Nil,
calledIn: List[String] = Nil,
mustHaveGenotype: List[String] = Nil,
diffGenotype: List[String] = Nil,
filterHetVarToHomVar: List[String] = Nil,
minQualScore: Option[Double] = None,
id: List[String] = Nil,
idFile: Option[File] = None): Unit = {
val vcfFilter = new VcfFilter(null)
vcfFilter.minSampleDepth = minSampleDepth
vcfFilter.minTotalDepth = minTotalDepth
vcfFilter.minAlternateDepth = minAlternateDepth
vcfFilter.minSamplesPass = minSamplesPass
vcfFilter.minGenomeQuality = minGenomeQuality
vcfFilter.filterRefCalls = filterRefCalls
vcfFilter.invertedOutputVcf = invertedOutputVcf
vcfFilter.resToDom = resToDom
vcfFilter.trioCompound = trioCompound
vcfFilter.deNovoInSample = deNovoInSample
vcfFilter.deNovoTrio = deNovoTrio
vcfFilter.trioLossOfHet = trioLossOfHet
vcfFilter.mustHaveVariant = mustHaveVariant
vcfFilter.calledIn = calledIn
vcfFilter.mustHaveGenotype = mustHaveGenotype
vcfFilter.diffGenotype = diffGenotype
vcfFilter.filterHetVarToHomVar = filterHetVarToHomVar
vcfFilter.minQualScore = minQualScore
vcfFilter.id = id
vcfFilter.idFile = idFile
val command = cmd(vcfFilter.cmdLine)
var cmdString: List[String] = Nil
if (minSampleDepth.isDefined) {
cmdString = "--minSampleDepth " + minSampleDepth.getOrElse("") :: cmdString
}
if (minTotalDepth.isDefined) {
cmdString = "--minTotalDepth " + minTotalDepth.getOrElse("") :: cmdString
}
if (minAlternateDepth.isDefined) {
cmdString = "--minAlternateDepth " + minAlternateDepth.getOrElse("") :: cmdString
}
if (minSamplesPass.isDefined) {
cmdString = "--minSamplesPass " + minSamplesPass.getOrElse("") :: cmdString
}
if (minGenomeQuality.isDefined) {
cmdString = "--minGenomeQuality " + minGenomeQuality.getOrElse("") :: cmdString
}
if (filterRefCalls) {
cmdString = "--filterRefCalls" :: cmdString
}
if (invertedOutputVcf.isDefined) {
cmdString = "--invertedOutputVcf " + invertedOutputVcf.getOrElse(new File("")).getAbsolutePath :: cmdString
}
if (resToDom.isDefined) {
cmdString = "--resToDom " + resToDom.getOrElse("") :: cmdString
}
if (trioCompound.isDefined) {
cmdString = "--trioCompound " + trioCompound.getOrElse("") :: cmdString
}
if (deNovoInSample.isDefined) {
cmdString = "--deNovoInSample " + deNovoInSample.getOrElse("") :: cmdString
}
if (deNovoTrio.isDefined) {
cmdString = "--deNovoTrio " + deNovoTrio.getOrElse("") :: cmdString
}
if (trioLossOfHet.isDefined) {
cmdString = "--trioLossOfHet " + trioLossOfHet.getOrElse("") :: cmdString
}
if (mustHaveVariant.nonEmpty) {
cmdString = mustHaveVariant.map(x => "--mustHaveVariant " + x) ::: cmdString
}
if (calledIn.nonEmpty) {
cmdString = calledIn.map(x => "--calledIn " + x) ::: cmdString
}
if (mustHaveGenotype.nonEmpty) {
cmdString = mustHaveGenotype.map(x => "--mustHaveGenotype " + x) ::: cmdString
}
if (diffGenotype.nonEmpty) {