Commit 25a8db4f authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'develop' into feature-vcf_stats

parents 3ce22c45 74077f92
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="EntryPointsManager">
<entry_points version="2.0" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
......
......@@ -41,7 +41,7 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
var outputSnps: FastaOutput = _
protected def addJobs(): Unit = {
addLibsJobs()
addPerLibJobs()
output = addGenerateFasta(sampleId, sampleDir)
outputSnps = addGenerateFasta(sampleId, sampleDir, snpsOnly = true)
}
......@@ -56,11 +56,13 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
gatkPipeline.biopetScript
addAll(gatkPipeline.functions)
addSamplesJobs()
}
def addMultiSampleJobs(): Unit = {
val refVariants = addGenerateFasta(null, outputDir + "reference/", outputName = "reference")
val refVariantSnps = addGenerateFasta(null, outputDir + "reference/", outputName = "reference", snpsOnly = true)
addSamplesJobs()
val catVariants = Cat(this, refVariants.variants :: samples.map(_._2.output.variants).toList, outputDir + "fastas/variant.fasta")
add(catVariants)
val catVariantsSnps = Cat(this, refVariantSnps.variants :: samples.map(_._2.outputSnps.variants).toList, outputDir + "fastas/variant.snps_only.fasta")
......@@ -122,13 +124,14 @@ class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
val gubbins = new RunGubbins(this)
gubbins.fastafile = concensusVariants
gubbins.startingTree = raxmlBi.getBipartitionsFile
gubbins.startingTree = Some(raxmlBi.getBipartitionsFile)
gubbins.outputDirectory = outputDir + dirSufixGubbins
add(gubbins)
}
addTreeJobs(catVariantsSnps.output, catConsensusVariantsSnps.output, outputDir + "trees" + File.separator + "snps_only", "snps_only")
addTreeJobs(catVariants.output, catConsensusVariants.output, outputDir + "trees" + File.separator + "snps_indels", "snps_indels")
}
def addGenerateFasta(sampleName: String, outputDir: String, outputName: String = null,
......
......@@ -12,7 +12,7 @@ class BaseRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.q
memoryLimit = Option(4)
override val defaultVmem = "8G"
if (config.contains("scattercount")) scatterCount = config("scattercount")
if (config.contains("scattercount")) scatterCount = config("scattercount", default = 1)
if (config.contains("dbsnp")) knownSites :+= new File(config("dbsnp").asString)
if (config.contains("known_sites")) knownSites :+= new File(config("known_sites").asString)
}
......
......@@ -19,7 +19,7 @@ trait GatkGeneral extends CommandLineGATK with BiopetJavaCommandLineFunction {
if (config.contains("intervals")) intervals = config("intervals").asFileList
if (config.contains("exclude_intervals")) excludeIntervals = config("exclude_intervals").asFileList
reference_sequence = config("reference")
gatk_key = config("gatk_key")
reference_sequence = config("reference", required = true)
if (config.contains("gatk_key")) gatk_key = config("gatk_key")
if (config.contains("pedigree")) pedigree = config("pedigree").asFileList
}
......@@ -36,7 +36,6 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
var singleSampleCalling = config("single_sample_calling", default = true)
var reference: File = config("reference", required = true)
var dbsnp: File = config("dbsnp")
var useAllelesOption: Boolean = config("use_alleles_option", default = false)
val externalGvcfs = config("external_gvcfs_files", default = Nil).asFileList
......@@ -72,7 +71,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
samToFastq.isIntermediate = true
qscript.add(samToFastq)
mapping.input_R1 = samToFastq.fastqR1
mapping.input_R2 = samToFastq.fastqR2
mapping.input_R2 = Some(samToFastq.fastqR2)
mapping.init
mapping.biopetScript
addAll(mapping.functions) // Add functions of mapping to curent function pool
......@@ -127,7 +126,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
gatkVariantcalling.outputDir = sampleDir + "/variantcalling/"
protected def addJobs(): Unit = {
addLibsJobs()
addPerLibJobs()
gatkVariantcalling.inputBams = libraries.map(_._2.mapping.finalBamFile).toList
gatkVariantcalling.preProcesBams = false
if (!singleSampleCalling) {
......@@ -150,10 +149,11 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
override def configPath: List[String] = super.configPath ::: "multisample" :: Nil
}
def biopetScript() {
addSamplesJobs
def biopetScript(): Unit = {
addSamplesJobs()
}
//SampleWide jobs
def addMultiSampleJobs(): Unit = {
val gvcfFiles: List[File] = if (mergeGvcfs && externalGvcfs.size + samples.size > 1) {
val newFile = outputDir + "merged.gvcf.vcf.gz"
add(CombineGVCFs(this, externalGvcfs ++ samples.map(_._2.gatkVariantcalling.scriptOutput.gvcfFile), newFile))
......
......@@ -32,9 +32,6 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
@Argument(doc = "Reference", shortName = "R", required = false)
var reference: File = config("reference", required = true)
@Argument(doc = "Dbsnp", shortName = "dbsnp", required = false)
var dbsnp: File = config("dbsnp")
@Argument(doc = "OutputName", required = false)
var outputName: String = _
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
"""
(Re-)sync two filtered paired end FASTQ files.
Given two filtered paired end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is only present in
one of the two files.
Usage:
{command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
<reads_1.synced.fq> <reads_2.synced.fq>
The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards some counts are printed.
Both Illumina old-style and new-style paired-end header lines are supported.
The original read file is used to speed up processing: it contains all
possible reads from both edited reads (in all files in the same order) so it
can process all files line by line, not having to read a single file in
memory. Some ideas were taken from [1].
[1] https://gist.github.com/588841/
2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re
def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
"""
Filter out reads from two paired end read files that are not present in
both of them. Do this in a reasonable amount of time by using a file
containing all of the reads for one of the paired ends.
All arguments are open file handles.
@arg original: File containing all original reads for one of the paired
ends.
@arg reads_a: First from paired end read files.
@arg reads_b: Second from paired end read files.
@arg synced_a: Filtered reads from first paired end read file.
@arg synced_b: Filtered reads from second paired end read file.
@return: Triple (filtered_a, filtered_b, kept) containing counts
of the number of reads filtered from both input files and
the total number of reads kept in the synced results.
@todo: Print warnings if obvious things are not right (a or b still has
lines after original is processed).
"""
# This matches 1, 2, or 3 preceded by / _ or whitespace. Its rightmost
# match in a header line is used to identify the read pair.
sep = re.compile('[\s_/][123]')
def next_record(fh):
return [fh.readline().strip() for i in range(4)]
def head(record):
return sep.split(record[0])[:-1]
headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
if not (i % 4))
filtered_a = filtered_b = kept = 0
a, b = next_record(reads_a), next_record(reads_b)
for header in headers:
if header == head(a) and head(b) != header:
a = next_record(reads_a)
filtered_a += 1
if header == head(b) and head(a) != header:
b = next_record(reads_b)
filtered_b += 1
if header == head(a) == head(b):
print >>synced_a, '\n'.join(a)
print >>synced_b, '\n'.join(b)
a, b = next_record(reads_a), next_record(reads_b)
kept += 1
return filtered_a, filtered_b, kept
if __name__ == '__main__':
if len(sys.argv) < 6:
sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
command=sys.argv[0]) + '\n')
sys.exit(1)
try:
original = open(sys.argv[1], 'r')
reads_a = open(sys.argv[2], 'r')
reads_b = open(sys.argv[3], 'r')
synced_a = open(sys.argv[4], 'w')
synced_b = open(sys.argv[5], 'w')
filtered_a, filtered_b, kept = \
sync_paired_end_reads(original, reads_a, reads_b,
synced_a, synced_b)
print 'Filtered %i reads from first read file.' % filtered_a
print 'Filtered %i reads from second read file.' % filtered_b
print 'Synced read files contain %i reads.' % kept
except IOError as (_, message):
sys.stderr.write('Error: %s\n' % message)
sys.exit(1)
......@@ -38,7 +38,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
val defaultThreads = 1
@Argument(doc = "Vmem", required = false)
var vmem: String = _
var vmem: Option[String] = None
val defaultVmem: String = ""
@Argument(doc = "Executable", required = false)
......@@ -58,9 +58,9 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
if (threads == 0) threads = getThreads(defaultThreads)
if (threads > 1) nCoresRequest = Option(threads)
if (vmem == null) {
if (vmem.isEmpty) {
vmem = config("vmem")
if (vmem == null && !defaultVmem.isEmpty) vmem = defaultVmem
if (vmem.isEmpty && defaultVmem.nonEmpty) vmem = Some(defaultVmem)
}
if (vmem != null) jobResourceRequests :+= "h_vmem=" + vmem
jobName = configName + ":" + firstOutput.getName
......
......@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.core
import java.io.File
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.config.{ Config, Configurable }
import nl.lumc.sasc.biopet.core.config.{ ConfigValueIndex, Config, Configurable }
import org.broadinstitute.gatk.utils.commandline.Argument
import org.broadinstitute.gatk.queue.QSettings
import org.broadinstitute.gatk.queue.function.QFunction
......@@ -29,7 +29,14 @@ trait BiopetQScript extends Configurable with GatkLogging {
@Argument(doc = "JSON config file(s)", fullName = "config_file", shortName = "config", required = false)
val configfiles: List[File] = Nil
var outputDir: String = _
var outputDir: String = {
val temp = Config.getValueFromMap(Config.global.map, ConfigValueIndex(this.configName, configPath, "output_dir"))
if (temp.isEmpty) throw new IllegalArgumentException("No output_dir defined in config")
else {
val t = temp.get.value.toString
if (!t.endsWith("/")) t + "/" else t
}
}
@Argument(doc = "Disable all scatters", shortName = "DSC", required = false)
var disableScatter: Boolean = false
......
......@@ -26,7 +26,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Argument }
*/
trait MultiSampleQScript extends BiopetQScript {
@Argument(doc = "Only Sample", shortName = "sample", required = false)
val onlySample: List[String] = Nil
private val onlySamples: List[String] = Nil
require(Config.global.map.contains("samples"), "No Samples found in config")
......@@ -94,7 +94,7 @@ trait MultiSampleQScript extends BiopetQScript {
protected def addJobs()
/** function add all libraries in one call */
protected final def addLibsJobs(): Unit = {
protected final def addPerLibJobs(): Unit = {
for ((libraryId, library) <- libraries) {
library.addAndTrackJobs()
}
......@@ -108,7 +108,7 @@ trait MultiSampleQScript extends BiopetQScript {
def createFile(suffix: String) = new File(sampleDir, sampleId + suffix)
/** Returns sample directory */
def sampleDir = outputDir + "samples" + File.pathSeparator + sampleId + File.pathSeparator
def sampleDir = outputDir + "samples" + File.separator + sampleId + File.separator
}
/** Sample type, need implementation in pipeline */
......@@ -125,17 +125,24 @@ trait MultiSampleQScript extends BiopetQScript {
val samples: Map[String, Sample] = sampleIds.map(id => id -> makeSample(id)).toMap
/** Returns a list of all sampleIDs */
protected def sampleIds: Set[String] = if (onlySample != Nil) onlySample.toSet else {
ConfigUtils.any2map(Config.global.map("samples")).keySet
}
protected def sampleIds: Set[String] = ConfigUtils.any2map(Config.global.map("samples")).keySet
/** Runs addAndTrackJobs method for each sample */
final def addSamplesJobs() {
for ((sampleId, sample) <- samples) {
sample.addAndTrackJobs()
}
if (onlySamples.isEmpty) {
samples.foreach { case (sampleId, sample) => sample.addAndTrackJobs() }
addMultiSampleJobs()
} else onlySamples.foreach(sampleId => samples.get(sampleId) match {
case Some(sample) => sample.addAndTrackJobs()
case None => logger.warn("sampleId '" + sampleId + "' not found")
})
}
/**
* Method where the multisample jobs should be added, this will be executed only when running the -sample argument is not given
*/
def addMultiSampleJobs()
/** Stores sample state */
private var currentSample: Option[String] = None
......
......@@ -43,7 +43,7 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction {
override val defaultThreads = 8
var sam: Boolean = config("sam", default = true)
var sam_RG: String = config("sam-RG")
var sam_RG: Option[String] = config("sam-RG")
var seedlen: Option[Int] = config("seedlen")
var seedmms: Option[Int] = config("seedmms")
var k: Option[Int] = config("k")
......
......@@ -43,8 +43,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction {
if (config.contains("front")) for (adapter <- config("front").asList) opt_front += adapter.toString
var opt_discard: Boolean = config("discard", default = false)
var opt_minimum_length: String = config("minimum_length", 1)
var opt_maximum_length: String = config("maximum_length")
var opt_minimum_length: Option[Int] = config("minimum_length", 1)
var opt_maximum_length: Option[Int] = config("maximum_length")
def cmdLine = required(executable) +
// options
......
......@@ -28,7 +28,14 @@ trait PythonCommandLineFunction extends BiopetCommandLineFunction {
executable = config("exe", default = "python", submodule = "python")
protected var python_script_name: String = _
def setPythonScript(script: String) { setPythonScript(script, "") }
def setPythonScript(script: String) {
python_script = new File(script)
if (!python_script.exists()) {
setPythonScript(script, "")
} else {
python_script_name = script
}
}
def setPythonScript(script: String, subpackage: String) {
python_script_name = script
python_script = new File(".queue/tmp/" + subpackage + python_script_name)
......
......@@ -60,11 +60,11 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
private var out: List[File] = Nil
var executableNonThreads: String = config("exe", default = "raxmlHPC")
var executableThreads: String = config("exe_pthreads")
var executableThreads: Option[String] = config("exe_pthreads")
override def afterGraph {
if (threads == 0) threads = getThreads(defaultThreads)
executable = if (threads > 1 && executableThreads != null) executableThreads else executableNonThreads
executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads
super.afterGraph
out +:= getInfoFile
f match {
......
......@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Contaminants", required = false)
var startingTree: File = config("starting_tree")
var startingTree: Option[File] = config("starting_tree")
@Input(doc = "Fasta file", shortName = "FQ")
var fastafile: File = _
......@@ -36,21 +36,21 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
var outputDirectory: String = _
executable = config("exe", default = "run_gubbins.py")
var outgroup: String = config("outgroup")
var filterPercentage: String = config("filter_percentage")
var treeBuilder: String = config("tree_builder")
var outgroup: Option[String] = config("outgroup")
var filterPercentage: Option[String] = config("filter_percentage")
var treeBuilder: Option[String] = config("tree_builder")
var iterations: Option[Int] = config("iterations")
var minSnps: Option[Int] = config("min_snps")
var convergeMethod: String = config("converge_method")
var convergeMethod: Option[String] = config("converge_method")
var useTimeStamp: Boolean = config("use_time_stamp", default = false)
var prefix: String = config("prefix")
var prefix: Option[String] = config("prefix")
var verbose: Boolean = config("verbose", default = false)
var noCleanup: Boolean = config("no_cleanup", default = false)
override def afterGraph: Unit = {
super.afterGraph
jobLocalDir = new File(outputDirectory)
if (prefix == null) prefix = fastafile.getName
if (prefix.isEmpty) prefix = Some(fastafile.getName)
val out: List[String] = List(".recombination_predictions.embl",
".recombination_predictions.gff",
".branch_base_reconstruction.embl",
......
......@@ -43,7 +43,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction {
var fastqc: Fastqc = _
executable = config("exe", default = "sickle", freeVar = false)
var qualityType: String = config("qualitytype")
var qualityType: Option[String] = config("qualitytype")
var qualityThreshold: Option[Int] = config("qualityThreshold")
var lengthThreshold: Option[Int] = config("lengthThreshold")
var noFiveprime: Boolean = config("noFiveprime", default = false)
......@@ -54,7 +54,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction {
override def versionCommand = executable + " --version"
override def afterGraph {
if (qualityType == null && defaultQualityType != null) qualityType = defaultQualityType
if (qualityType.isEmpty) qualityType = Some(defaultQualityType)
}
def cmdLine = {
......
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
/**
* Created by pjvan_thof on 1/29/15.
* Versions from the executable are not reliable, this extension is based on md5 '3d033ff8a1f4ea9cb3ede12939561141' from the executable
*/
class WigToBigWig(val root: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Input wig file")
var inputWigFile: File = _
@Input(doc = "Input chrom sizes file")
var inputChromSizesFile: File = _
@Output(doc = "Output BigWig file")
var outputBigWig: File = _
executable = config("exe", default = "wigToBigWig")
var blockSize: Option[Int] = config("blockSize")
var itemsPerSlot: Option[Int] = config("itemsPerSlot")
var clip: Boolean = config("clip", default = false)
var unc: Boolean = config("unc", default = false)
def cmdLine = required(executable) +
optional("-blockSize=", blockSize, spaceSeparated = false) +
optional("-itemsPerSlot=", itemsPerSlot, spaceSeparated = false) +
conditional(clip, "-clip") +
conditional(unc, "-unc") +
required(inputWigFile) +
required(inputChromSizesFile) +
required(outputBigWig)
}
\ No newline at end of file
......@@ -34,7 +34,7 @@ class BwaMem(val root: Configurable) extends Bwa {
@Output(doc = "Output file SAM", shortName = "output")
var output: File = _
var R: String = config("R")
var R: Option[String] = config("R")
var k: Option[Int] = config("k")
var r: Option[Float] = config("r")
var S: Boolean = config("S", default = false)
......@@ -49,11 +49,11 @@ class BwaMem(val root: Configurable) extends Bwa {
var e: Boolean = config("e", default = false)
var A: Option[Int] = config("A")
var B: Option[Int] = config("B")
var O: String = config("O")
var E: String = config("E")
var L: String = config("L")
var O: Option[String] = config("O")
var E: Option[String] = config("E")
var L: Option[String] = config("L")
var U: Option[Int] = config("U")
var x: String = config("x")
var x: Option[String] = config("x")
var p: Boolean = config("p", default = false)
var v: Option[Int] = config("v")
var T: Option[Int] = config("T")
......@@ -61,7 +61,7 @@ class BwaMem(val root: Configurable) extends Bwa {
var a: Boolean = config("a", default = false)
var C: Boolean = config("C", default = false)
var Y: Boolean = config("Y", default = false)
var I: String = config("I")
var I: Option[String] = config("I")
override val defaultVmem = "6G"
override val defaultThreads = 8
......
......@@ -6,7 +6,11 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
/**
* Created by pjvan_thof on 1/16/15.
* BWA sampe wrapper
*
* based on executable version 0.7.10-r789
*
* @param root Configurable
*/
class BwaSampe(val root: Configurable) extends Bwa {
@Input(doc = "Fastq file R1", required = true)
......@@ -39,11 +43,14 @@ class BwaSampe(val root: Configurable) extends Bwa {