Commit 04210893 authored by Peter van 't Hof
Browse files

Added FilePipes

parent f298fe2c
......@@ -34,6 +34,9 @@ trait BiopetCommandLineFunction extends CommandLineFunction with Configurable {
@Input(doc = "deps", required = false)
var deps: List[File] = Nil
@Output
var outputFiles: List[File] = Nil
var threads = 0
def defaultThreads = 1
......@@ -282,6 +285,36 @@ trait BiopetCommandLineFunction extends CommandLineFunction with Configurable {
addJobReportBinding("command", cmd)
cmd
}
/**
 * Builds the command line fragment for an input argument that is either a plain file or the
 * stdout of another command.
 *
 * - `Left(file)`: the file is appended to `deps` and rendered via `required(prefix, file)`.
 * - `Right(cmd)`: `cmd` is flagged to write to stdout, its `outputs` and `inputs` (when non-null)
 *   are merged into this function's `outputFiles` and `deps`, and its command line is embedded
 *   as a `<( ... )` process substitution.
 *
 * @param prefix option flag placed in front of the value, e.g. "-i"
 * @param arg    the file to read, or the command whose stdout should be read
 * @return the fragment to concatenate into the command line
 */
def requiredInput(prefix: String, arg: Either[File, BiopetCommandLineFunction]): String = {
  arg match {
    case Left(file) =>
      deps :+= file
      required(prefix, file)
    case Right(cmd) =>
      cmd._outputAsStdout = true
      if (cmd.outputs != null) outputFiles ++= cmd.outputs
      if (cmd.inputs != null) deps ++= cmd.inputs
      // Start with a blank so the flag can never fuse with the fragment the caller concatenated
      // just before this one; the extra whitespace is harmless to the shell.
      // NOTE(review): Queue's required() is believed to pad its result the same way - confirm.
      s" '${prefix}' <( ${cmd.commandLine} ) "
  }
}
/**
 * Builds the command line fragment for an output argument that is either a plain file or the
 * stdin of another command.
 *
 * - `Left(file)`: the file is appended to `outputFiles` (it is produced by this job, so it must
 *   not be listed in `deps`, which holds inputs) and rendered via `required(prefix, file)`.
 * - `Right(cmd)`: `cmd` is flagged to read from stdin, its `outputs` and `inputs` (when non-null)
 *   are merged into this function's `outputFiles` and `deps`, and its command line is embedded
 *   as a `>( ... )` process substitution.
 *
 * @param prefix option flag placed in front of the value, e.g. "--output"
 * @param arg    the file to write, or the command that should consume the output
 * @return the fragment to concatenate into the command line
 */
def requiredOutput(prefix: String, arg: Either[File, BiopetCommandLineFunction]): String = {
  arg match {
    case Left(file) =>
      // Register as an output: putting it in `deps` would declare a file this job creates as
      // one of its own inputs.
      outputFiles :+= file
      required(prefix, file)
    case Right(cmd) =>
      cmd._inputAsStdin = true
      if (cmd.outputs != null) outputFiles ++= cmd.outputs
      if (cmd.inputs != null) deps ++= cmd.inputs
      // Start with a blank so the flag can never fuse with the fragment the caller concatenated
      // just before this one; the extra whitespace is harmless to the shell.
      // NOTE(review): Queue's required() is believed to pad its result the same way - confirm.
      s" '${prefix}' >( ${cmd.commandLine} ) "
  }
}
}
/** stores global caches */
......
......@@ -33,8 +33,7 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
@Input(doc = "Input fastq file")
var fastq_input: File = _
@Output(doc = "Output fastq file")
var fastq_output: File = _
var fastq_output: Either[File, BiopetCommandLineFunction] = _
@Output(doc = "Output statistics file")
var stats_output: File = _
......@@ -63,7 +62,7 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
optional("-M", opt_maximum_length) +
// input / output
required(fastq_input) +
(if (outputAsStsout) "" else required("--output", fastq_output) +
(if (outputAsStsout) "" else requiredOutput("--output", fastq_output) +
" > " + required(stats_output))
/** Output summary stats */
......
......@@ -28,7 +28,7 @@ class Gzip(val root: Configurable) extends BiopetCommandLineFunction {
@Output(doc = "Unzipped file", required = true)
var output: File = _
executable = config("exe", default = "gzip")
executable = config("exe", default = "gzip", freeVar = false)
override def versionRegex = """gzip (.*)""".r
override def versionCommand = executable + " --version"
......
......@@ -34,9 +34,6 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Fasta file", shortName = "FQ")
var fastafile: File = _
@Output(doc = "Output", shortName = "out")
var outputFiles: List[File] = Nil
@Argument(required = true)
var outputDirectory: File = null
......
......@@ -36,8 +36,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction with Summ
@Input(doc = "R2 input", required = false)
var input_R2: File = _
@Output(doc = "R1 output")
var output_R1: File = _
var output_R1: Either[File, BiopetCommandLineFunction] = _
@Output(doc = "R2 output", required = false)
var output_R2: File = _
......@@ -76,7 +75,7 @@ class Sickle(val root: Configurable) extends BiopetCommandLineFunction with Summ
cmd +
(if (inputAsStdin) required("-f", new File("/dev/stdin")) else required("-f", input_R1)) +
required("-t", qualityType) +
(if (outputAsStsout) required("-o", new File("/dev/stdout")) else required("-o", output_R1)) +
(if (outputAsStsout) required("-o", new File("/dev/stdout")) else requiredOutput("-o", output_R1)) +
optional("-q", qualityThreshold) +
optional("-l", lengthThreshold) +
conditional(noFiveprime, "-x") +
......
......@@ -54,9 +54,6 @@ class CollectMultipleMetrics(val root: Configurable) extends Picard with Summari
@Argument(doc = "Stop after processing N reads", required = false)
var stopAfter: Option[Long] = config("stop_after")
@Output
protected var outputFiles: List[File] = Nil
override def beforeGraph(): Unit = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
......
......@@ -27,8 +27,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
nl.lumc.sasc.biopet.pipelines.bamtobigwig.Bam2Wig,
nl.lumc.sasc.biopet.pipelines.carp.Carp,
nl.lumc.sasc.biopet.pipelines.toucan.Toucan,
nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling,
nl.lumc.sasc.biopet.pipelines.gears.Gears
nl.lumc.sasc.biopet.pipelines.shiva.ShivaSvCalling
)
def pipelines: List[MainCommand] = List(
......
......@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.{BiopetCommandLineFunction, ToolCommandFuntion}
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, ToolCommandFuntion }
import nl.lumc.sasc.biopet.core.summary.Summarizable
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
......@@ -52,29 +52,12 @@ class FastqSync(val root: Configurable) extends ToolCommandFuntion with Summariz
override def defaultCoreMemory = 4.0
override def beforeGraph(): Unit = {
inputFastq1 match {
case Left(file) => deps :+= file
case Right(cmd) => deps :::= cmd.deps
}
inputFastq2 match {
case Left(file) => deps :+= file
case _ =>
}
}
// executed command line
override def cmdLine =
super.cmdLine +
required("-r", refFastq) +
(inputFastq1 match {
case Left(file) => required("-i", file)
case Right(cmd) => " -i <( " + cmd.commandLine + " )"
}) +
(inputFastq2 match {
case Left(file) => required("-j", file)
case Right(cmd) => " -j <( " + cmd.commandLine + " )"
}) +
requiredInput("-i", inputFastq1) +
requiredInput("-j", inputFastq2) +
required("-o", outputFastq1) +
required("-p", outputFastq2) + " > " +
required(outputStats)
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.extensions.Ln
import scala.collection.mutable
import scala.io.Source
/**
 * Flexiprep variant of the Cutadapt wrapper.
 *
 * It takes the adapter sequences to clip from a [[Fastqc]] job, parses the cutadapt statistics
 * file into summary values, and falls back to an `Ln` command when no adapter option is set.
 */
class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutadapt(root) {
  /** FastQC job supplying the adapters; dereferenced without a null check in `beforeCmd` and `summaryStats`. */
  var fastqc: Fastqc = _

  /** Appends the adapter sequences found by FastQC to the option list selected by `default_clip_mode`. */
  override def beforeCmd(): Unit = {
    super.beforeCmd()
    val foundAdapters = fastqc.foundAdapters.map(_.seq)
    if (default_clip_mode == "3") opt_adapter ++= foundAdapters
    else if (default_clip_mode == "5") opt_front ++= foundAdapters
    else if (default_clip_mode == "both") opt_anywhere ++= foundAdapters
  }

  /**
   * Parses `stats_output` line by line, when that file exists.
   *
   * @return a map with the keys "num_reads_affected", "num_reads_discarded_too_short",
   *         "num_reads_discarded_too_long" (each 0 when no matching line was seen) and
   *         "adapters" (adapter name -> List(sequence, times trimmed))
   */
  override def summaryStats: Map[String, Any] = {
    val trimR = """.*Trimmed reads: *(\d*) .*""".r
    val tooShortR = """.*Too short reads: *(\d*) .*""".r
    val tooLongR = """.*Too long reads: *(\d*) .*""".r
    // Note: inside a character class '|' is a literal, so this class also accepts '|'.
    val adapterR = """Adapter '([C|T|A|G]*)'.*trimmed (\d*) times.""".r

    val stats: mutable.Map[String, Int] = mutable.Map("trimmed" -> 0, "tooshort" -> 0, "toolong" -> 0)
    val adapter_stats: mutable.Map[String, List[Any]] = mutable.Map()

    if (stats_output.exists) {
      val source = Source.fromFile(stats_output)
      // Close the handle even when parsing throws; the Source was previously left open.
      try {
        for (line <- source.getLines()) {
          line match {
            case trimR(m)     => stats += ("trimmed" -> m.toInt)
            case tooShortR(m) => stats += ("tooshort" -> m.toInt)
            case tooLongR(m)  => stats += ("toolong" -> m.toInt)
            case adapterR(adapter, count) =>
              // Report the FastQC name of the adapter, or "unknown" when FastQC did not find it.
              val adapterName = fastqc.foundAdapters.find(_.seq == adapter) match {
                case None    => "unknown"
                case Some(a) => a.name
              }
              adapter_stats += (adapterName -> List(adapter, count.toInt))
            case _ =>
          }
        }
      } finally {
        source.close()
      }
    }

    Map("num_reads_affected" -> stats("trimmed"),
      "num_reads_discarded_too_short" -> stats("tooshort"),
      "num_reads_discarded_too_long" -> stats("toolong"),
      "adapters" -> adapter_stats.toMap
    )
  }

  /**
   * Returns the regular cutadapt command when at least one adapter option is set. Otherwise the
   * job is renamed with a "-ln" suffix and the command of an `Ln` from `fastq_input` to
   * `fastq_output` is returned instead.
   */
  override def cmdLine = {
    if (opt_adapter.nonEmpty || opt_anywhere.nonEmpty || opt_front.nonEmpty) {
      analysisName = getClass.getSimpleName
      super.cmdLine
    } else {
      analysisName = getClass.getSimpleName + "-ln"
      Ln(this, fastq_input, fastq_output, relative = true).cmd
    }
  }
}
object Cutadapt {
  /**
   * Creates a Cutadapt job reading `input` and writing `output`.
   *
   * The statistics file is placed next to `output`: the absolute path of `output` with its
   * extension (if any) replaced by ".stats".
   *
   * @param root   configuration parent passed to the new job
   * @param input  fastq file to clip
   * @param output clipped fastq file
   * @return the configured job
   */
  def apply(root: Configurable, input: File, output: File): Cutadapt = {
    val cutadapt = new Cutadapt(root)
    cutadapt.fastq_input = input
    cutadapt.fastq_output = output
    cutadapt.stats_output = new File(withoutExtension(output) + ".stats")
    cutadapt
  }

  /**
   * Absolute path of `file` up to, but excluding, the last '.' of its file name.
   * A '.' that only occurs in a parent directory is ignored, and a name without any '.' is
   * returned unchanged instead of raising StringIndexOutOfBoundsException.
   */
  private def withoutExtension(file: File): String = {
    val path = file.getAbsolutePath
    val dot = path.lastIndexOf('.')
    if (dot > path.lastIndexOf(File.separatorChar)) path.substring(0, dot) else path
  }
}
......@@ -181,9 +181,6 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
} else Set()
}
@Output
var outputFiles: List[File] = Nil
def summaryFiles: Map[String, File] = {
val outputFiles = Map("plot_duplication_levels" -> ("Images" + File.separator + "duplication_levels.png"),
"plot_kmer_profiles" -> ("Images" + File.separator + "kmer_profiles.png"),
......
......@@ -3,7 +3,8 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, BiopetPipe }
import nl.lumc.sasc.biopet.extensions.{ Gzip, Zcat, Sickle }
import nl.lumc.sasc.biopet.extensions.{ Gzip, Sickle, Cutadapt }
import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSeq
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
......@@ -23,18 +24,6 @@ class QcCommand(val root: Configurable, val fastqc: Fastqc) extends BiopetComman
@Output(required = true)
var output: File = _
val zcat = new Zcat(root)
val seqtk = new SeqtkSeq(root)
val cutadept = flexiprep.skipClip match {
case false => Some(new Cutadapt(root))
case true => None
}
val sickle = flexiprep.skipTrim match {
case false => Some(new Sickle(root))
case true => None
}
val gzip = Gzip(root)
override def beforeGraph(): Unit = {
super.beforeGraph()
deps :::= fastqc.outputFiles
......@@ -43,7 +32,9 @@ class QcCommand(val root: Configurable, val fastqc: Fastqc) extends BiopetComman
override def defaultCoreMemory = 2.0
override def defaultThreads = 3
override def beforeCmd(): Unit = {
def cmdLine = {
val seqtk = new SeqtkSeq(root)
seqtk.input = input
seqtk.Q = fastqc.encoding match {
case null => None
case enc if enc.contains("Sanger / Illumina 1.9") => None
......@@ -53,27 +44,46 @@ class QcCommand(val root: Configurable, val fastqc: Fastqc) extends BiopetComman
case _ => None
}
if (seqtk.Q.isDefined) seqtk.V = true
}
def cmdLine = {
val sanger = seqtk.Q match {
case Some(_) => Some(seqtk)
case _ => None
}
val clip = cutadept match {
case Some(cutadept) =>
val foundAdapters = fastqc.foundAdapters.map(_.seq)
val clip = if (!flexiprep.skipClip) {
val foundAdapters = fastqc.foundAdapters.map(_.seq)
if (foundAdapters.nonEmpty) {
val cutadept = new nl.lumc.sasc.biopet.extensions.Cutadapt(root)
cutadept.stats_output = new File(flexiprep.outputDir, s"${flexiprep.sampleId.getOrElse("x")}-${flexiprep.libId.getOrElse("x")}.clip.stats")
if (cutadept.default_clip_mode == "3") cutadept.opt_adapter ++= foundAdapters
else if (cutadept.default_clip_mode == "5") cutadept.opt_front ++= foundAdapters
else if (cutadept.default_clip_mode == "both") cutadept.opt_anywhere ++= foundAdapters
if (foundAdapters.nonEmpty) Some(cutadept)
else None
case _ => None
Some(cutadept)
} else None
} else None
val trim = if (!flexiprep.skipTrim) {
val sickle = new nl.lumc.sasc.biopet.extensions.Sickle(root)
sickle.output_stats = new File(flexiprep.outputDir, s"${flexiprep.sampleId.getOrElse("x")}-${flexiprep.libId.getOrElse("x")}.trim.stats")
Some(sickle)
} else None
val gzip = new Gzip(root)
val cmd = (clip, trim) match {
case (Some(clip), Some(trim)) => {
clip.fastq_output = Right(trim)
trim.output_R1 = Right(gzip > output)
seqtk | clip
}
case (Some(clip), _) => {
clip.fastq_output = Right(gzip > output)
seqtk | clip
}
case (_, Some(trim)) => {
trim.output_R1 = Right(gzip > output)
seqtk | trim
}
case _ => {
seqtk | gzip > output
}
}
val trim = sickle
val cmds = ((if (input.getName.endsWith(".gz") || input.getName.endsWith(".gzip")) Some(zcat) else None) ::
sanger :: clip :: trim :: Some(gzip) :: Nil).flatten
val cmd = input :<: cmds.tail.foldLeft(cmds.head)((a, b) => a | b) > output
//val cmds = (Some(seqtk) :: clip :: trim :: Some(new Gzip(root)) :: Nil).flatten
cmd.beforeGraph()
cmd.commandLine
}
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.extensions.Ln
/**
 * Flexiprep variant of the seqtk seq wrapper.
 *
 * When a [[Fastqc]] job is attached and `Q` has not been set, `Q` is derived from the encoding
 * FastQC reported. When `Q` ends up undefined the job skips the conversion (see `cmdLine`).
 */
class SeqtkSeq(root: Configurable) extends nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSeq(root) {
  /** Optional FastQC job used to detect the quality encoding; may stay null. */
  var fastqc: Fastqc = _

  /** Fills in `Q` (and `V`) from the FastQC encoding, unless `Q` was already set or no FastQC job is attached. */
  override def beforeCmd(): Unit = {
    super.beforeCmd()
    if (fastqc != null && Q.isEmpty) {
      val encoding = fastqc.encoding
      Q = encoding match {
        case null => None
        case enc if enc.contains("Sanger / Illumina 1.9") => None
        // NOTE(review): 64 is presumably the Phred+64 offset of the older Illumina encodings - confirm.
        case enc if enc.contains("Illumina <1.3") => Option(64)
        case enc if enc.contains("Illumina 1.3") => Option(64)
        case enc if enc.contains("Illumina 1.5") => Option(64)
        case _ => None
      }
      if (Q.isDefined) V = true
    }
  }

  /** Adds the FastQC output as a dependency of this job when a FastQC job is attached. */
  override def beforeGraph(): Unit = {
    // Keep the parent hook in the chain, as beforeCmd above already does.
    super.beforeGraph()
    if (fastqc != null) deps ::= fastqc.output
  }

  /**
   * Returns the regular seqtk command when `Q` is defined. Otherwise the job is renamed with a
   * "-skip" suffix; it still uses the regular command when it both reads stdin and writes
   * stdout, and the command of an `Ln` from `input` to `output` in every other case.
   */
  override def cmdLine = {
    if (Q.isDefined) {
      analysisName = getClass.getSimpleName
      super.cmdLine
    } else {
      analysisName = getClass.getSimpleName + "-skip"
      (inputAsStdin, outputAsStsout) match {
        case (true, true) => super.cmdLine
        case _            => Ln(this, input, output).cmd
      }
    }
  }
}
object SeqtkSeq {
  /**
   * Creates a job that only has its FastQC source attached; input and output are left unset.
   *
   * @param root   configuration parent passed to the new job
   * @param fastqc FastQC job assigned to the new job's `fastqc` field
   */
  def apply(root: Configurable, fastqc: Fastqc): SeqtkSeq = {
    val job = new SeqtkSeq(root)
    job.fastqc = fastqc
    job
  }

  /**
   * Creates a job with input and output files set and, optionally, a FastQC source.
   *
   * @param root   configuration parent passed to the new job
   * @param input  file assigned to the job's `input`
   * @param output file assigned to the job's `output`
   * @param fastqc FastQC job assigned to the job's `fastqc` field; defaults to null
   */
  def apply(root: Configurable, input: File, output: File, fastqc: Fastqc = null): SeqtkSeq = {
    val job = apply(root, fastqc)
    job.input = input
    job.output = output
    job
  }
}
\ No newline at end of file
......@@ -22,7 +22,7 @@ import nl.lumc.sasc.biopet.utils.config.Config
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.bwa.{ BwaAln, BwaMem, BwaSampe, BwaSamse }
import nl.lumc.sasc.biopet.extensions.picard.{ AddOrReplaceReadGroups, MarkDuplicates, MergeSamFiles, SortSam }
import nl.lumc.sasc.biopet.pipelines.flexiprep.{ Cutadapt, Fastqc, SeqtkSeq }
import nl.lumc.sasc.biopet.pipelines.flexiprep.{ Cutadapt, Fastqc }
import nl.lumc.sasc.biopet.extensions.tools.{ FastqSync, SeqStat }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import org.apache.commons.io.FileUtils
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment