Commit 4fba67e1 authored by Sander Bollen's avatar Sander Bollen

Merge branch 'develop' into fix-vcfstats-summary

parents 22344aa7 bb8b4550
......@@ -155,7 +155,7 @@ class VariantEffectPredictor(val root: Configurable) extends BiopetCommandLineFu
super.beforeGraph()
if (!cache && !database) {
Logging.addError("Must either set 'cache' or 'database' to true for VariantEffectPredictor")
} else if (cache && dir.isEmpty) {
} else if (cache && dir.isEmpty && dirCache.isEmpty) {
Logging.addError("Must supply 'dir_cache' to cache for VariantEffectPredictor")
}
if (statsText) _summary = new File(output.getAbsolutePath + "_summary.txt")
......
......@@ -58,6 +58,8 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
@Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
var log_to_file: File = _
override def defaultCoreMemory = 4.0
override def beforeGraph() = {
super.beforeGraph()
if (reference == null) reference = referenceFasta()
......
......@@ -36,7 +36,7 @@ class SeqStat(val root: Configurable) extends ToolCommandFunction with Summariza
@Output(doc = "Output JSON", shortName = "output", required = true)
var output: File = null
override def defaultCoreMemory = 2.5
override def defaultCoreMemory = 4.0
override def cmdLine = super.cmdLine + required("-i", input) + required("-o", output)
......
......@@ -29,6 +29,8 @@ class ValidateFastq(val root: Configurable) extends ToolCommandFunction {
@Input(doc = "Input R1 fastq file", required = false)
var r2Fastq: Option[File] = None
override def defaultCoreMemory = 4.0
override def cmdLine = super.cmdLine +
required("-i", r1Fastq) +
optional("-j", r2Fastq)
......
......@@ -43,7 +43,7 @@ class VcfWithVcf(val root: Configurable) extends ToolCommandFunction with Refere
var fields: List[(String, String, Option[String])] = List()
override def defaultCoreMemory = 2.0
override def defaultCoreMemory = 4.0
override def beforeGraph() {
super.beforeGraph()
......
......@@ -55,35 +55,23 @@ One can specify other options such as: `bowtie` (alignment) options, clipping an
"chunkmbs": 256, # this is a performance option, keep it high (256) as many alternative alignments are possible
"seedmms": 3,
"seedlen": 25,
"k": 5, # take and report best 5 alignments
"best": true # sort by best hit
"k": 3, # take and report best 3 alignments
"best": true, # sort by best hit,
"strata" true # select from best strata
},
"sickle": {
"lengthThreshold": 8 # minimum length to keep after trimming
"lengthThreshold": 15 # minimum length to keep after trimming
},
"cutadapt": {
"error_rate": 0.2, # recommended: 0.2, allow more mismatches in adapter to be clipped of (ratio)
"minimum_length": 8, # minimum length to keep after clipping, setting lower will cause multiple alignments afterwards
"error_rate": 0.1, # recommended: 0.1, allow more mismatches in adapter to be clipped of (ratio)
"minimum_length": 15, # minimum length to keep after clipping, setting lower will cause multiple alignments afterwards
"q": 30, # minimum quality over the read after the clipping in order to keep and report the read
"default_clip_mode": "both", # clip from: front/end/both (5'/3'/both). Depending on the protocol. Setting `both` makes clipping take more time, but is safest to do on short sequences such as smallRNA.
"times": 2 # in cases of chimera reads/adapters, how many times should cutadapt try to remove am adapter-sequence
"default_clip_mode": "3", # clip from: front/end/both (5'/3'/both). Depending on the protocol.
"times": 1, # in cases of chimera reads/adapters, how many times should cutadapt try to remove am adapter-sequence
"ignore_fastqc_adapters": true # by default ignore the detected adapters by FastQC. These tend to give false positive hits for smallRNA projects.
}
```
The settings above is quite strict and aggressive on the clipping with `cutadapt`. By default the option `sensitiveAdapterSearch` is turned on in the TinyCap pipeline:
```json
"fastqc": {
"sensitiveAdapterSearch": true
}
```
This setting is turned on to enable aggressive adapter trimming. e.g. found (partial) adapters in `FastQC`
are all used in `Cutadapt`. Depending on the sequencing technique and sample preparation, e.g. short
sequences (76bp - 100bp). Turning of this option will still produce sensible results.
## Examine results
### Result files
......
......@@ -19,6 +19,10 @@
val trimCount = summary.getLibraryValues("flexiprep", "settings", "skip_trim").count(_._2 == Some(false))
val clipCount = summary.getLibraryValues("flexiprep", "settings", "skip_clip").count(_._2 == Some(false))
val librariesCount = summary.samples.foldLeft(0)(_ + summary.libraries(_).size)
val paired: Boolean = if (sampleId.isDefined && libId.isDefined)
summary.getLibraryValue(sampleId.get, libId.get, "flexiprep", "settings", "paired").get.asInstanceOf[Boolean]
else summary.getLibraryValues("flexiprep", "settings", "paired").values.exists(_ == Some(true))
}#
#if (showIntro)
......@@ -62,9 +66,6 @@
#if (showPlot)
#{
val paired: Boolean = if (sampleId.isDefined && libId.isDefined)
summary.getLibraryValue(sampleId.get, libId.get, "flexiprep", "settings", "paired").get.asInstanceOf[Boolean]
else summary.getLibraryValues("flexiprep", "settings", "paired").values.exists(_ == Some(true))
FlexiprepReport.readSummaryPlot(outputDir, "QC_Reads_R1","R1", summary, sampleId = sampleId)
if (paired) FlexiprepReport.readSummaryPlot(outputDir, "QC_Reads_R2","R2", summary, sampleId = sampleId)
}#
......@@ -105,6 +106,7 @@
<th>Before QC</th>
<th>Clipping</th>
<th>Trimming</th>
#if (paired == true) <th>Out of Sync</th> #end
<th>After QC</th>
</tr></thead>
<tbody>
......@@ -140,8 +142,8 @@
#for (read <- reads)
#if (read == "R2") </tr><tr> #end
#{
val beforeTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read, "reads", "num_total")
val afterTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read + "_qc", "reads", "num_total")
val beforeTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read, "reads", "num_total").getOrElse(0).toString.toLong
val afterTotal = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "seqstat_" + read + "_qc", "reads", "num_total").getOrElse(0).toString.toLong
val clippingDiscardedToShort = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_short").getOrElse(0).toString.toLong
val clippingDiscardedToLong = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "clipping_" + read, "num_reads_discarded_too_long").getOrElse(0).toString.toLong
var trimmingDiscarded = summary.getLibraryValue(sample, libId, "flexiprep", "stats", "trimming_" + read, "num_reads_discarded_total").getOrElse(0).toString.toLong
......@@ -150,6 +152,7 @@
<td>${beforeTotal}</td>
<td>${clippingDiscardedToShort + clippingDiscardedToLong}</td>
<td>${trimmingDiscarded}</td>
#if (paired == true) <td>${beforeTotal - clippingDiscardedToShort - clippingDiscardedToLong - trimmingDiscarded - afterTotal}</td> #end
<td>${afterTotal}</td>
#end
</tr>
......
......@@ -16,11 +16,11 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, SampleLibraryTag }
import nl.lumc.sasc.biopet.extensions.{ Zcat, Gzip }
import nl.lumc.sasc.biopet.extensions.{ Gzip, Zcat }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.IoUtils._
import nl.lumc.sasc.biopet.extensions.tools.{ ValidateFastq, SeqStat, FastqSync }
import nl.lumc.sasc.biopet.extensions.tools.{ FastqSync, SeqStat, ValidateFastq }
import nl.lumc.sasc.biopet.utils.Logging
import org.broadinstitute.gatk.queue.QScript
class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
......@@ -74,33 +74,37 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
/** Function that's need to be executed before the script is accessed */
def init() {
require(outputDir != null, "Missing output directory on flexiprep module")
require(inputR1 != null, "Missing input R1 on flexiprep module")
require(sampleId != null, "Missing sample ID on flexiprep module")
require(libId != null, "Missing library ID on flexiprep module")
if (inputR1 == null) Logging.addError("Missing input R1 on flexiprep module")
if (sampleId == null || sampleId == None) Logging.addError("Missing sample ID on flexiprep module")
if (libId == null || libId == None) Logging.addError("Missing library ID on flexiprep module")
paired = inputR2.isDefined
if (inputR1 == null) Logging.addError("Missing input R1 on flexiprep module")
else {
paired = inputR2.isDefined
inputFiles :+= new InputFile(inputR1)
inputR2.foreach(inputFiles :+= new InputFile(_))
inputFiles :+= new InputFile(inputR1)
inputR2.foreach(inputFiles :+= new InputFile(_))
R1Name = getUncompressedFileName(inputR1)
inputR2.foreach { fileR2 =>
paired = true
R2Name = getUncompressedFileName(fileR2)
R1Name = getUncompressedFileName(inputR1)
inputR2.foreach { fileR2 =>
paired = true
R2Name = getUncompressedFileName(fileR2)
}
}
}
/** Script to add jobs */
def biopetScript() {
runInitialJobs()
if (inputR1 != null) {
runInitialJobs()
if (paired) runTrimClip(inputR1, inputR2, outputDir)
else runTrimClip(inputR1, outputDir)
if (paired) runTrimClip(inputR1, inputR2, outputDir)
else runTrimClip(inputR1, outputDir)
val R1Files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
val R2Files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
runFinalize(R1Files.toList, R2Files.toList)
val R1Files = for ((k, v) <- outputFiles if k.endsWith("output_R1")) yield v
val R2Files = for ((k, v) <- outputFiles if k.endsWith("output_R2")) yield v
runFinalize(R1Files.toList, R2Files.toList)
}
}
/** Add init non chunkable jobs */
......
......@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import com.google.common.io.Files
import nl.lumc.sasc.biopet.extensions.tools.{ ValidateFastq, SeqStat }
import nl.lumc.sasc.biopet.extensions.tools.{ SeqStat, ValidateFastq }
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Config
import org.apache.commons.io.FileUtils
......@@ -83,6 +83,18 @@ class FlexiprepTest extends TestNGSuite with Matchers {
}
@Test
def testNoSample: Unit = {
val map = ConfigUtils.mergeMaps(Map(
"output_dir" -> FlexiprepTest.outputDir
), Map(FlexiprepTest.executables.toSeq: _*))
val flexiprep: Flexiprep = initPipeline(map)
intercept[IllegalStateException] {
flexiprep.script()
}
}
// remove temporary run directory all tests in the class have been run
@AfterClass def removeTempOutputDir() = {
FileUtils.deleteDirectory(FlexiprepTest.outputDir)
......
......@@ -18,6 +18,7 @@ import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.BiopetQScript.InputFile
import nl.lumc.sasc.biopet.core.{ PipelineCommand, SampleLibraryTag }
import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep
import nl.lumc.sasc.biopet.utils.Logging
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
......@@ -48,6 +49,7 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi
def init(): Unit = {
require(fastqR1.isDefined || bamFile.isDefined, "Please specify fastq-file(s) or bam file")
require(fastqR1.isDefined != bamFile.isDefined, "Provide either a bam file or a R1/R2 file")
if (sampleId == null || sampleId == None) Logging.addError("Missing sample ID on GearsSingle module")
if (outputName == null) {
if (fastqR1.isDefined) outputName = fastqR1.map(_.getName
......@@ -80,6 +82,8 @@ class GearsSingle(val root: Configurable) extends QScript with SummaryQScript wi
val flexiprep = new Flexiprep(this)
flexiprep.inputR1 = r1
flexiprep.inputR2 = r2
flexiprep.sampleId = if (sampleId.isEmpty) Some("noSampleName") else sampleId
flexiprep.libId = if (libId.isEmpty) Some("noLibName") else libId
flexiprep.outputDir = new File(outputDir, "flexiprep")
add(flexiprep)
(flexiprep.fastqR1Qc, flexiprep.fastqR2Qc)
......
......@@ -80,6 +80,8 @@ class GearsSingleTest extends TestNGSuite with Matchers {
), Map(GearsSingleTest.executables.toSeq: _*))
val gears: GearsSingle = initPipeline(map)
gears.sampleId = Some("sampleName")
gears.libId = Some("libName")
if (fromBam) {
gears.bamFile = Some(GearsSingleTest.bam)
......@@ -113,6 +115,18 @@ class GearsSingleTest extends TestNGSuite with Matchers {
gears.functions.count(_.isInstanceOf[KrakenReportToJson]) shouldBe (if (kraken) 1 else 0)
}
@Test
def testNoSample: Unit = {
val map = ConfigUtils.mergeMaps(Map(
"output_dir" -> GearsSingleTest.outputDir
), Map(GearsSingleTest.executables.toSeq: _*))
val gears: GearsSingle = initPipeline(map)
intercept[IllegalArgumentException] {
gears.script()
}
}
// remove temporary run directory all tests in the class have been run
@AfterClass def removeTempOutputDir() = {
FileUtils.deleteDirectory(GearsSingleTest.outputDir)
......
......@@ -117,6 +117,7 @@ class Gentrap(val root: Configurable) extends QScript
),
// disable markduplicates since it may not play well with all aligners (this can still be overriden via config)
"mapping" -> Map(
"aligner" -> "gsnap",
"skip_markduplicates" -> true
)
)
......
......@@ -16,6 +16,7 @@ package nl.lumc.sasc.biopet.pipelines.gentrap.measures
import nl.lumc.sasc.biopet.core.annotations.AnnotationGtf
import nl.lumc.sasc.biopet.extensions.HtseqCount
import nl.lumc.sasc.biopet.extensions.picard.SortSam
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
......@@ -25,19 +26,31 @@ import org.broadinstitute.gatk.queue.QScript
class FragmentsPerGene(val root: Configurable) extends QScript with Measurement with AnnotationGtf {
def mergeArgs = MergeArgs(idCols = List(1), valCol = 2, numHeaderLines = 0, fallback = "0")
override def fixedValues: Map[String, Any] = Map("htseqcount" -> Map("order" -> "pos"))
override def fixedValues: Map[String, Any] = Map("htseqcount" -> Map("order" -> ""))
lazy val sortOnId: Boolean = config("sort_on_id", default = true)
/** Pipeline itself */
def biopetScript(): Unit = {
val jobs = bamFiles.map {
case (id, file) =>
//TODO: ID sorting job
val bamFile = if (sortOnId) {
val sortSam = new SortSam(this)
sortSam.input = file
sortSam.output = swapExt(outputDir, file, ".bam", ".idsorted.bam")
sortSam.sortOrder = "queryname"
sortSam.isIntermediate = true
add(sortSam)
sortSam.output
} else file
val job = new HtseqCount(this)
job.inputAnnotation = annotationGtf
job.inputAlignment = file
job.inputAlignment = bamFile
job.output = new File(outputDir, s"$id.$name.counts")
job.format = Option("bam")
job.order = if (sortOnId) Some("name") else Some("pos")
add(job)
id -> job
}
......
......@@ -17,7 +17,10 @@ package nl.lumc.sasc.biopet.pipelines.gentrap
import java.io.{ File, FileOutputStream }
import com.google.common.io.Files
import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, BiopetPipe }
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.gmap.Gsnap
import nl.lumc.sasc.biopet.extensions.hisat.Hisat2
import nl.lumc.sasc.biopet.extensions.tools.BaseCounter
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Config
......@@ -26,7 +29,7 @@ import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.{ DataProvider, Test }
abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNGSuite with Matchers {
abstract class GentrapTestAbstract(val expressionMeasure: String, val aligner: Option[String]) extends TestNGSuite with Matchers {
def initPipeline(map: Map[String, Any]): Gentrap = {
new Gentrap() {
......@@ -105,15 +108,18 @@ abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNG
val settings = Map(
"output_dir" -> GentrapTest.outputDir,
"gsnap" -> Map("db" -> "test", "dir" -> "test"),
"aligner" -> "gsnap",
"expression_measures" -> expMeasures,
"strand_protocol" -> strandProtocol
)
) ++ aligner.map("aligner" -> _)
val config = ConfigUtils.mergeMaps(settings ++ sampleConfig, Map(GentrapTest.executables.toSeq: _*))
val gentrap: Gentrap = initPipeline(config)
gentrap.script()
val functions = gentrap.functions.groupBy(_.getClass)
val functions = gentrap.functions.flatMap {
case f: BiopetFifoPipe => f.pipesJobs
case f: BiopetPipe => f.pipesJobs
case f => List(f)
}.groupBy(_.getClass)
val numSamples = sampleConfig("samples").size
if (expMeasures.contains("fragments_per_gene"))
......@@ -139,16 +145,29 @@ abstract class GentrapTestAbstract(val expressionMeasure: String) extends TestNG
assert(gentrap.functions.exists(_.isInstanceOf[Cufflinks]))
assert(gentrap.functions.exists(_.isInstanceOf[Ln]))
}
val classMap = Map(
"gsnap" -> classOf[Gsnap],
"tophat" -> classOf[Tophat],
"star" -> classOf[Star],
"star-2pass" -> classOf[Star],
"hisat2" -> classOf[Hisat2]
)
val alignerClass = classMap.get(aligner.getOrElse("gsnap"))
alignerClass.foreach(c => assert(functions.keys.exists(_ == c)))
classMap.values.filterNot(Some(_) == alignerClass).foreach(x => assert(!functions.keys.exists(_ == x)))
}
}
class GentrapFragmentsPerGeneTest extends GentrapTestAbstract("fragments_per_gene")
//class GentrapFragmentsPerExonTest extends GentrapTestAbstract("fragments_per_exon")
class GentrapBaseCountsTest extends GentrapTestAbstract("base_counts")
class GentrapCufflinksStrictTest extends GentrapTestAbstract("cufflinks_strict")
class GentrapCufflinksGuidedTest extends GentrapTestAbstract("cufflinks_guided")
class GentrapCufflinksBlindTest extends GentrapTestAbstract("cufflinks_blind")
class GentrapFragmentsPerGeneTest extends GentrapTestAbstract("fragments_per_gene", None)
//class GentrapFragmentsPerExonTest extends GentrapTestAbstract("fragments_per_exon", None)
class GentrapBaseCountsTest extends GentrapTestAbstract("base_counts", None)
class GentrapCufflinksStrictTest extends GentrapTestAbstract("cufflinks_strict", None)
class GentrapCufflinksGuidedTest extends GentrapTestAbstract("cufflinks_guided", None)
class GentrapCufflinksBlindTest extends GentrapTestAbstract("cufflinks_blind", None)
object GentrapTest {
val outputDir = Files.createTempDir()
......
......@@ -64,22 +64,24 @@ class TinyCap(val root: Configurable) extends QScript
"bowtie" -> Map(
"chunkmbs" -> 256,
"seedmms" -> 3,
"seedlen" -> 25,
"k" -> 5,
"best" -> true
"seedlen" -> 28,
"k" -> 3,
"best" -> true,
"strata" -> true
),
"sickle" -> Map(
"lengthThreshold" -> 15
),
"fastqc" -> Map(
"sensitiveAdapterSearch" -> true
"sensitiveAdapterSearch" -> false
),
"cutadapt" -> Map(
"error_rate" -> 0.2,
"error_rate" -> 0.1,
"minimum_length" -> 15,
"q" -> 30,
"default_clip_mode" -> "both",
"times" -> 2
"default_clip_mode" -> "3",
"times" -> 1,
"ignore_fastqc_adapters" -> true
)
)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment