Commit f6791cc0 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Remove old summary code

parent 83cf7809
......@@ -45,12 +45,6 @@ class Seqstat(val root: Configurable) extends BiopetCommandLineFunction with Sum
def cmdLine = required(executable) + required(input) + " > " + required(output)
def getSummary: Json = {
val json = Parse.parseOption(Source.fromFile(output).mkString)
if (json.isEmpty) return jNull
else return json.get.fieldOrEmptyObject("stats")
}
def summaryData: Map[String, Any] = {
val map = ConfigUtils.fileToConfigMap(output)
......@@ -79,65 +73,4 @@ object Seqstat {
seqstat.output = new File(outDir + fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
return seqstat
}
def mergeSummaries(jsons: List[Json]): Json = {
def addJson(json: Json, total: mutable.Map[String, Long]) {
for (key <- json.objectFieldsOrEmpty) {
if (json.field(key).get.isObject) addJson(json.field(key).get, total)
else if (json.field(key).get.isNumber) {
val number = json.field(key).get.numberOrZero.toLong
if (total.contains(key)) {
if (key == "len_min") {
if (total(key) > number) total(key) = number
} else if (key == "len_max") {
if (total(key) < number) total(key) = number
} else total(key) += number
} else total += (key -> number)
}
}
}
var basesTotal: mutable.Map[String, Long] = mutable.Map()
var readsTotal: mutable.Map[String, Long] = mutable.Map()
var encoding: Set[Json] = Set()
for (json <- jsons) {
encoding += json.fieldOrEmptyString("qual_encoding")
val bases = json.fieldOrEmptyObject("bases")
addJson(bases, basesTotal)
val reads = json.fieldOrEmptyObject("reads")
addJson(reads, readsTotal)
}
return ("bases" := (
("num_n" := basesTotal("num_n")) ->:
("num_total" := basesTotal("num_total")) ->:
("num_qual_gte" := (
("1" := basesTotal("1")) ->:
("10" := basesTotal("10")) ->:
("20" := basesTotal("20")) ->:
("30" := basesTotal("30")) ->:
("40" := basesTotal("40")) ->:
("50" := basesTotal("50")) ->:
("60" := basesTotal("60")) ->:
jEmptyObject
)) ->: jEmptyObject)) ->:
("reads" := (
("num_with_n" := readsTotal("num_with_n")) ->:
("num_total" := readsTotal("num_total")) ->:
("len_min" := readsTotal("len_min")) ->:
("len_max" := readsTotal("len_max")) ->:
("num_mean_qual_gte" := (
("1" := readsTotal("1")) ->:
("10" := readsTotal("10")) ->:
("20" := readsTotal("20")) ->:
("30" := readsTotal("30")) ->:
("40" := readsTotal("40")) ->:
("50" := readsTotal("50")) ->:
("60" := readsTotal("60")) ->:
jEmptyObject
)) ->: jEmptyObject)) ->:
("qual_encoding" := encoding.head) ->:
jEmptyObject
}
}
......@@ -105,32 +105,6 @@ class FastqSync(val root: Configurable) extends BiopetJavaCommandLineFunction wi
case _ => v1
}
}
// summary statistics
def summary: Json = {
val regex = new Regex("""Filtered (\d*) reads from first read file.
|Filtered (\d*) reads from second read file.
|Synced read files contain (\d*) reads.""".stripMargin,
"R1", "R2", "RL")
val (countFilteredR1, countFilteredR2, countRLeft) =
if (outputStats.exists) {
val text = Source
.fromFile(outputStats)
.getLines()
.mkString("\n")
regex.findFirstMatchIn(text) match {
case None => (0, 0, 0)
case Some(rmatch) => (rmatch.group("R1").toInt, rmatch.group("R2").toInt, rmatch.group("RL").toInt)
}
} else (0, 0, 0)
("num_reads_discarded_R1" := countFilteredR1) ->:
("num_reads_discarded_R2" := countFilteredR2) ->:
("num_reads_kept" := countRLeft) ->:
jEmptyObject
}
}
object FastqSync extends ToolCommand {
......@@ -210,30 +184,6 @@ object FastqSync extends ToolCommand {
(numDiscA, numDiscB, numKept)
}
/** Function to merge this tool's summary with summaries from other objects */
// TODO: refactor this into the object? At least make it work on the summary object
def mergeSummaries(jsons: List[Json]): Json = {
val (read1FilteredCount, read2FilteredCount, readsLeftCount) = jsons
// extract the values we require from each JSON object into tuples
.map {
case json =>
(json.field("num_reads_discarded_R1").get.numberOrZero.toInt,
json.field("num_reads_discarded_R2").get.numberOrZero.toInt,
json.field("num_reads_kept").get.numberOrZero.toInt)
}
// reduce the tuples
.reduceLeft {
(x: (Int, Int, Int), y: (Int, Int, Int)) =>
(x._1 + y._1, x._2 + y._2, x._3 + y._3)
}
("num_reads_discarded_R1" := read1FilteredCount) ->:
("num_reads_discarded_R2" := read2FilteredCount) ->:
("num_reads_kept" := readsLeftCount) ->:
jEmptyObject
}
case class Args(refFastq: File = new File(""),
inputFastq1: File = new File(""),
inputFastq2: File = new File(""),
......
......@@ -86,31 +86,6 @@ class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutada
}
def summaryFiles: Map[String, File] = Map("input" -> fastq_input, "output" -> fastq_output)
def getSummary: Json = {
val trimR = """.*Trimmed reads: *(\d*) .*""".r
val tooShortR = """.*Too short reads: *(\d*) .*""".r
val tooLongR = """.*Too long reads: *(\d*) .*""".r
val adapterR = """Adapter '([C|T|A|G]*)'.*trimmed (\d*) times.""".r
var stats: mutable.Map[String, Int] = mutable.Map("trimmed" -> 0, "tooshort" -> 0, "toolong" -> 0)
var adapter_stats: mutable.Map[String, Int] = mutable.Map()
if (stats_output.exists) for (line <- Source.fromFile(stats_output).getLines) {
line match {
case trimR(m) => stats += ("trimmed" -> m.toInt)
case tooShortR(m) => stats += ("tooshort" -> m.toInt)
case tooLongR(m) => stats += ("toolong" -> m.toInt)
case adapterR(adapter, count) => adapter_stats += (adapter -> count.toInt)
case _ =>
}
}
return ("num_reads_affected" := stats("trimmed")) ->:
("num_reads_discarded_too_short" := stats("tooshort")) ->:
("num_reads_discarded_too_long" := stats("toolong")) ->:
("adapters" := adapter_stats.toMap) ->:
jEmptyObject
}
}
object Cutadapt {
......@@ -121,30 +96,4 @@ object Cutadapt {
cutadapt.stats_output = new File(output.getAbsolutePath.substring(0, output.getAbsolutePath.lastIndexOf(".")) + ".stats")
return cutadapt
}
def mergeSummaries(jsons: List[Json]): Json = {
var affected = 0
var tooShort = 0
var tooLong = 0
var adapter_stats: mutable.Map[String, Int] = mutable.Map()
for (json <- jsons) {
affected += json.field("num_reads_affected").get.numberOrZero.toInt
tooShort += json.field("num_reads_discarded_too_short").get.numberOrZero.toInt
tooLong += json.field("num_reads_discarded_too_long").get.numberOrZero.toInt
val adapters = json.fieldOrEmptyObject("adapters")
for (key <- adapters.objectFieldsOrEmpty) {
val number = adapters.field(key).get.numberOrZero.toInt
if (adapter_stats.contains(key)) adapter_stats(key) += number
else adapter_stats += (key -> number)
}
}
return ("num_reads_affected" := affected) ->:
("num_reads_discarded_too_short" := tooShort) ->:
("num_reads_discarded_too_long" := tooLong) ->:
("adapters" := adapter_stats.toMap) ->:
jEmptyObject
}
}
......@@ -23,11 +23,7 @@ import org.broadinstitute.gatk.utils.commandline.Output
import scala.io.Source
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.utils.ConfigUtils
/**
* FastQC wrapper with added functionality for the Flexiprep pipeline
......@@ -142,32 +138,6 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
} else Set()
}
/**
* Summary of the FastQC run, stored in a [[Json]] object
*
* @deprecated
*/
def summary: Json = {
val outputMap =
Map("plot_duplication_levels" -> "Images/duplication_levels.png",
"plot_kmer_profiles" -> "Images/kmer_profiles.png",
"plot_per_base_gc_content" -> "Images/per_base_gc_content.png",
"plot_per_base_n_content" -> "Images/per_base_n_content.png",
"plot_per_base_quality" -> "Images/per_base_quality.png",
"plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png",
"plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png",
"plot_per_sequence_quality" -> "Images/per_sequence_quality.png",
"plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png",
"fastqc_data" -> "fastqc_data.txt")
.map {
case (name, relPath) =>
name -> Map("path" -> (outputDir + File.separator + relPath))
}
ConfigUtils.mapToJson(outputMap)
}
@Output
var outputFiles: List[File] = Nil
......
......@@ -61,13 +61,11 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
var fastqc_R1_after: Fastqc = _
var fastqc_R2_after: Fastqc = _
//val summary = new FlexiprepSummary(this)
def init() {
require(outputDir != null, "Missing output directory on flexiprep module")
require(input_R1 != null, "Missing input R1 on flexiprep module")
//require(sampleId != null, "Missing sample ID on flexiprep module")
//require(libId != null, "Missing library ID on flexiprep module")
require(sampleId != null, "Missing sample ID on flexiprep module")
require(libId != null, "Missing library ID on flexiprep module")
paired = input_R2.isDefined
......@@ -88,8 +86,6 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
}
case _ =>
}
//summary.out = new File(outputDir, sampleId.getOrElse("x") + "-" + libId.getOrElse("x") + ".qc.summary.json")
}
def biopetScript() {
......@@ -109,24 +105,14 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
fastqc_R1 = Fastqc(this, input_R1, new File(outputDir, R1_name + ".fastqc/"))
add(fastqc_R1)
//summary.addFastqc(fastqc_R1)
addSummarizable(fastqc_R1, "fastqc_R1")
outputFiles += ("fastqc_R1" -> fastqc_R1.output)
//val md5sum_R1 = Md5sum(this, input_R1, outputDir)
//add(md5sum_R1)
//summary.addMd5sum(md5sum_R1, R2 = false, after = false)
if (paired) {
fastqc_R2 = Fastqc(this, input_R2.get, new File(outputDir, R2_name + ".fastqc/"))
add(fastqc_R2)
addSummarizable(fastqc_R2, "fastqc_R2")
//summary.addFastqc(fastqc_R2, R2 = true)
outputFiles += ("fastqc_R2" -> fastqc_R2.output)
//val md5sum_R2 = Md5sum(this, input_R2.get, outputDir)
//add(md5sum_R2)
//summary.addMd5sum(md5sum_R2, R2 = true, after = false)
}
}
......@@ -165,14 +151,12 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
seqstat_R1.isIntermediate = true
add(seqstat_R1)
addSummarizable(seqstat_R1, "seqstat_R1")
//summary.addSeqstat(seqstat_R1, R2 = false, after = false, chunk)
if (paired) {
val seqstat_R2 = Seqstat(this, R2, outDir)
seqstat_R2.isIntermediate = true
add(seqstat_R2)
addSummarizable(seqstat_R2, "seqstat_R2")
//summary.addSeqstat(seqstat_R2, R2 = true, after = false, chunk)
}
if (!skipClip) { // Adapter clipping
......@@ -182,7 +166,6 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
cutadapt_R1.isIntermediate = true
add(cutadapt_R1)
addSummarizable(cutadapt_R1, "clipping_R1")
//summary.addCutadapt(cutadapt_R1, R2 = false, chunk)
R1 = cutadapt_R1.fastq_output
deps ::= R1
outputFiles += ("cutadapt_R1_stats" -> cutadapt_R1.stats_output)
......@@ -194,7 +177,6 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
cutadapt_R2.isIntermediate = true
add(cutadapt_R2)
addSummarizable(cutadapt_R2, "clipping_R2")
//summary.addCutadapt(cutadapt_R2, R2 = true, chunk)
R2 = cutadapt_R2.fastq_output
deps ::= R2
......@@ -208,7 +190,6 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
fqSync.deps :::= deps
add(fqSync)
addSummarizable(fqSync, "fastq_sync")
//summary.addFastqcSync(fqSync, chunk)
outputFiles += ("syncStats" -> fqSync.outputStats)
R1 = fqSync.outputFastq1
R2 = fqSync.outputFastq2
......@@ -230,7 +211,6 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
sickle.isIntermediate = true
add(sickle)
addSummarizable(sickle, "trimming")
//summary.addSickle(sickle, chunk)
R1 = sickle.output_R1
if (paired) R2 = sickle.output_R2
}
......@@ -238,13 +218,13 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
val seqstat_R1_after = Seqstat(this, R1, outDir)
seqstat_R1_after.deps = deps
add(seqstat_R1_after)
//summary.addSeqstat(seqstat_R1_after, R2 = false, after = true, chunk)
addSummarizable(seqstat_R1_after, "seqstat_R1_after")
if (paired) {
val seqstat_R2_after = Seqstat(this, R2, outDir)
seqstat_R2_after.deps = deps
add(seqstat_R2_after)
//summary.addSeqstat(seqstat_R2_after, R2 = true, after = true, chunk)
addSummarizable(seqstat_R2_after, "seqstat_R2_after")
}
outputFiles += (chunk + "output_R1" -> R1)
......@@ -264,27 +244,17 @@ class Flexiprep(val root: Configurable) extends QScript with SummaryQScript with
if (paired) outputFiles += ("output_R2_gzip" -> R2)
if (!skipTrim || !skipClip) {
val md5sum_R1 = Md5sum(this, R1, outputDir)
add(md5sum_R1)
//summary.addMd5sum(md5sum_R1, R2 = false, after = true)
if (paired) {
val md5sum_R2 = Md5sum(this, R2, outputDir)
add(md5sum_R2)
//summary.addMd5sum(md5sum_R2, R2 = true, after = true)
}
fastqc_R1_after = Fastqc(this, R1, new File(outputDir, R1_name + ".qc.fastqc/"))
add(fastqc_R1_after)
addSummarizable(fastqc_R1_after, "fastqc_R1_qc")
//summary.addFastqc(fastqc_R1_after, after = true)
if (paired) {
fastqc_R2_after = Fastqc(this, R2, new File(outputDir, R2_name + ".qc.fastqc/"))
add(fastqc_R2_after)
addSummarizable(fastqc_R2_after, "fastqc_R2_qc")
//summary.addFastqc(fastqc_R2_after, R2 = true, after = true)
}
}
//add(summary)
addSummaryJobs
}
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.PrintWriter
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Md5sum, Seqstat }
import nl.lumc.sasc.biopet.tools.FastqSync
import org.broadinstitute.gatk.queue.function.InProcessFunction
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import java.io.File
import argonaut._, Argonaut._
import scalaz._, Scalaz._
class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Configurable {
this.analysisName = getClass.getSimpleName
@Input(doc = "deps")
var deps: List[File] = Nil
@Output(doc = "Summary output", required = true)
var out: File = _
class Chunk {
var seqstatR1: Seqstat = _
var seqstatR2: Seqstat = _
var seqstatR1after: Seqstat = _
var seqstatR2after: Seqstat = _
var cutadaptR1: Cutadapt = _
var cutadaptR2: Cutadapt = _
var fastqSync: FastqSync = _
var sickle: Sickle = _
}
var chunks: Map[String, Chunk] = Map()
var md5R1: Md5sum = _
var md5R2: Md5sum = _
var md5R1after: Md5sum = _
var md5R2after: Md5sum = _
var fastqcR1: Fastqc = _
var fastqcR2: Fastqc = _
var fastqcR1after: Fastqc = _
var fastqcR2after: Fastqc = _
var flexiprep: Flexiprep = if (root.isInstanceOf[Flexiprep]) root.asInstanceOf[Flexiprep] else {
throw new IllegalStateException("Root is no instance of Flexiprep")
}
var resources: Map[String, Json] = Map()
def addFastqc(fastqc: Fastqc, R2: Boolean = false, after: Boolean = false): Fastqc = {
if (!R2 && !after) this.fastqcR1 = fastqc
else if (!R2 && after) this.fastqcR1after = fastqc
else if (R2 && !after) this.fastqcR2 = fastqc
else if (R2 && after) this.fastqcR2after = fastqc
deps ::= fastqc.output
return fastqc
}
def addMd5sum(md5sum: Md5sum, R2: Boolean = false, after: Boolean = false): Md5sum = {
if (!R2 && !after) this.md5R1 = md5sum
else if (!R2 && after) this.md5R1after = md5sum
else if (R2 && !after) this.md5R2 = md5sum
else if (R2 && after) this.md5R2after = md5sum
deps ::= md5sum.output
return md5sum
}
def addSeqstat(seqstat: Seqstat, R2: Boolean = false, after: Boolean = false, chunk: String = ""): Seqstat = {
if (!chunks.contains(chunk)) chunks += (chunk -> new Chunk)
if (!R2 && !after) chunks(chunk).seqstatR1 = seqstat
else if (!R2 && after) chunks(chunk).seqstatR1after = seqstat
else if (R2 && !after) chunks(chunk).seqstatR2 = seqstat
else if (R2 && after) chunks(chunk).seqstatR2after = seqstat
deps ::= seqstat.output
return seqstat
}
def addCutadapt(cutadapt: Cutadapt, R2: Boolean = false, chunk: String = ""): Cutadapt = {
if (!chunks.contains(chunk)) chunks += (chunk -> new Chunk)
if (!R2) chunks(chunk).cutadaptR1 = cutadapt
else chunks(chunk).cutadaptR2 = cutadapt
deps ::= cutadapt.stats_output
return cutadapt
}
def addSickle(sickle: Sickle, chunk: String = ""): Sickle = {
if (!chunks.contains(chunk)) chunks += (chunk -> new Chunk)
chunks(chunk).sickle = sickle
deps ::= sickle.output_stats
return sickle
}
def addFastqcSync(fastqSync: FastqSync, chunk: String = ""): FastqSync = {
if (!chunks.contains(chunk)) chunks += (chunk -> new Chunk)
chunks(chunk).fastqSync = fastqSync
deps ::= fastqSync.outputStats
fastqSync
}
// format: OFF
override def run {
logger.debug("Start")
md5Summary()
val summary =
("samples" := ( flexiprep.sampleId.getOrElse("x") :=
("libraries" := ( flexiprep.libId.getOrElse("x") := (
("flexiprep" := (
("clipping" := !flexiprep.skipClip) ->:
("trimming" := !flexiprep.skipTrim) ->:
("paired" := flexiprep.paired) ->:
jEmptyObject)) ->:
("stats" := (
("fastq" := seqstatSummary) ->:
("clipping" :=? clipstatSummary) ->?:
("trimming" :=? trimstatSummary) ->?:
jEmptyObject)) ->:
("resources" := (("raw_R1" := getResources(fastqcR1, md5R1)) ->:
("raw_R2" :?= getResources(fastqcR2, md5R2)) ->?:
("proc_R1" :?= getResources(fastqcR1after, md5R1after)) ->?:
("proc_R2" :?= getResources(fastqcR2after, md5R2after)) ->?:
jEmptyObject)) ->:
jEmptyObject ))->: jEmptyObject)->: jEmptyObject)->: jEmptyObject) ->: jEmptyObject
// format: ON
val summeryText = summary.spaces2
logger.debug("\n" + summeryText)
val writer = new PrintWriter(out)
writer.write(summeryText)
writer.close()
logger.debug("Stop")
}
def seqstatSummary(): Option[Json] = {
val R1_chunks = for ((key, value) <- chunks) yield value.seqstatR1.getSummary
val R1: Json = Seqstat.mergeSummaries(R1_chunks.toList)
val R2: Option[Json] = if (!flexiprep.paired) None
else if (chunks.size == 1) Option(chunks.head._2.seqstatR2.getSummary)
else {
val s = for ((key, value) <- chunks) yield value.seqstatR2.getSummary
Option(Seqstat.mergeSummaries(s.toList))
}
val R1_proc: Option[Json] = if (flexiprep.skipClip && flexiprep.skipTrim) None
else if (chunks.size == 1) Option(chunks.head._2.seqstatR1after.getSummary)
else {
val s = for ((key, value) <- chunks) yield value.seqstatR1after.getSummary
Option(Seqstat.mergeSummaries(s.toList))
}
val R2_proc: Option[Json] = if (!flexiprep.paired || (flexiprep.skipClip && flexiprep.skipTrim)) None
else if (chunks.size == 1) Option(chunks.head._2.seqstatR2after.getSummary)
else {
val s = for ((key, value) <- chunks) yield value.seqstatR2after.getSummary
Option(Seqstat.mergeSummaries(s.toList))
}