diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/Seqstat.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/Seqstat.scala index 7aba4994d5bfb0e7613cdf8b9cfd92e8f41fa7e1..5227fc289a0ac72a9782759e4e67f88bb3474ab6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/Seqstat.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/Seqstat.scala @@ -6,12 +6,12 @@ package nl.lumc.sasc.biopet.tools import java.io.File -import scala.annotation.tailrec +import org.broadinstitute.gatk.utils.commandline.{ Output, Input } + import scala.collection.JavaConverters._ -import scala.collection.JavaConversions._ import scala.collection.mutable -import scala.collection.parallel.mutable.ParMap -import scala.collection.mutable.{ Map => MutMap } +import scala.collection.immutable.{ Map } +import scala.io.Source import scala.language.postfixOps import htsjdk.samtools.fastq.{ FastqReader, FastqRecord } @@ -30,6 +30,23 @@ import nl.lumc.sasc.biopet.utils.ConfigUtils */ class Seqstat(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName + + @Input(doc = "Input FASTQ", shortName = "input", required = true) + var input: File = _ + + @Output(doc = "Output JSON", shortName = "output", required = true) + var output: File = _ + + override val defaultVmem = "4G" + memoryLimit = Option(3.0) + + override def commandLine = super.commandLine + required("-i", input) + " > " + required(output) + + def getSummary: Json = { + val json = Parse.parseOption(Source.fromFile(output).mkString) + if (json.isEmpty) return jNull + else return json.get.fieldOrEmptyObject("stats") + } } object FqEncoding extends Enumeration { @@ -40,6 +57,80 @@ object FqEncoding extends Enumeration { } object Seqstat extends ToolCommand { + def apply(root: Configurable, input: File, output: File): Seqstat = { + val seqstat = new Seqstat(root) + seqstat.input = input + seqstat.output = output + return seqstat + } + + def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = { + val seqstat = new Seqstat(root) + val ext = fastqfile.getName.substring(fastqfile.getName.lastIndexOf(".")) + seqstat.input = fastqfile + seqstat.output = new File(outDir + fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json") + return seqstat + } + + def mergeSummaries(jsons: List[Json]): Json = { + def addJson(json: Json, total: mutable.Map[String, Long]) { + for (key <- json.objectFieldsOrEmpty) { + if (json.field(key).get.isObject) addJson(json.field(key).get, total) + else if (json.field(key).get.isNumber) { + val number = json.field(key).get.numberOrZero.toLong + if (total.contains(key)) { + if (key == "len_min") { + if (total(key) > number) total(key) = number + } else if (key == "len_max") { + if (total(key) < number) total(key) = number + } else total(key) += number + } else total += (key -> number) + } + } + } + + var basesTotal: mutable.Map[String, Long] = mutable.Map() + var readsTotal: mutable.Map[String, Long] = mutable.Map() + var encoding: Set[Json] = Set() + for (json <- jsons) { + encoding += json.fieldOrEmptyString("qual_encoding") + + val bases = json.fieldOrEmptyObject("bases") + addJson(bases, basesTotal) + + val reads = json.fieldOrEmptyObject("reads") + addJson(reads, readsTotal) + } + return ("bases" := ( + ("num_n" := basesTotal("num_n")) ->: + ("num_total" := basesTotal("num_total")) ->: + ("num_qual_gte" := ( + ("1" := basesTotal("1")) ->: + ("10" := basesTotal("10")) ->: + ("20" := basesTotal("20")) ->: + ("30" := basesTotal("30")) ->: + ("40" := basesTotal("40")) ->: + ("50" := basesTotal("50")) ->: + ("60" := basesTotal("60")) ->: + jEmptyObject)) ->: jEmptyObject)) ->: + ("reads" := ( + ("num_with_n" := readsTotal("num_with_n")) ->: + ("num_total" := readsTotal("num_total")) ->: + ("len_min" := readsTotal("len_min")) ->: + ("len_max" := readsTotal("len_max")) ->: + ("num_mean_qual_gte" := ( + ("1" := readsTotal("1")) ->: + ("10" := readsTotal("10")) ->: + ("20" := readsTotal("20")) ->: + ("30" := readsTotal("30")) ->: + ("40" := readsTotal("40")) ->: + ("50" := readsTotal("50")) ->: + ("60" := readsTotal("60")) ->: + jEmptyObject)) ->: jEmptyObject)) ->: + ("qual_encoding" := encoding.head) ->: + jEmptyObject + } + import FqEncoding._ var phredEncoding: FqEncoding.Value = Sanger @@ -254,7 +345,7 @@ object Seqstat extends ToolCommand { val commandArgs: Args = parseArgs(args) - logger.info("Start") + logger.info("Start seqstat") val reader = new FastqReader(commandArgs.fastq) val numReads = seqStat(reader) @@ -262,7 +353,7 @@ object Seqstat extends ToolCommand { logger.debug(nucs) // logger.debug(baseStats) - logger.info("Done") + logger.info("Seqstat done") val report: Map[String, Any] = Map( ("files", @@ -292,7 +383,9 @@ object Seqstat extends ToolCommand { )) ) - val jsonReport: Json = ConfigUtils.mapToJson(report) + val jsonReport: Json = { + ConfigUtils.mapToJson(report) + } println(jsonReport.spaces2) } } diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index db5c638fd5e2f671c7e90ffd0b2ca80bd36d1a13..74047f3d82b217b629f46654b6644ae2ce882368 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -20,7 +20,8 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Argument } import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand } import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat, Seqstat } +import nl.lumc.sasc.biopet.extensions.{ Gzip, Pbzip2, Md5sum, Zcat } +import nl.lumc.sasc.biopet.tools.Seqstat import nl.lumc.sasc.biopet.tools.FastqSync class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala index 84acd18206b1f44e37e60b7c6d3b703e1b0d2d21..6bdaee51121da7632fd2039948c8dec16be1692b 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala @@ -17,7 +17,8 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import java.io.PrintWriter import nl.lumc.sasc.biopet.core.config.Configurable -import nl.lumc.sasc.biopet.extensions.{ Md5sum, Seqstat } +import nl.lumc.sasc.biopet.extensions.{ Md5sum } +import nl.lumc.sasc.biopet.tools.Seqstat import nl.lumc.sasc.biopet.tools.FastqSync import org.broadinstitute.gatk.queue.function.InProcessFunction import org.broadinstitute.gatk.utils.commandline.{ Input, Output }