Commit 14dcba60 authored by Peter van 't Hof's avatar Peter van 't Hof

Fixed agregated stats

parent 575110d4
......@@ -16,12 +16,12 @@ package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.summary.{Summarizable, SummaryQScript}
import nl.lumc.sasc.biopet.core.{Reference, ToolCommandFunction}
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommandFunction }
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.utils.{ConfigUtils, tryToParseNumber}
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, tryToParseNumber }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.io.Source
......
......@@ -15,7 +15,7 @@
package nl.lumc.sasc.biopet
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats
import nl.lumc.sasc.biopet.utils.{BiopetExecutable, MainCommand}
import nl.lumc.sasc.biopet.utils.{ BiopetExecutable, MainCommand }
object BiopetToolsExecutable extends BiopetExecutable {
......
......@@ -103,54 +103,20 @@ object BamStats extends ToolCommand {
val stats = waitOnFutures(processUnmappedReads(bamFile) :: contigsFutures.map(_._2))
val statsWriter = new PrintWriter(new File(outputDir, "stats.json"))
val statsWriter = new PrintWriter(new File(outputDir, "bamstats.json"))
val totalStats = stats.toSummaryMap
val statsMap = Map(
"total" -> stats.toSummaryMap,
"total" -> totalStats,
"contigs" -> contigsFutures.map(x => x._1 -> x._2.value.get.get.toSummaryMap).toMap
)
statsWriter.println(ConfigUtils.mapToJson(statsMap).spaces2)
statsWriter.println(ConfigUtils.mapToJson(statsMap).nospaces)
statsWriter.close()
//stats.writeStatsToFiles(outputDir)
// FIXME: getting aggregated stats
val clippingHistogram = tsvToMap(new File(outputDir, "clipping.tsv"))
val mappingQualityHistogram = tsvToMap(new File(outputDir, "mapping_quality.tsv"))
val summary = Map(
"flagstats" -> ConfigUtils.fileToConfigMap(new File(outputDir, "flagstats.summary.json")),
"flagstats_per_contig" -> referenceDict.getSequences.map {
c =>
c.getSequenceName -> ConfigUtils.fileToConfigMap(
new File(outputDir, "contigs" + File.separator + c.getSequenceName + File.separator + "flagstats.summary.json"))
}.toMap,
"mapping_quality" -> Map("general" -> aggregateStats(mappingQualityHistogram), "histogram" -> mappingQualityHistogram),
"clipping" -> Map("general" -> aggregateStats(clippingHistogram), "histogram" -> clippingHistogram)
)
val summaryWriter = new PrintWriter(new File(outputDir, "bamstats.summary.json"))
summaryWriter.println(ConfigUtils.mapToJson(summary).spaces2)
summaryWriter.println(ConfigUtils.mapToJson(totalStats).nospaces)
summaryWriter.close()
}
def aggregateStats(table: Map[String, Array[Long]]): Map[String, Any] = {
val values = table("value")
val counts = table("count")
require(values.size == counts.size)
if (values.nonEmpty) {
val modal = values(counts.indexOf(counts.max))
val totalCounts = counts.sum
val mean: Double = values.zip(counts).map(x => x._1 * x._2).sum.toDouble / totalCounts
val median: Long = values(values.zip(counts).zipWithIndex.sortBy(_._1._1).foldLeft((0L, 0)) {
case (a, b) =>
val total = a._1 + b._1._2
if (total >= totalCounts / 2) (total, a._2)
else (total, b._2)
}._2)
Map("min" -> values.min, "max" -> values.max, "median" -> median, "mean" -> mean, "modal" -> modal)
} else Map()
}
/**
* This will start the subjobs for each contig and collect [[Stats]] on contig level
*
......
......@@ -64,5 +64,22 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
}
class Histogram[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Numeric[T]) extends Counts[T](_counts) {
def aggregateStats: Map[String, Any] = {
val values = this.counts.keys.toList
val counts = this.counts.values.toList
require(values.size == counts.size)
if (values.nonEmpty) {
val modal = values(counts.indexOf(counts.max))
val totalCounts = counts.sum
val mean: Double = values.zip(counts).map(x => ord.toDouble(x._1) * x._2).sum / totalCounts
val median = values(values.zip(counts).zipWithIndex.sortBy(_._1._1).foldLeft((0L, 0)) {
case (a, b) =>
val total = a._1 + b._1._2
if (total >= totalCounts / 2) (total, a._2)
else (total, b._2)
}._2)
Map("min" -> values.min, "max" -> values.max, "median" -> median, "mean" -> mean, "modal" -> modal)
} else Map()
}
}
......@@ -62,13 +62,13 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
def toSummaryMap = {
Map(
"flagstats" -> flagstat.toSummaryMap,
"mapping_quality" -> mappingQualityHistogram.toSummaryMap,
"insert_size" -> insertSizeHistogram.toSummaryMap,
"clipping" -> clippingHistogram.toSummaryMap,
"left_clipping" -> leftClippingHistogram.toSummaryMap,
"right_clipping" -> rightClippingHistogram.toSummaryMap,
"5_prime_clipping" -> _5_ClippingHistogram.toSummaryMap,
"3_prime_clipping" -> _3_ClippingHistogram.toSummaryMap
"mapping_quality" -> Map("histrogram" -> mappingQualityHistogram.toSummaryMap, "general" -> mappingQualityHistogram.aggregateStats),
"insert_size" -> Map("histrogram" -> insertSizeHistogram.toSummaryMap, "general" -> insertSizeHistogram.aggregateStats),
"clipping" -> Map("histrogram" -> clippingHistogram.toSummaryMap, "general" -> clippingHistogram.aggregateStats),
"left_clipping" -> Map("histrogram" -> leftClippingHistogram.toSummaryMap, "general" -> leftClippingHistogram.aggregateStats),
"right_clipping" -> Map("histrogram" -> rightClippingHistogram.toSummaryMap, "general" -> rightClippingHistogram.aggregateStats),
"5_prime_clipping" -> Map("histrogram" -> _5_ClippingHistogram.toSummaryMap, "general" -> _5_ClippingHistogram.aggregateStats),
"3_prime_clipping" -> Map("histrogram" -> _3_ClippingHistogram.toSummaryMap, "general" -> _3_ClippingHistogram.aggregateStats)
)
}
}
......@@ -3,11 +3,11 @@ package nl.lumc.sasc.biopet.tools.vcfstats
import scala.collection.mutable
/**
* class to store all sample relative stats
*
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
* class to store all sample relative stats
*
* @param genotypeStats Stores all genotype relative stats
* @param sampleToSample Stores sample to sample compare stats
*/
case class SampleStats(genotypeStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
/** Add an other class */
......
package nl.lumc.sasc.biopet.tools.vcfstats
/**
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
* Class to store sample to sample compare stats
* @param genotypeOverlap Number of genotypes match with other sample
* @param alleleOverlap Number of alleles also found in other sample
*/
case class SampleToSampleStats(var genotypeOverlap: Int = 0,
var alleleOverlap: Int = 0) {
/** Add an other class */
......
package nl.lumc.sasc.biopet.tools.vcfstats
import java.io.{File, PrintWriter}
import java.io.{ File, PrintWriter }
import scala.collection.mutable
import nl.lumc.sasc.biopet.utils.sortAnyAny
/**
* General stats class to store vcf stats
*
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
* General stats class to store vcf stats
*
* @param generalStats Stores are general stats
* @param samplesStats Stores all sample/genotype specific stats
*/
case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
/** Add an other class */
......@@ -33,9 +33,9 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
def writeField(field: String, outputDir: File, prefix: String = "", chr: String = "total"): File = {
val file = (prefix, chr) match {
case ("", "total") => new File(outputDir, field + ".tsv")
case (_, "total") => new File(outputDir, prefix + "-" + field + ".tsv")
case ("", _) => new File(outputDir, chr + "-" + field + ".tsv")
case _ => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
case (_, "total") => new File(outputDir, prefix + "-" + field + ".tsv")
case ("", _) => new File(outputDir, chr + "-" + field + ".tsv")
case _ => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
}
val data = this.generalStats.getOrElse(chr, mutable.Map[String, mutable.Map[Any, Int]]()).getOrElse(field, mutable.Map[Any, Int]())
......@@ -65,9 +65,9 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
prefix: String = "", chr: String = "total"): Unit = {
val file = (prefix, chr) match {
case ("", "total") => new File(outputDir, field + ".tsv")
case (_, "total") => new File(outputDir, prefix + "-" + field + ".tsv")
case ("", _) => new File(outputDir, chr + "-" + field + ".tsv")
case _ => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
case (_, "total") => new File(outputDir, prefix + "-" + field + ".tsv")
case ("", _) => new File(outputDir, chr + "-" + field + ".tsv")
case _ => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
}
file.getParentFile.mkdirs()
......@@ -95,7 +95,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for one contig */
def getContigStats(contig: String,
samples: List[String],
genotypeFields: List[String] = Nil,
genotypeFields: List[String] = Nil,
infoFields: List[String] = Nil,
sampleDistributions: List[String] = Nil): Map[String, Any] = {
Map(
......@@ -107,7 +107,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for total */
def getTotalStats(samples: List[String],
genotypeFields: List[String] = Nil,
genotypeFields: List[String] = Nil,
infoFields: List[String] = Nil,
sampleDistributions: List[String] = Nil) =
getContigStats("total", samples, genotypeFields, infoFields, sampleDistributions)
......@@ -115,7 +115,7 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
/** This will generate stats for total and contigs separated */
def getAllStats(contigs: List[String],
samples: List[String],
genotypeFields: List[String] = Nil,
genotypeFields: List[String] = Nil,
infoFields: List[String] = Nil,
sampleDistributions: List[String] = Nil): Map[String, Any] = {
Map(
......
......@@ -14,22 +14,22 @@
*/
package nl.lumc.sasc.biopet.tools.vcfstats
import java.io.{File, FileOutputStream, PrintWriter}
import java.io.{ File, FileOutputStream, PrintWriter }
import htsjdk.samtools.util.Interval
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
import htsjdk.variant.variantcontext.{ Genotype, VariantContext }
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, FastaUtils, ToolCommand, VcfUtils }
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source
import scala.sys.process.{Process, ProcessLogger}
import scala.sys.process.{ Process, ProcessLogger }
import scala.util.Random
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future}
import scala.concurrent.{ Await, Future }
/**
* This tool will generate statistics from a vcf file
......@@ -243,7 +243,7 @@ object VcfStats extends ToolCommand {
}
chunkStats.toList.fold(createStats)(_ += _)
}
val stats = statsFutures.foldLeft(createStats) { case (a,b) => a += Await.result(b, Duration.Inf) }
val stats = statsFutures.foldLeft(createStats) { case (a, b) => a += Await.result(b, Duration.Inf) }
logger.info("Done reading vcf records")
......
......@@ -15,11 +15,10 @@
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.nio.file.{Files, Paths}
import java.nio.file.{ Files, Paths }
import htsjdk.variant.variantcontext.Allele
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.tools.vcfstats.{SampleStats, SampleToSampleStats, Stats, VcfStats}
import nl.lumc.sasc.biopet.tools.vcfstats.{ SampleStats, SampleToSampleStats, Stats, VcfStats }
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats._
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
......
......@@ -17,8 +17,8 @@ package nl.lumc.sasc.biopet.utils
import java.io.File
import java.util
import htsjdk.variant.variantcontext.{Allele, Genotype, VariantContext}
import htsjdk.variant.vcf.{VCFFileReader, VCFFilterHeaderLine, VCFHeader}
import htsjdk.variant.variantcontext.{ Allele, Genotype, VariantContext }
import htsjdk.variant.vcf.{ VCFFileReader, VCFFilterHeaderLine, VCFHeader }
import scala.collection.JavaConversions._
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment