From 11a6c2db87e7ae1626c7caf3d08cf2f1f5394f70 Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Wed, 4 Feb 2015 17:14:37 +0100
Subject: [PATCH] Added general stats

---
 .../nl/lumc/sasc/biopet/tools/VcfStats.scala  | 44 ++++++++++++-------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
index c60bdc769..b0d09de61 100644
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
@@ -114,15 +114,17 @@ object VcfStats extends ToolCommand {
     val samples = header.getSampleNamesInOrder.toList
 
     val intervals: List[Interval] = (
-      for (seq <- header.getSequenceDictionary.getSequences;
+      for (
+        seq <- header.getSequenceDictionary.getSequences;
         chunks = seq.getSequenceLength / 10000000;
-        i <- 1 until chunks) yield {
+        i <- 1 until chunks
+      ) yield {
         val size = seq.getSequenceLength / chunks
-        val begin = size * (i-1) + 1
+        val begin = size * (i - 1) + 1
         val end = if (i >= chunks) seq.getSequenceLength else size * i
         new Interval(seq.getSequenceName, begin, end)
       }
-      ).toList
+    ).toList
 
     val totalBases = intervals.foldRight(0L)(_.length() + _)
 
@@ -144,7 +146,7 @@ object VcfStats extends ToolCommand {
     var variantCounter = 0L
     var baseCounter = 0L
 
-    def status(count: Int, interval:Interval): Unit = {
+    def status(count: Int, interval: Interval): Unit = {
       variantCounter += count
       baseCounter += interval.length()
       val fraction = baseCounter.toFloat / totalBases * 100
@@ -158,8 +160,9 @@ object VcfStats extends ToolCommand {
       val stats = createStats
       logger.info("Starting on: " + interval)
 
-      for (record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd)
-           if record.getStart <= interval.getEnd) {
+      for (
+        record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd
+      ) {
         mergeNestedStatsMap(stats.generalStats, checkGeneral(record))
         for (sample1 <- samples) yield {
           val genotype = record.getGenotype(sample1)
@@ -201,13 +204,25 @@ object VcfStats extends ToolCommand {
     buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
     buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)
 
-    def addToBuffer(key:String, value:Any): Unit = {
+    def addToBuffer(key: String, value: Any): Unit = {
       val map = buffer.getOrElse(key, Map())
       buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
     }
 
     val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
 
+    if (genotype.isHet) addToBuffer("general", "Het")
+    if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef")
+    if (genotype.isHom) addToBuffer("general", "Hom")
+    if (genotype.isHomRef) addToBuffer("general", "HomRef")
+    if (genotype.isHomVar) addToBuffer("general", "HomVar")
+    if (genotype.isMixed) addToBuffer("general", "Mixed")
+    if (genotype.isNoCall) addToBuffer("general", "NoCall")
+    if (genotype.isNonInformative) addToBuffer("general", "NonInformative")
+    if (genotype.isAvailable) addToBuffer("general", "Available")
+    if (genotype.isCalled) addToBuffer("general", "Called")
+    if (genotype.isFiltered) addToBuffer("general", "Filtered")
+
     if (genotype.hasAD) {
       val ad = genotype.getAD
       for (i <- 0 until ad.size if ad(i) > 0) {
@@ -218,21 +233,20 @@ object VcfStats extends ToolCommand {
         else addToBuffer("AD-not_used", ad(i))
       }
     }
+
     buffer.toMap
   }
 
   def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) {
-    val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used")
+    val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
     for (field <- fields) {
       val file = new File(prefix + field + ".tsv")
       file.getParentFile.mkdirs()
       val writer = new PrintWriter(file)
       writer.println(samples.mkString("\t", "\t", ""))
-      val keySet = (for (sample <- samples) yield
-        stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any,Int]()).keySet).fold(Set[Any]())(_ ++ _)
+      val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
       for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) {
-        val values = for (sample <- samples) yield
-          stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any,Int]()).getOrElse(key, 0)
+        val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
         writer.println(values.mkString(key + "\t", "\t", ""))
       }
       writer.close()
@@ -315,9 +329,9 @@ object VcfStats extends ToolCommand {
 
     logger.info("Starting: " + command)
     val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
-    if (process.exitValue() == 0) logger.info("Done: " +  command)
+    if (process.exitValue() == 0) logger.info("Done: " + command)
     else {
-      logger.warn("Failed: " +  command)
+      logger.warn("Failed: " + command)
       if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
     }
   }
-- 
GitLab