From 410646217232693f62c27ee35c53cf4ec97d4f74 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Wed, 4 Feb 2015 16:56:54 +0100 Subject: [PATCH] Added AD fields --- .../nl/lumc/sasc/biopet/tools/VcfStats.scala | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala index 9efa75bbc..5d2f01c6c 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala @@ -30,6 +30,7 @@ object VcfStats extends ToolCommand { opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) => c.copy(outputDir = x) } + //TODO: add interval argument /* opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) => c.copy(intervals = Some(x)) @@ -162,7 +163,7 @@ object VcfStats extends ToolCommand { mergeNestedStatsMap(stats.generalStats, checkGeneral(record)) for (sample1 <- samples) yield { val genotype = record.getGenotype(sample1) - mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(genotype)) + mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype)) for (sample2 <- samples) { val genotype2 = record.getGenotype(sample2) if (genotype.getAlleles == genotype2.getAlleles) @@ -194,27 +195,44 @@ object VcfStats extends ToolCommand { Map("QUAL" -> Map(qual -> 1)) } - def checkGenotype(genotype: Genotype): Map[String, Map[Any, Int]] = { - val sample = genotype.getSampleName - val dp = if (genotype.hasDP) genotype.getDP else "not set" - val gq = if (genotype.hasGQ) genotype.getGQ else "not set" + def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = { + val buffer = mutable.Map[String, Map[Any, Int]]() - //TODO: add AD field + buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1) + buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1) - Map("DP" -> Map(dp -> 1), - "GQ" -> Map(gq -> 1)) + def addToBuffer(key:String, value:Any): Unit = { + val map = buffer.getOrElse(key, Map()) + buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1))) + } + + val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList + + if (genotype.hasAD) { + val ad = genotype.getAD + for (i <- 0 until ad.size if ad(i) > 0) { + addToBuffer("AD", ad(i)) + if (i == 0) addToBuffer("AD-ref", ad(i)) + if (i > 0) addToBuffer("AD-alt", ad(i)) + if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i)) + else addToBuffer("AD-not_used", ad(i)) + } + } + buffer.toMap } def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) { - val fields = List("DP", "GQ") + val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used") for (field <- fields) { val file = new File(prefix + field + ".tsv") file.getParentFile.mkdirs() val writer = new PrintWriter(file) writer.println(samples.mkString("\t", "\t", "")) - val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats(field).keySet).fold(Set[Any]())(_ ++ _) + val keySet = (for (sample <- samples) yield + stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any,Int]()).keySet).fold(Set[Any]())(_ ++ _) for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) { - val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats(field).getOrElse(key, 0) + val values = for (sample <- samples) yield + stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any,Int]()).getOrElse(key, 0) writer.println(values.mkString(key + "\t", "\t", "")) } writer.close() -- GitLab