From 0636ae0a52d096e98ef1417eb5175fc72270189b Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Sat, 10 Jan 2015 21:52:29 +0100
Subject: [PATCH] First version of VcfStats tool for #93

---
 .../nl/lumc/sasc/biopet/tools/plotHeatmap.R   | 24 ++++++
 .../nl/lumc/sasc/biopet/tools/VcfStats.scala  | 86 +++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
 create mode 100644 public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala

diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
new file mode 100644
index 000000000..4158db708
--- /dev/null
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
@@ -0,0 +1,24 @@
+library('gplots')
+
+args <- commandArgs(TRUE)
+inputArg <- args[1]
+outputArg <- args[2]
+outputArgClustering <- args[3]
+
+col <- heat.colors(250)
+col[250] <- "#00FF00"
+
+heat<-read.table(inputArg, header = 1, sep= '\t')
+rownames(heat) <- heat[,1]
+heat<- heat[,-1]
+
+heat<- as.matrix(heat)
+
+png(file = outputArg, width = 1000, height = 1000)
+heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none")
+dev.off()
+
+
+png(file = outputArgClustering, width = 1000, height = 1000)
+heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row")
+dev.off()
diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
new file mode 100644
index 000000000..e45a5daf9
--- /dev/null
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
@@ -0,0 +1,86 @@
+package nl.lumc.sasc.biopet.tools
+
+import java.io.{ PrintWriter, File }
+
+import htsjdk.variant.vcf.VCFFileReader
+import nl.lumc.sasc.biopet.core.ToolCommand
+import org.broadinstitute.gatk.utils.R.RScriptExecutor
+import org.broadinstitute.gatk.utils.io.Resource
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+/**
+ * Created by pjvan_thof on 1/10/15.
+ */
+class VcfStats {
+  //TODO: add Queue wrapper
+}
+
+object VcfStats extends ToolCommand {
+  case class Args(inputFile: File = null, outputDir: String = null) extends AbstractArgs
+
+  class OptParser extends AbstractOptParser {
+    opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(inputFile = x)
+    }
+    opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(outputDir = x)
+    }
+  }
+
+  /**
+   * @param args the command line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    logger.info("Started")
+    val argsParser = new OptParser
+    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
+
+    val reader = new VCFFileReader(commandArgs.inputFile, false)
+    val header = reader.getFileHeader
+    val samples = header.getSampleNamesInOrder.toList
+
+    val genotypeOverlap: mutable.Map[String, mutable.Map[String, Int]] = mutable.Map()
+    for (record <- reader) {
+      for (sample1 <- samples) {
+        for (sample2 <- samples) {
+          if (record.getGenotype(sample1).getAlleles == record.getGenotype(sample2).getAlleles) {
+            if (!genotypeOverlap.contains(sample1)) genotypeOverlap(sample1) = mutable.Map()
+            val current = genotypeOverlap(sample1).getOrElse(sample2, 0)
+            genotypeOverlap(sample1)(sample2) = current + 1
+          }
+        }
+      }
+    }
+
+    writeOverlap(genotypeOverlap, commandArgs.outputDir + "/sample_genotype_overlap", samples)
+
+    plot(new File(commandArgs.outputDir + "/sample_genotype_overlap.rel.tsv"))
+
+    logger.info("Done")
+  }
+
+  def writeOverlap(overlap: mutable.Map[String, mutable.Map[String, Int]], prefix: String, samples: List[String]): Unit = {
+    val absWriter = new PrintWriter(new File(prefix + ".abs.tsv"))
+    val relWriter = new PrintWriter(new File(prefix + ".rel.tsv"))
+
+    absWriter.println(samples.mkString("\t", "\t", ""))
+    relWriter.println(samples.mkString("\t", "\t", ""))
+    for (sample1 <- samples) {
+      val values = for (sample2 <- samples) yield overlap.getOrElse(sample1, mutable.Map()).getOrElse(sample2, 0)
+      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))
+
+      val total = overlap.getOrElse(sample1, mutable.Map()).getOrElse(sample1, 0)
+      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
+    }
+    absWriter.close()
+    relWriter.close()
+  }
+
+  def plot(file: File) {
+    val executor = new RScriptExecutor
+    executor.addScript(new Resource("plotHeatmap.R", getClass))
+    executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
+    executor.exec()
+  }
+}
-- 
GitLab