Merge branch 'feature-vcf_stats' into 'develop'

Feature vcf stats More functions can be added but this is a first version that is ready to use in the gatk pipeline. is also fix for #93 See merge request !84

Merge branch 'feature-vcf_stats' into 'develop'
1a870a62 · bow · 74077f92 · 74f11c8e · 1a870a62 · 74077f92
Commit 1a870a62 authored 10 years ago by bow
--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala
+++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala
@@ -7,7 +7,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk

 import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
 import java.io.File
-import nl.lumc.sasc.biopet.tools.{ MpileupToVcf, VcfFilter, MergeAlleles }
+import nl.lumc.sasc.biopet.tools.{ VcfStats, MpileupToVcf, VcfFilter, MergeAlleles }
 import nl.lumc.sasc.biopet.core.config.Configurable
 import nl.lumc.sasc.biopet.extensions.gatk.{ AnalyzeCovariates, BaseRecalibrator, GenotypeGVCFs, HaplotypeCaller, IndelRealigner, PrintReads, RealignerTargetCreator, SelectVariants, CombineVariants, UnifiedGenotyper }
 import nl.lumc.sasc.biopet.extensions.picard.MarkDuplicates
@@ -197,6 +197,12 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
      val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
      cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
      add(cvFinal)
+
+      val vcfStats = new VcfStats(this)
+      vcfStats.input = cvFinal.out
+      vcfStats.setOutputDir(outputDir + File.separator + "vcfstats")
+      add(vcfStats)
+
      scriptOutput.finalVcfFile = cvFinal.out
    }
  }

--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala
+++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVcfSampleCompare.scala
-/**
- * Due to the license issue with GATK, this part of Biopet can only be used inside the
- * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
- * on how to use this protected part of biopet or contact us at sasc@lumc.nl
- */
-package nl.lumc.sasc.biopet.pipelines.gatk
-
-import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
-import java.io.File
-import nl.lumc.sasc.biopet.core.config.Configurable
-import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
-import nl.lumc.sasc.biopet.extensions.gatk.SelectVariants
-import nl.lumc.sasc.biopet.extensions.gatk.VariantEval
-import org.broadinstitute.gatk.queue.QScript
-import org.broadinstitute.gatk.utils.commandline.{ Input, Argument }
-
-class GatkVcfSampleCompare(val root: Configurable) extends QScript with BiopetQScript {
-  def this() = this(null)
-
-  @Input(doc = "Sample vcf file(s)", shortName = "V")
-  var vcfFiles: List[File] = _
-
-  @Argument(doc = "Reference", shortName = "R", required = false)
-  var reference: File = config("reference")
-
-  @Argument(doc = "Target bed", shortName = "targetBed", required = false)
-  var targetBed: List[File] = Nil
-
-  @Argument(doc = "Samples", shortName = "sample", required = false)
-  var samples: List[String] = Nil
-
-  var vcfFile: File = _
-  var sampleVcfs: Map[String, File] = Map()
-  def generalSampleDir = outputDir + "samples/"
-
-  def init() {
-    if (config.contains("target_bed"))
-      for (bed <- config("target_bed").asList)
-        targetBed :+= bed.toString
-    if (outputDir == null) throw new IllegalStateException("Missing Output directory on gatk module")
-    else if (!outputDir.endsWith("/")) outputDir += "/"
-  }
-
-  def biopetScript() {
-    vcfFile = if (vcfFiles.size > 1) {
-      val combineVariants = CombineVariants(this, vcfFiles, outputDir + "merge.vcf")
-      add(combineVariants)
-      combineVariants.out
-    } else vcfFiles.head
-
-    for (sample <- samples) {
-      sampleVcfs += (sample -> new File(generalSampleDir + sample + File.separator + sample + ".vcf"))
-      val selectVariants = SelectVariants(this, vcfFile, sampleVcfs(sample))
-      selectVariants.sample_name = Seq(sample)
-      selectVariants.excludeNonVariants = true
-      add(selectVariants)
-    }
-
-    val sampleCompareMetrics = new SampleCompareMetrics(this)
-    sampleCompareMetrics.samples = samples
-    sampleCompareMetrics.sampleDir = generalSampleDir
-    sampleCompareMetrics.snpRelFile = outputDir + "compare.snp.rel.tsv"
-    sampleCompareMetrics.snpAbsFile = outputDir + "compare.snp.abs.tsv"
-    sampleCompareMetrics.indelRelFile = outputDir + "compare.indel.rel.tsv"
-    sampleCompareMetrics.indelAbsFile = outputDir + "compare.indel.abs.tsv"
-    sampleCompareMetrics.totalFile = outputDir + "total.tsv"
-
-    for ((sample, sampleVcf) <- sampleVcfs) {
-      val sampleDir = generalSampleDir + sample + File.separator
-      for ((compareSample, compareSampleVcf) <- sampleVcfs) {
-        val variantEval = VariantEval(this,
-          sampleVcf,
-          compareSampleVcf,
-          new File(sampleDir + sample + "-" + compareSample + ".eval.txt"),
-          Seq("VariantType", "CompRod"),
-          Seq("CompOverlap")
-        )
-        if (targetBed != null) variantEval.L = targetBed
-        add(variantEval)
-        sampleCompareMetrics.deps ::= variantEval.out
-      }
-    }
-    add(sampleCompareMetrics)
-  }
-}
-
-object GatkVcfSampleCompare extends PipelineCommand
--- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala
+++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/SampleCompareMetrics.scala
-/**
- * Due to the license issue with GATK, this part of Biopet can only be used inside the
- * LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions
- * on how to use this protected part of biopet or contact us at sasc@lumc.nl
- */
-package nl.lumc.sasc.biopet.pipelines.gatk
-
-import java.io.File
-import java.io.PrintWriter
-import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
-import nl.lumc.sasc.biopet.core.config.Configurable
-import org.broadinstitute.gatk.utils.R.RScriptExecutor
-import org.broadinstitute.gatk.utils.commandline.{ Output, Argument }
-import scala.io.Source
-import org.broadinstitute.gatk.utils.R.{ RScriptLibrary, RScriptExecutor }
-import org.broadinstitute.gatk.utils.io.Resource
-import scala.collection.mutable.Map
-import scala.math._
-
-class SampleCompareMetrics(val root: Configurable) extends BiopetJavaCommandLineFunction {
-  javaMainClass = getClass.getName
-
-  @Argument(doc = "Sample Dir", shortName = "sampleDir", required = true)
-  var sampleDir: String = _
-
-  @Argument(doc = "Samples", shortName = "sample", required = true)
-  var samples: List[String] = Nil
-
-  @Argument(doc = "File sufix", shortName = "sufix", required = false)
-  var fileSufix: String = _
-
-  @Output(doc = "snpRelFile", shortName = "snpRelFile", required = true)
-  var snpRelFile: File = _
-
-  @Output(doc = "snpAbsFile", shortName = "snpAbsFile", required = true)
-  var snpAbsFile: File = _
-
-  @Output(doc = "indelRelFile", shortName = "indelRelFile", required = true)
-  var indelRelFile: File = _
-
-  @Output(doc = "indelAbsFile", shortName = "indelAbsFile", required = true)
-  var indelAbsFile: File = _
-
-  @Output(doc = "totalFile", shortName = "totalFile", required = true)
-  var totalFile: File = _
-
-  override val defaultVmem = "8G"
-  memoryLimit = Option(4.0)
-
-  override def commandLine = super.commandLine +
-    required("-sampleDir", sampleDir) +
-    repeat("-sample", samples) +
-    optional("-fileSufix", fileSufix) +
-    required("-snpRelFile", snpRelFile) +
-    required("-snpAbsFile", snpAbsFile) +
-    required("-indelRelFile", indelRelFile) +
-    required("-indelAbsFile", indelAbsFile) +
-    required("-totalFile", totalFile)
-}
-
-object SampleCompareMetrics {
-  var sampleDir: String = _
-  var samples: List[String] = Nil
-  var fileSufix: String = ".eval.txt"
-  var snpRelFile: File = _
-  var snpAbsFile: File = _
-  var indelRelFile: File = _
-  var indelAbsFile: File = _
-  var totalFile: File = _
-  /**
-   * @param args the command line arguments
-   */
-  def main(args: Array[String]): Unit = {
-
-    for (t <- 0 until args.size) {
-      args(t) match {
-        case "-sample"       => samples +:= args(t + 1)
-        case "-sampleDir"    => sampleDir = args(t + 1)
-        case "-fileSufix"    => fileSufix = args(t + 1)
-        case "-snpRelFile"   => snpRelFile = new File(args(t + 1))
-        case "-snpAbsFile"   => snpAbsFile = new File(args(t + 1))
-        case "-indelRelFile" => indelRelFile = new File(args(t + 1))
-        case "-indelAbsFile" => indelAbsFile = new File(args(t + 1))
-        case "-totalFile"    => totalFile = new File(args(t + 1))
-        case _               =>
-      }
-    }
-    if (sampleDir == null) throw new IllegalStateException("No sampleDir, use -sampleDir")
-    else if (!sampleDir.endsWith("/")) sampleDir += "/"
-
-    val regex = """\W+""".r
-    val snpsOverlap: Map[(String, String), Int] = Map()
-    val indelsOverlap: Map[(String, String), Int] = Map()
-    val snpsTotal: Map[String, Int] = Map()
-    val indelsTotal: Map[String, Int] = Map()
-    for (sample1 <- samples; sample2 <- samples) {
-      val reader = Source.fromFile(new File(sampleDir + sample1 + "/" + sample1 + "-" + sample2 + fileSufix))
-      for (line <- reader.getLines) {
-        regex.split(line) match {
-          case Array(_, _, _, varType, all, novel, overlap, rate, _*) => {
-            varType match {
-              case "SNP" => {
-                snpsOverlap += (sample1, sample2) -> overlap.toInt
-                snpsTotal += sample1 -> all.toInt
-              }
-              case "INDEL" => {
-                indelsOverlap += (sample1, sample2) -> overlap.toInt
-                indelsTotal += sample1 -> all.toInt
-              }
-              case _ =>
-            }
-          }
-          case _ =>
-        }
-      }
-      reader.close()
-    }
-
-    val snpRelWritter = new PrintWriter(snpRelFile)
-    val snpAbsWritter = new PrintWriter(snpAbsFile)
-    val indelRelWritter = new PrintWriter(indelRelFile)
-    val indelAbsWritter = new PrintWriter(indelAbsFile)
-
-    val allWritters = List(snpRelWritter, snpAbsWritter, indelRelWritter, indelAbsWritter)
-    for (writter <- allWritters) writter.println(samples.mkString("\t", "\t", ""))
-    for (sample1 <- samples) {
-      for (writter <- allWritters) writter.print(sample1)
-      for (sample2 <- samples) {
-        snpRelWritter.print("\t" + (round((snpsOverlap(sample1, sample2).toDouble / snpsTotal(sample1) * 10000.0)) / 10000.0))
-        snpAbsWritter.print("\t" + snpsOverlap(sample1, sample2))
-        indelRelWritter.print("\t" + (round((indelsOverlap(sample1, sample2).toDouble / indelsTotal(sample1) * 10000.0)) / 10000.0))
-        indelAbsWritter.print("\t" + indelsOverlap(sample1, sample2))
-      }
-      for (writter <- allWritters) writter.println()
-    }
-    for (writter <- allWritters) writter.close()
-
-    val totalWritter = new PrintWriter(totalFile)
-    totalWritter.println("Sample\tSNPs\tIndels")
-    for (sample <- samples)
-      totalWritter.println(sample + "\t" + snpsTotal(sample) + "\t" + indelsTotal(sample))
-    totalWritter.close()
-
-    def plot(file: File) {
-      val executor = new RScriptExecutor
-      executor.addScript(new Resource("plotHeatmap.R", getClass))
-      executor.addArgs(file, file.getAbsolutePath.stripSuffix(".tsv") + ".png", file.getAbsolutePath.stripSuffix(".tsv") + ".clustering.png")
-      executor.exec()
-    }
-    plot(snpRelFile)
-    plot(indelRelFile)
-  }
-}
\ No newline at end of file
--- a/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala
+++ b/protected/biopet-protected-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutableProtected.scala
@@ -12,7 +12,6 @@ object BiopetExecutableProtected extends BiopetExecutable {
    nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantcalling,
    nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline,
    nl.lumc.sasc.biopet.pipelines.gatk.GatkVariantRecalibration,
-    nl.lumc.sasc.biopet.pipelines.gatk.GatkVcfSampleCompare,
    nl.lumc.sasc.biopet.pipelines.basty.Basty)

  def tools = BiopetExecutablePublic.tools

--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/pipelines/gatk/plotHeatmap.R
-library('gplots')
-
-args <- commandArgs(TRUE)
-inputArg <- args[1]
-outputArg <- args[2]
-outputArgClustering <- args[3]
-
-col <- heat.colors(250)
-col[250] <- "#00FF00"
-
-heat<-read.table(inputArg, header = 1, sep= '\t')
-rownames(heat) <- heat[,1]
-heat<- heat[,-1]
-
-heat<- as.matrix(heat)
-
-png(file = outputArg, width = 1000, height = 1000)
-heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none")
-dev.off()
-
-
-png(file = outputArgClustering, width = 1000, height = 1000)
-heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row")
-dev.off()
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotHeatmap.R
+library('gplots')
+library('RColorBrewer')
+
+args <- commandArgs(TRUE)
+inputArg <- args[1]
+outputArg <- args[2]
+outputArgClustering <- args[3]
+outputArgDendrogram <- args[4]
+
+
+heat<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
+#heat[heat==1] <- NA
+rownames(heat) <- heat[,1]
+heat<- heat[,-1]
+heat<- as.matrix(heat)
+
+colNumber <- 50
+col <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(colNumber))
+for (i in (colNumber+1):(colNumber+round((dist(range(heat)) - dist(range(heat[heat < 1]))) / dist(range(heat[heat < 1])) * colNumber))) {
+    col[i] <- col[colNumber]
+}
+col[length(col)] <- "#00FF00"
+
+png(file = outputArg, width = 1200, height = 1200)
+heatmap.2(heat, trace = 'none', col = col, Colv=NA, Rowv=NA, dendrogram="none", margins = c(12, 12), na.color="#00FF00")
+dev.off()
+
+hc <- hclust(d = dist(heat))
+png(file = outputArgDendrogram, width = 1200, height = 1200)
+plot(as.dendrogram(hc), horiz=TRUE, asp=0.02)
+dev.off()
+
+png(file = outputArgClustering, width = 1200, height = 1200)
+heatmap.2(heat, trace = 'none', col = col, Colv="Rowv", dendrogram="row",margins = c(12, 12), na.color="#00FF00")
+dev.off()
--- a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R
+++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/tools/plotXY.R
+library('ggplot2')
+library('reshape2')
+
+args <- commandArgs(TRUE)
+inputArg <- args[1]
+outputArg <- args[2]
+
+tsv<-read.table(inputArg, header = 1, sep= '\t', stringsAsFactors = F)
+
+data <- melt(tsv)
+
+data$X <- as.numeric(data$X)
+data <- na.omit(data)
+data <- data[data$value > 0,]
+
+print("Starting to plot")
+png(file = outputArg, width = 1500, height = 1500)
+ggplot(data, aes(x=X, y=value, color=variable, group=variable)) + geom_line()
+dev.off()
+print("plot done")
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/RunGubbins.scala
@@ -59,7 +59,7 @@ class RunGubbins(val root: Configurable) extends BiopetCommandLineFunction {
      ".filtered_polymorphic_sites.fasta",
      ".filtered_polymorphic_sites.phylip",
      ".final_tree.tre")
-    for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix + t)
+    for (t <- out) outputFiles ::= new File(outputDirectory + File.separator + prefix.getOrElse("gubbins") + t)
  }

  def cmdLine = required("cd", outputDirectory) + " && " + required(executable) +

--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala
+package nl.lumc.sasc.biopet.tools
+
+import java.io.{ FileOutputStream, PrintWriter, File }
+
+import htsjdk.variant.variantcontext.{ VariantContext, Genotype }
+import htsjdk.variant.vcf.VCFFileReader
+import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
+import nl.lumc.sasc.biopet.core.config.Configurable
+import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.sys.process.{ Process, ProcessLogger }
+import htsjdk.samtools.util.Interval
+
+/**
+ * Created by pjvan_thof on 1/10/15.
+ */
+class VcfStats(val root: Configurable) extends BiopetJavaCommandLineFunction {
+  javaMainClass = getClass.getName
+
+  @Input(doc = "Input fastq", shortName = "I", required = true)
+  var input: File = _
+
+  protected var outputDir: String = _
+
+  /**
+   * Set output dir and a output file
+   * @param dir
+   */
+  def setOutputDir(dir: String): Unit = {
+    outputDir = dir
+    this.jobOutputFile = new File(dir + File.separator + ".vcfstats.out")
+  }
+
+  /**
+   * Creates command to execute extension
+   * @return
+   */
+  override def commandLine = super.commandLine +
+    required("-I", input) +
+    required("-o", outputDir)
+}
+
+object VcfStats extends ToolCommand {
+  /** Commandline argument */
+  case class Args(inputFile: File = null, outputDir: String = null, intervals: Option[File] = None) extends AbstractArgs
+
+  /** Parsing commandline arguments */
+  class OptParser extends AbstractOptParser {
+    opt[File]('I', "inputFile") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(inputFile = x)
+    }
+    opt[String]('o', "outputDir") required () unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(outputDir = x)
+    }
+    //TODO: add interval argument
+    /*
+    opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
+      c.copy(intervals = Some(x))
+    }
+    */
+  }
+
+  /**
+   * Class to store sample to sample compare stats
+   * @param genotypeOverlap Number of genotypes match with other sample
+   * @param alleleOverlap Number of alleles also found in other sample
+   */
+  case class SampleToSampleStats(var genotypeOverlap: Int = 0,
+                                 var alleleOverlap: Int = 0) {
+    /** Add an other class */
+    def +=(other: SampleToSampleStats) {
+      this.genotypeOverlap += other.genotypeOverlap
+      this.alleleOverlap += other.alleleOverlap
+    }
+  }
+
+  /**
+   * class to store all sample relative stats
+   * @param genotypeStats Stores all genotype relative stats
+   * @param sampleToSample Stores sample to sample compare stats
+   */
+  case class SampleStats(val genotypeStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
+                         val sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
+    /** Add an other class */
+    def +=(other: SampleStats): Unit = {
+      for ((key, value) <- other.sampleToSample) {
+        if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
+        else this.sampleToSample(key) = value
+      }
+      for ((field, fieldMap) <- other.genotypeStats) {
+        val thisField = this.genotypeStats.get(field)
+        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
+        else this.genotypeStats += field -> fieldMap
+      }
+    }
+  }
+
+  /**
+   * General stats class to store vcf stats
+   * @param generalStats Stores are general stats
+   * @param samplesStats Stores all sample/genotype specific stats
+   */
+  case class Stats(val generalStats: mutable.Map[String, mutable.Map[Any, Int]] = mutable.Map(),
+                   val samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
+    /** Add an other class */
+    def +=(other: Stats): Stats = {
+      for ((key, value) <- other.samplesStats) {
+        if (this.samplesStats.contains(key)) this.samplesStats(key) += value
+        else this.samplesStats(key) = value
+      }
+      for ((field, fieldMap) <- other.generalStats) {
+        val thisField = this.generalStats.get(field)
+        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
+        else this.generalStats += field -> fieldMap
+      }
+      this
+    }
+  }
+
+  /**
+   * Merge m2 into m1
+   * @param m1
+   * @param m2
+   */
+  def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
+    for (key <- m2.keySet)
+      m1(key) = m1.getOrElse(key, 0) + m2(key)
+  }
+
+  /**
+   * Merge m2 into m1
+   * @param m1
+   * @param m2
+   */
+  def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]], m2: Map[String, Map[Any, Int]]): Unit = {
+    for ((field, fieldMap) <- m2) {
+      if (m1.contains(field)) {
+        for ((key, value) <- fieldMap) {
+          if (m1(field).contains(key)) m1(field)(key) += value
+          else m1(field)(key) = value
+        }
+      } else m1(field) = mutable.Map(fieldMap.toList: _*)
+    }
+  }
+
+  protected var commandArgs: Args = _
+
+  /**
+   * @param args the command line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    logger.info("Started")
+    val argsParser = new OptParser
+    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
+
+    val reader = new VCFFileReader(commandArgs.inputFile, true)
+    val header = reader.getFileHeader
+    val samples = header.getSampleNamesInOrder.toList
+
+    val intervals: List[Interval] = (
+      for (
+        seq <- header.getSequenceDictionary.getSequences;
+        chunks = seq.getSequenceLength / 10000000;
+        i <- 1 until chunks
+      ) yield {
+        val size = seq.getSequenceLength / chunks
+        val begin = size * (i - 1) + 1
+        val end = if (i >= chunks) seq.getSequenceLength else size * i
+        new Interval(seq.getSequenceName, begin, end)
+      }
+    ).toList
+
+    val totalBases = intervals.foldRight(0L)(_.length() + _)
+
+    // Reading vcf records
+    logger.info("Start reading vcf records")
+
+    def createStats: Stats = {
+      val stats = new Stats
+      //init stats
+      for (sample1 <- samples) {
+        stats.samplesStats += sample1 -> new SampleStats
+        for (sample2 <- samples) {
+          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
+        }
+      }
+      stats
+    }
+
+    var variantCounter = 0L
+    var baseCounter = 0L
+
+    def status(count: Int, interval: Interval): Unit = {
+      variantCounter += count
+      baseCounter += interval.length()
+      val fraction = baseCounter.toFloat / totalBases * 100
+      logger.info(interval + " done, " + count + " rows processed")
+      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
+    }
+
+    val statsChunks = for (interval <- intervals.par) yield {
+      val reader = new VCFFileReader(commandArgs.inputFile, true)
+      var chunkCounter = 0
+      val stats = createStats
+      logger.info("Starting on: " + interval)
+
+      for (
+        record <- reader.query(interval.getSequence, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd
+      ) {
+        mergeNestedStatsMap(stats.generalStats, checkGeneral(record))
+        for (sample1 <- samples) yield {
+          val genotype = record.getGenotype(sample1)
+          mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype))
+          for (sample2 <- samples) {
+            val genotype2 = record.getGenotype(sample2)
+            if (genotype.getAlleles == genotype2.getAlleles)
+              stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
+            stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += genotype.getAlleles.count(allele => genotype2.getAlleles.exists(_.basesMatch(allele)))
+          }
+        }
+        chunkCounter += 1
+      }
+      status(chunkCounter, interval)
+      stats
+    }
+
+    val stats = statsChunks.toList.fold(createStats)(_ += _)
+
+    logger.info("Done reading vcf records")
+
+    writeField("QUAL", stats.generalStats.getOrElse("QUAL", mutable.Map()))
+    writeField("general", stats.generalStats.getOrElse("general", mutable.Map()))
+    writeGenotypeFields(stats, commandArgs.outputDir + "/genotype_", samples)
+    writeOverlap(stats, _.genotypeOverlap, commandArgs.outputDir + "/sample_compare/genotype_overlap", samples)
+    writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples)
+
+    logger.info("Done")
+  }
+
+  /**
+   * Function to check all general stats, all info expect sample/genotype specific stats
+   * @param record
+   * @return
+   */
+  protected def checkGeneral(record: VariantContext): Map[String, Map[Any, Int]] = {
+    val buffer = mutable.Map[String, Map[Any, Int]]()
+
+    def addToBuffer(key: String, value: Any): Unit = {
+      val map = buffer.getOrElse(key, Map())
+      buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
+    }
+
+    buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1)
+
+    addToBuffer("general", "Total")
+    if (record.isBiallelic) addToBuffer("general", "Biallelic")
+    if (record.isComplexIndel) addToBuffer("general", "ComplexIndel")
+    if (record.isFiltered) addToBuffer("general", "Filtered")
+    if (record.isFullyDecoded) addToBuffer("general", "FullyDecoded")
+    if (record.isIndel) addToBuffer("general", "Indel")
+    if (record.isMixed) addToBuffer("general", "Mixed")
+    if (record.isMNP) addToBuffer("general", "MNP")
+    if (record.isMonomorphicInSamples) addToBuffer("general", "MonomorphicInSamples")
+    if (record.isNotFiltered) addToBuffer("general", "NotFiltered")
+    if (record.isPointEvent) addToBuffer("general", "PointEvent")
+    if (record.isPolymorphicInSamples) addToBuffer("general", "PolymorphicInSamples")
+    if (record.isSimpleDeletion) addToBuffer("general", "SimpleDeletion")
+    if (record.isSimpleInsertion) addToBuffer("general", "SimpleInsertion")
+    if (record.isSNP) addToBuffer("general", "SNP")
+    if (record.isStructuralIndel) addToBuffer("general", "StructuralIndel")
+    if (record.isSymbolic) addToBuffer("general", "Symbolic")
+    if (record.isSymbolicOrSV) addToBuffer("general", "SymbolicOrSV")
+    if (record.isVariant) addToBuffer("general", "Variant")
+
+    buffer.toMap
+  }
+
+  /**
+   * Function to check sample/genotype specific stats
+   * @param record
+   * @param genotype
+   * @return
+   */
+  protected def checkGenotype(record: VariantContext, genotype: Genotype): Map[String, Map[Any, Int]] = {
+    val buffer = mutable.Map[String, Map[Any, Int]]()
+
+    def addToBuffer(key: String, value: Any): Unit = {
+      val map = buffer.getOrElse(key, Map())
+      buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
+    }
+
+    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
+    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)
+
+    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
+
+    addToBuffer("general", "Total")
+    if (genotype.isHet) addToBuffer("general", "Het")
+    if (genotype.isHetNonRef) addToBuffer("general", "HetNonRef")
+    if (genotype.isHom) addToBuffer("general", "Hom")
+    if (genotype.isHomRef) addToBuffer("general", "HomRef")
+    if (genotype.isHomVar) addToBuffer("general", "HomVar")
+    if (genotype.isMixed) addToBuffer("general", "Mixed")
+    if (genotype.isNoCall) addToBuffer("general", "NoCall")
+    if (genotype.isNonInformative) addToBuffer("general", "NonInformative")
+    if (genotype.isAvailable) addToBuffer("general", "Available")
+    if (genotype.isCalled) addToBuffer("general", "Called")
+    if (genotype.isFiltered) addToBuffer("general", "Filtered")
+
+    if (genotype.hasAD) {
+      val ad = genotype.getAD
+      for (i <- 0 until ad.size if ad(i) > 0) {
+        addToBuffer("AD", ad(i))
+        if (i == 0) addToBuffer("AD-ref", ad(i))
+        if (i > 0) addToBuffer("AD-alt", ad(i))
+        if (usedAlleles.exists(_ == i)) addToBuffer("AD-used", ad(i))
+        else addToBuffer("AD-not_used", ad(i))
+      }
+    }
+
+    buffer.toMap
+  }
+
+  /**
+   * Function to write stats to tsv files
+   * @param stats
+   * @param prefix
+   * @param samples
+   */
+  protected def writeGenotypeFields(stats: Stats, prefix: String, samples: List[String]) {
+    val fields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
+    for (field <- fields) {
+      writeGenotypeField(stats, prefix, samples, field)
+    }
+  }
+
+  /**
+   * Function to write 1 specific genotype field
+   * @param stats
+   * @param prefix
+   * @param samples
+   * @param field
+   */
+  protected def writeGenotypeField(stats: Stats, prefix: String, samples: List[String], field: String): Unit = {
+    val file = new File(prefix + field + ".tsv")
+    file.getParentFile.mkdirs()
+    val writer = new PrintWriter(file)
+    writer.println(samples.mkString(field + "\t", "\t", ""))
+    val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
+    for (key <- keySet.toList.sortWith(sortAnyAny(_, _))) {
+      val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
+      writer.println(values.mkString(key + "\t", "\t", ""))
+    }
+    writer.close()
+    plotLine(file)
+  }
+
+  /**
+   * Function to write 1 specific general field
+   * @param prefix
+   * @param data
+   * @return
+   */
+  protected def writeField(prefix: String, data: mutable.Map[Any, Int]): File = {
+    val file = new File(commandArgs.outputDir + "/" + prefix + ".tsv")
+    println(file)
+    file.getParentFile.mkdirs()
+    val writer = new PrintWriter(file)
+    writer.println("\t" + prefix)
+    for (key <- data.keySet.toList.sortWith(sortAnyAny(_, _))) {
+      writer.println(key + "\t" + data(key))
+    }
+    writer.close()
+    file
+  }
+
+  /**
+   * Function to sort Any values
+   * @param a
+   * @param b
+   * @return
+   */
+  def sortAnyAny(a: Any, b: Any): Boolean = {
+    a match {
+      case ai: Int => {
+        b match {
+          case bi: Int    => ai < bi
+          case bi: Double => ai < bi
+          case _          => a.toString < b.toString
+        }
+      }
+      case _ => a.toString < b.toString
+    }
+  }
+
+  /**
+   * Function to write sample to sample compare tsv's / heatmaps
+   * @param stats
+   * @param function function to extract targeted var in SampleToSampleStats
+   * @param prefix
+   * @param samples
+   */
+  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
+                   prefix: String, samples: List[String]): Unit = {
+    val absFile = new File(prefix + ".abs.tsv")
+    val relFile = new File(prefix + ".rel.tsv")
+
+    absFile.getParentFile.mkdirs()
+
+    val absWriter = new PrintWriter(absFile)
+    val relWriter = new PrintWriter(relFile)
+
+    absWriter.println(samples.mkString("\t", "\t", ""))
+    relWriter.println(samples.mkString("\t", "\t", ""))
+    for (sample1 <- samples) {
+      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))
+
+      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))
+
+      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
+      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
+    }
+    absWriter.close()
+    relWriter.close()
+
+    plotHeatmap(relFile)
+  }
+
+  /**
+   * Plots heatmaps on target tsv file
+   * @param file
+   */
+  def plotHeatmap(file: File) {
+    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
+      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
+  }
+
+  /**
+   * Plots line graph with target tsv file
+   * @param file
+   */
+  def plotLine(file: File) {
+    executeRscript("plotXY.R", Array(file.getAbsolutePath,
+      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
+  }
+
+  /**
+   * Function to execute Rscript as subproces
+   * @param resource
+   * @param args
+   */
+  def executeRscript(resource: String, args: Array[String]): Unit = {
+    val is = getClass.getResourceAsStream(resource)
+    val file = File.createTempFile("script.", "." + resource)
+    val os = new FileOutputStream(file)
+    org.apache.commons.io.IOUtils.copy(is, os)
+    os.close()
+
+    val command: String = "Rscript " + file + " " + args.mkString(" ")
+
+    logger.info("Starting: " + command)
+    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
+    if (process.exitValue() == 0) logger.info("Done: " + command)
+    else {
+      logger.warn("Failed: " + command)
+      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
+    }
+  }
+}
--- a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
+++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
+package nl.lumc.sasc.biopet.tools
+
+import org.scalatest.Matchers
+import org.scalatest.testng.TestNGSuite
+import org.testng.annotations.Test
+import scala.collection.mutable
+import VcfStats._
+
+/**
+ * Created by pjvan_thof on 2/5/15.
+ */
+class VcfStatsTest extends TestNGSuite with Matchers {
+
+  @Test
+  def testSampleToSampleStats: Unit = {
+    val s1 = SampleToSampleStats()
+    val s2 = SampleToSampleStats()
+    s1.alleleOverlap shouldBe 0
+    s1.genotypeOverlap shouldBe 0
+    s2.alleleOverlap shouldBe 0
+    s2.genotypeOverlap shouldBe 0
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 0
+    s1.genotypeOverlap shouldBe 0
+    s2.alleleOverlap shouldBe 0
+    s2.genotypeOverlap shouldBe 0
+
+    s2.alleleOverlap = 2
+    s2.genotypeOverlap = 3
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 2
+    s1.genotypeOverlap shouldBe 3
+    s2.alleleOverlap shouldBe 2
+    s2.genotypeOverlap shouldBe 3
+
+    s1 += s2
+    s1.alleleOverlap shouldBe 4
+    s1.genotypeOverlap shouldBe 6
+    s2.alleleOverlap shouldBe 2
+    s2.genotypeOverlap shouldBe 3
+  }
+
+  @Test
+  def testSampleStats: Unit = {
+    val s1 = SampleStats()
+    val s2 = SampleStats()
+
+    s1.sampleToSample += "s1" -> SampleToSampleStats()
+    s1.sampleToSample += "s2" -> SampleToSampleStats()
+    s2.sampleToSample += "s1" -> SampleToSampleStats()
+    s2.sampleToSample += "s2" -> SampleToSampleStats()
+
+    s1.sampleToSample("s1").alleleOverlap = 1
+    s2.sampleToSample("s2").alleleOverlap = 2
+
+    s1.genotypeStats += "1" -> mutable.Map(1 -> 1)
+    s2.genotypeStats += "2" -> mutable.Map(2 -> 2)
+
+    val ss1 = SampleToSampleStats()
+    val ss2 = SampleToSampleStats()
+
+    s1 += s2
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 2))
+    ss1.alleleOverlap = 1
+    ss2.alleleOverlap = 2
+    s1.sampleToSample shouldBe mutable.Map("s1" -> ss1, "s2" -> ss2)
+
+    s1 += s2
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 1), "2" -> mutable.Map(2 -> 4))
+
+    s1 += s1
+    s1.genotypeStats shouldBe mutable.Map("1" -> mutable.Map(1 -> 2), "2" -> mutable.Map(2 -> 8))
+  }
+
+}
--- a/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
+++ b/public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
@@ -35,6 +35,7 @@ object BiopetExecutablePublic extends BiopetExecutable {
    nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam,
    nl.lumc.sasc.biopet.tools.VcfToTsv,
    nl.lumc.sasc.biopet.tools.VcfFilter,
+    nl.lumc.sasc.biopet.tools.VcfStats,
    nl.lumc.sasc.biopet.tools.FindRepeatsPacBio,
    nl.lumc.sasc.biopet.tools.BedToInterval,
    nl.lumc.sasc.biopet.tools.MpileupToVcf,