VcfStats.scala 29.1 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

17
import java.io.{File, FileOutputStream, IOException, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
23
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
32
import scala.concurrent.{Await, Future}
Peter van 't Hof's avatar
Peter van 't Hof committed
33

Peter van 't Hof's avatar
Peter van 't Hof committed
34 35 36
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

37
/**
38 39 40 41
  * This tool will generate statistics from a vcf file
  *
  * Created by pjvan_thof on 1/10/15.
  */
42
object VcfStats extends ToolCommand {
43

Peter van 't Hof's avatar
Peter van 't Hof committed
44
  /** Commandline argument */
45 46
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
47
                  referenceFile: File = null,
48 49 50 51
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
52 53 54 55
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
56 57 58
                  genotypeWiggle: List[String] = Nil,
                  localThreads: Int = 1,
                  sparkMaster: Option[String] = None)
59 60
      extends AbstractArgs

61
  val generalWiggleOptions = List(
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
    "Total",
    "Biallelic",
    "ComplexIndel",
    "Filtered",
    "FullyDecoded",
    "Indel",
    "Mixed",
    "MNP",
    "MonomorphicInSamples",
    "NotFiltered",
    "PointEvent",
    "PolymorphicInSamples",
    "SimpleDeletion",
    "SimpleInsertion",
    "SNP",
    "StructuralIndel",
    "Symbolic",
    "SymbolicOrSV",
    "Variant"
  )

83
  val genotypeWiggleOptions = List("Total",
84 85 86 87 88 89 90 91 92 93 94 95
                                   "Het",
                                   "HetNonRef",
                                   "Hom",
                                   "HomRef",
                                   "HomVar",
                                   "Mixed",
                                   "NoCall",
                                   "NonInformative",
                                   "Available",
                                   "Called",
                                   "Filtered",
                                   "Variant")
96

Peter van 't Hof's avatar
Peter van 't Hof committed
97
  /** Parsing commandline arguments */
98
  class OptParser extends AbstractOptParser {
99 100 101 102 103
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(inputFile = x)
    } validate { x =>
      if (x.exists) success else failure("Input VCF required")
104
    } text "Input VCF file (required)"
105 106 107 108 109
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(referenceFile = x)
    } validate { x =>
      if (x.exists) success else failure("Reference file required")
110
    } text "Fasta reference which was used to call input VCF (required)"
111 112 113 114 115 116 117
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(outputDir = x)
    } validate { x =>
      if (x == null) failure("Valid output directory required")
      else if (x.exists) success
      else failure(s"Output directory does not exist: $x")
118
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
119
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      c.copy(intervals = Some(x))
121
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
122
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
123
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
124
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
125
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
126
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
128
    opt[Unit]("allInfoTags") unbounded () action { (_, c) =>
129
      c.copy(allInfoTags = true)
130
    } text "Summarize all info tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
131
    opt[Unit]("allGenotypeTags") unbounded () action { (_, c) =>
132
      c.copy(allGenotypeTags = true)
133
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
134 135
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
136
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
137
    opt[Unit]("writeBinStats") unbounded () action { (_, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
138
      c.copy(writeBinStats = true)
139
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
140 141
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
142 143
    } validate { x =>
      if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
144 145
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
146 147
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
148 149
    } validate { x =>
      if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
150 151
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
152
    opt[Int]('t', "localThreads") unbounded () action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
153 154
      c.copy(localThreads = x)
    } text s"Number of local threads to use"
155
    opt[String]("sparkMaster") unbounded () action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
156 157 158
      c.copy(sparkMaster = Some(x))
    } text s"Spark master to use"

159 160
  }

161
  //protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
162

163 164
  val defaultGenotypeFields =
    List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
165

166
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
167

168 169 170 171 172 173 174 175 176 177 178 179
  val sampleDistributions = List("Het",
                                 "HetNonRef",
                                 "Hom",
                                 "HomRef",
                                 "HomVar",
                                 "Mixed",
                                 "NoCall",
                                 "NonInformative",
                                 "Available",
                                 "Called",
                                 "Filtered",
                                 "Variant")
Peter van 't Hof's avatar
Peter van 't Hof committed
180

181
  /**
182 183
    * @param args the command line arguments
    */
184 185 186
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
187
    val cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
188

Peter van 't Hof's avatar
Peter van 't Hof committed
189 190 191 192 193 194 195 196 197
    logger.info("Init spark context")

    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setMaster(cmdArgs.sparkMaster.getOrElse(s"local[${cmdArgs.localThreads}]"))
    val sparkContext = new SparkContext(conf)

    logger.info("Spark context is up")

Peter van 't Hof's avatar
Peter van 't Hof committed
198
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
199 200 201
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
202 203
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
204
    val adInfoTags = {
205 206 207
      (for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
        require(header.getInfoHeaderLine(infoTag) != null,
                "Info tag '" + infoTag + "' not found in header of vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
208
        infoTag
209 210 211
      }) ::: (for (line <- header.getInfoHeaderLines if cmdArgs.allInfoTags
                   if !defaultInfoFields.contains(line.getID)
                   if !cmdArgs.infoTags.contains(line.getID)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
212 213 214
        line.getID
      }).toList ::: defaultInfoFields
    }
215

216 217 218 219
    val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
                               if !defaultGenotypeFields.contains(genotypeTag)) yield {
      require(header.getFormatHeaderLine(genotypeTag) != null,
              "Format tag '" + genotypeTag + "' not found in header of vcf file")
220
      genotypeTag
221 222 223 224
    }) ::: (for (line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags
                 if !defaultGenotypeFields.contains(line.getID)
                 if !cmdArgs.genotypeTags.contains(line.getID)
                 if line.getID != "PL") yield {
225
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
226
    }).toList ::: defaultGenotypeFields
227

Peter van 't Hof's avatar
Peter van 't Hof committed
228
    val bedRecords = (cmdArgs.intervals match {
Peter van 't Hof's avatar
Peter van 't Hof committed
229 230
      case Some(i) =>
        BedRecordList.fromFile(i).validateContigs(cmdArgs.referenceFile)
231
      case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
232
    }).combineOverlap.scatter(cmdArgs.binSize)
233

Peter van 't Hof's avatar
Peter van 't Hof committed
234 235
    val intervals: List[Interval] =
      BedRecordList.fromList(bedRecords.flatten).toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
236

237
    val totalBases = bedRecords.flatten.map(_.length.toLong).sum
Peter van 't Hof's avatar
Peter van 't Hof committed
238

Peter van 't Hof's avatar
Peter van 't Hof committed
239 240
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
241 242 243

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
244

Peter van 't Hof's avatar
Peter van 't Hof committed
245
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
246 247 248 249 250
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
251
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
252

Peter van 't Hof's avatar
Peter van 't Hof committed
253
    // Triple for loop to not keep all bins in memory
254
    val statsFutures = for (intervals <- Random
Peter van 't Hof's avatar
Peter van 't Hof committed
255
                              .shuffle(bedRecords))
256 257 258 259 260 261
      yield
        Future {
          val chunkStats = for (intervals <- intervals.grouped(25)) yield {
            val binStats = for (interval <- intervals.par) yield {
              val reader = new VCFFileReader(cmdArgs.inputFile, true)
              var chunkCounter = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
262
              val stats = Stats.emptyStats(samples)
263 264
              logger.info("Starting on: " + interval)

Peter van 't Hof's avatar
Peter van 't Hof committed
265 266
              val samInterval = interval.toSamInterval

Peter van 't Hof's avatar
Peter van 't Hof committed
267 268
              val query =
                reader.query(samInterval.getContig, samInterval.getStart, samInterval.getEnd)
269 270 271 272 273 274 275
              if (!query.hasNext) {
                Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
                for (sample <- samples) yield {
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats,
                                            fillGenotype(adGenotypeTags))
                }
                chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
276
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
277

Peter van 't Hof's avatar
Peter van 't Hof committed
278
              for (record <- query if record.getStart <= samInterval.getEnd) {
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
                Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
                for (sample1 <- samples) yield {
                  val genotype = record.getGenotype(sample1)
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
                                            checkGenotype(record, genotype, adGenotypeTags))
                  for (sample2 <- samples) {
                    val genotype2 = record.getGenotype(sample2)
                    if (genotype.getAlleles == genotype2.getAlleles)
                      stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                    stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils
                      .alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
                  }
                }
                chunkCounter += 1
              }
              reader.close()

              if (cmdArgs.writeBinStats) {
                val binOutputDir =
Peter van 't Hof's avatar
Peter van 't Hof committed
298
                  new File(cmdArgs.outputDir, "bins" + File.separator + samInterval.getContig)
299 300 301 302 303

                stats.writeGenotypeField(
                  samples,
                  "general",
                  binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
304
                  prefix = "genotype-" + samInterval.getStart + "-" + samInterval.getEnd)
305 306
                stats.writeField("general",
                                 binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
307
                                 prefix = samInterval.getStart + "-" + samInterval.getEnd)
308
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
309

Peter van 't Hof's avatar
Peter van 't Hof committed
310
              status(chunkCounter, samInterval)
311 312
              stats
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
313
            binStats.toList.fold(Stats.emptyStats(samples))(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
314
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
315
          chunkStats.toList.fold(Stats.emptyStats(samples))(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
316
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
317
    val stats = statsFutures.foldLeft(Stats.emptyStats(samples)) {
318
      case (a, b) => a += Await.result(b, Duration.Inf)
319
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
320

Peter van 't Hof's avatar
Peter van 't Hof committed
321
    logger.info("Done reading vcf records")
322

Peter van 't Hof's avatar
Rename  
Peter van 't Hof committed
323
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
324 325 326 327 328 329 330
    val json = ConfigUtils.mapToJson(
      stats.getAllStats(
        FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList,
        samples,
        adGenotypeTags,
        adInfoTags,
        sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
331
    allWriter.println(json.nospaces)
332
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
333 334

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
335 336
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
337
      writeWiggle(intervals, field, "count", file, genotype = false, cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
338 339
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
340
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
341
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
342 343
      val file = new File(cmdArgs.outputDir,
                          "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
344
      writeWiggle(intervals, field, sample, file, genotype = true, cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
345 346
    }

347 348 349 350 351 352 353 354
    writeOverlap(stats,
                 _.genotypeOverlap,
                 cmdArgs.outputDir + "/sample_compare/genotype_overlap",
                 samples)
    writeOverlap(stats,
                 _.alleleOverlap,
                 cmdArgs.outputDir + "/sample_compare/allele_overlap",
                 samples)
355

Peter van 't Hof's avatar
Peter van 't Hof committed
356
    sparkContext.stop()
357
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
358 359
  }

360
  //FIXME: does only work correct for reference and not with a bed file
361 362 363 364
  protected def writeWiggle(intervals: List[Interval],
                            row: String,
                            column: String,
                            outputFile: File,
365 366
                            genotype: Boolean,
                            outputDir: File): Unit = {
367 368
    val groupedIntervals =
      intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
369
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
370 371
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
372
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
373 374 375
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
376
        val file = {
377 378
          if (genotype)
            new File(
379
              outputDir,
380 381 382
              "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else
            new File(
383
              outputDir,
384
              "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
385 386
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
387 388 389 390 391
      }
    }
    writer.close()
  }

392
  /**
393 394 395 396 397 398
    * Gets single value from a tsv file
    * @param file Input tsv file
    * @param row Row id
    * @param column column id
    * @return value
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
399 400 401 402 403 404
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
405
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
406 407

    value.collect { case x => x.split("\t")(index) }
408 409
  }

410 411
  protected[tools] def fillGeneral(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
412 413 414 415 416 417 418 419
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
420
    addToBuffer("QUAL", "not set", found = false)
421

Sander Bollen's avatar
Sander Bollen committed
422 423 424 425 426 427 428 429 430 431 432 433
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
434

Peter van 't Hof's avatar
Peter van 't Hof committed
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Biallelic", found = false)
    addToBuffer("general", "ComplexIndel", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "FullyDecoded", found = false)
    addToBuffer("general", "Indel", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "MNP", found = false)
    addToBuffer("general", "MonomorphicInSamples", found = false)
    addToBuffer("general", "NotFiltered", found = false)
    addToBuffer("general", "PointEvent", found = false)
    addToBuffer("general", "PolymorphicInSamples", found = false)
    addToBuffer("general", "SimpleDeletion", found = false)
    addToBuffer("general", "SimpleInsertion", found = false)
    addToBuffer("general", "SNP", found = false)
    addToBuffer("general", "StructuralIndel", found = false)
    addToBuffer("general", "Symbolic", found = false)
    addToBuffer("general", "SymbolicOrSV", found = false)
    addToBuffer("general", "Variant", found = false)
454 455 456 457

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
458
      addToBuffer(tag, "not set", found = false)
459 460 461 462 463
    }

    Map("total" -> buffer.toMap)
  }

464
  /** Function to check all general stats, all info expect sample/genotype specific stats */
465 466 467
  protected[tools] def checkGeneral(
      record: VariantContext,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
468 469
    val buffer = mutable.Map[String, Map[Any, Int]]()

470
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
471
      val map = buffer.getOrElse(key, Map())
472
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
473
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
474 475
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
476
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
477

478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
    addToBuffer("SampleDistribution-Het",
                record.getGenotypes.count(genotype => genotype.isHet),
                found = true)
    addToBuffer("SampleDistribution-HetNonRef",
                record.getGenotypes.count(genotype => genotype.isHetNonRef),
                found = true)
    addToBuffer("SampleDistribution-Hom",
                record.getGenotypes.count(genotype => genotype.isHom),
                found = true)
    addToBuffer("SampleDistribution-HomRef",
                record.getGenotypes.count(genotype => genotype.isHomRef),
                found = true)
    addToBuffer("SampleDistribution-HomVar",
                record.getGenotypes.count(genotype => genotype.isHomVar),
                found = true)
    addToBuffer("SampleDistribution-Mixed",
                record.getGenotypes.count(genotype => genotype.isMixed),
                found = true)
    addToBuffer("SampleDistribution-NoCall",
                record.getGenotypes.count(genotype => genotype.isNoCall),
                found = true)
    addToBuffer("SampleDistribution-NonInformative",
                record.getGenotypes.count(genotype => genotype.isNonInformative),
                found = true)
    addToBuffer("SampleDistribution-Available",
                record.getGenotypes.count(genotype => genotype.isAvailable),
                found = true)
    addToBuffer("SampleDistribution-Called",
                record.getGenotypes.count(genotype => genotype.isCalled),
                found = true)
    addToBuffer("SampleDistribution-Filtered",
                record.getGenotypes.count(genotype => genotype.isFiltered),
                found = true)
    addToBuffer("SampleDistribution-Variant",
                record.getGenotypes.count(genotype =>
                  genotype.isHetNonRef || genotype.isHet || genotype.isHomVar),
                found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
515

Peter van 't Hof's avatar
Peter van 't Hof committed
516
    addToBuffer("general", "Total", found = true)
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
536 537 538
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
539
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
540 541
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
542 543
    }

544
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
545 546
  }

547 548
  protected[tools] def fillGenotype(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
549 550 551 552 553 554 555 556
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
    addToBuffer("DP", "not set", found = false)
    addToBuffer("GQ", "not set", found = false)

    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Het", found = false)
    addToBuffer("general", "HetNonRef", found = false)
    addToBuffer("general", "Hom", found = false)
    addToBuffer("general", "HomRef", found = false)
    addToBuffer("general", "HomVar", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "NoCall", found = false)
    addToBuffer("general", "NonInformative", found = false)
    addToBuffer("general", "Available", found = false)
    addToBuffer("general", "Called", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "Variant", found = false)
573 574 575 576 577 578 579 580 581 582 583

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

584
  /** Function to check sample/genotype specific stats */
585 586 587 588
  protected[tools] def checkGenotype(
      record: VariantContext,
      genotype: Genotype,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
589
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
590

591
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
592
      val map = buffer.getOrElse(key, Map())
593
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
594
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
595 596
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
597 598 599
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

600 601
    val usedAlleles =
      (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
602

Peter van 't Hof's avatar
Peter van 't Hof committed
603
    addToBuffer("general", "Total", found = true)
604 605 606 607 608 609 610 611 612 613 614 615
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
616

Peter van 't Hof's avatar
Peter van 't Hof committed
617 618 619
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
620 621 622 623 624
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
625 626
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
627

Peter van 't Hof's avatar
Peter van 't Hof committed
628 629 630
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
631
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
632 633
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
634 635
    }

636
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
637 638
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
639
  /** Function to write sample to sample compare tsv's / heatmaps */
640 641 642
  def writeOverlap(stats: Stats,
                   function: SampleToSampleStats => Int,
                   prefix: String,
643 644
                   samples: List[String],
                   plots: Boolean = true): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
645 646 647 648 649 650 651
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
652 653 654 655

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
656 657
      val values = for (sample2 <- samples)
        yield function(stats.samplesStats(sample1).sampleToSample(sample2))
Peter van 't Hof's avatar
Peter van 't Hof committed
658

659 660
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
661
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
662 663 664 665
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
666

667
    if (plots) plotHeatmap(relFile)
668 669
  }

670
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
671
  def plotHeatmap(file: File) {
672 673 674 675 676 677 678 679 680
    executeRscript(
      "plotHeatmap.R",
      Array(
        file.getAbsolutePath,
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"
      )
    )
Peter van 't Hof's avatar
Peter van 't Hof committed
681 682
  }

683
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
684
  def plotLine(file: File) {
685 686 687
    executeRscript(
      "plotXY.R",
      Array(file.getAbsolutePath, file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
688 689
  }

690
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
691 692 693
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
694
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
695 696 697 698 699 700
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

701
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
702 703 704 705 706 707 708 709 710
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
Peter van 't Hof's avatar
Peter van 't Hof committed
711 712
        logger.warn("Failed: " + command)
        logger.debug(e)
713
    }
714 715
  }
}