VcfStats.scala 28.6 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

17
import java.io.{File, FileOutputStream, IOException, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
23
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
32
import scala.concurrent.{Await, Future}
Peter van 't Hof's avatar
Peter van 't Hof committed
33

34
/**
35 36 37 38
  * This tool will generate statistics from a vcf file
  *
  * Created by pjvan_thof on 1/10/15.
  */
39
object VcfStats extends ToolCommand {
40

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  /** Commandline argument */
42 43
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
44
                  referenceFile: File = null,
45 46 47 48
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
                  genotypeWiggle: List[String] = Nil)
      extends AbstractArgs

  private val generalWiggleOptions = List(
    "Total",
    "Biallelic",
    "ComplexIndel",
    "Filtered",
    "FullyDecoded",
    "Indel",
    "Mixed",
    "MNP",
    "MonomorphicInSamples",
    "NotFiltered",
    "PointEvent",
    "PolymorphicInSamples",
    "SimpleDeletion",
    "SimpleInsertion",
    "SNP",
    "StructuralIndel",
    "Symbolic",
    "SymbolicOrSV",
    "Variant"
  )

  private val genotypeWiggleOptions = List("Total",
                                           "Het",
                                           "HetNonRef",
                                           "Hom",
                                           "HomRef",
                                           "HomVar",
                                           "Mixed",
                                           "NoCall",
                                           "NonInformative",
                                           "Available",
                                           "Called",
                                           "Filtered",
                                           "Variant")
91

Peter van 't Hof's avatar
Peter van 't Hof committed
92
  /** Parsing commandline arguments */
93
  class OptParser extends AbstractOptParser {
94 95 96 97 98
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(inputFile = x)
    } validate { x =>
      if (x.exists) success else failure("Input VCF required")
99
    } text "Input VCF file (required)"
100 101 102 103 104
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(referenceFile = x)
    } validate { x =>
      if (x.exists) success else failure("Reference file required")
105
    } text "Fasta reference which was used to call input VCF (required)"
106 107 108 109 110 111 112
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(outputDir = x)
    } validate { x =>
      if (x == null) failure("Valid output directory required")
      else if (x.exists) success
      else failure(s"Output directory does not exist: $x")
113
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
114
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
115
      c.copy(intervals = Some(x))
116
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
117
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
118
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
119
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
120
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
121
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
122
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
123
    opt[Unit]("allInfoTags") unbounded () action { (_, c) =>
124
      c.copy(allInfoTags = true)
125
    } text "Summarize all info tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
126
    opt[Unit]("allGenotypeTags") unbounded () action { (_, c) =>
127
      c.copy(allGenotypeTags = true)
128
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
129 130
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
131
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
132
    opt[Unit]("writeBinStats") unbounded () action { (_, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
133
      c.copy(writeBinStats = true)
134
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
135 136
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
137 138
    } validate { x =>
      if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
139 140
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
141 142
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
143 144
    } validate { x =>
      if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
147 148
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
149
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
150

151 152
  val defaultGenotypeFields =
    List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
153

154
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
155

156 157 158 159 160 161 162 163 164 165 166 167
  val sampleDistributions = List("Het",
                                 "HetNonRef",
                                 "Hom",
                                 "HomRef",
                                 "HomVar",
                                 "Mixed",
                                 "NoCall",
                                 "NonInformative",
                                 "Available",
                                 "Called",
                                 "Filtered",
                                 "Variant")
Peter van 't Hof's avatar
Peter van 't Hof committed
168

169
  /**
170 171
    * @param args the command line arguments
    */
172 173 174
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
175
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
176

Peter van 't Hof's avatar
Peter van 't Hof committed
177
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
178 179 180
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
181 182
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
183
    val adInfoTags = {
184 185 186
      (for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
        require(header.getInfoHeaderLine(infoTag) != null,
                "Info tag '" + infoTag + "' not found in header of vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
187
        infoTag
188 189 190
      }) ::: (for (line <- header.getInfoHeaderLines if cmdArgs.allInfoTags
                   if !defaultInfoFields.contains(line.getID)
                   if !cmdArgs.infoTags.contains(line.getID)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
191 192 193
        line.getID
      }).toList ::: defaultInfoFields
    }
194

195 196 197 198
    val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
                               if !defaultGenotypeFields.contains(genotypeTag)) yield {
      require(header.getFormatHeaderLine(genotypeTag) != null,
              "Format tag '" + genotypeTag + "' not found in header of vcf file")
199
      genotypeTag
200 201 202 203
    }) ::: (for (line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags
                 if !defaultGenotypeFields.contains(line.getID)
                 if !cmdArgs.genotypeTags.contains(line.getID)
                 if line.getID != "PL") yield {
204
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
205
    }).toList ::: defaultGenotypeFields
206

Peter van 't Hof's avatar
Peter van 't Hof committed
207
    val bedRecords = (cmdArgs.intervals match {
Peter van 't Hof's avatar
Peter van 't Hof committed
208 209
      case Some(i) =>
        BedRecordList.fromFile(i).validateContigs(cmdArgs.referenceFile)
210
      case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
211
    }).combineOverlap.scatter(cmdArgs.binSize)
212

Peter van 't Hof's avatar
Peter van 't Hof committed
213 214
    val intervals: List[Interval] =
      BedRecordList.fromList(bedRecords.flatten).toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
215

216
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
217

Peter van 't Hof's avatar
Peter van 't Hof committed
218 219
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
220 221 222 223 224 225

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
226
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
227
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
228 229
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
230 231 232 233 234
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
235

Peter van 't Hof's avatar
Peter van 't Hof committed
236
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
237 238 239 240 241
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
242
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
243

Peter van 't Hof's avatar
Peter van 't Hof committed
244
    // Triple for loop to not keep all bins in memory
245
    val statsFutures = for (intervals <- Random
Peter van 't Hof's avatar
Peter van 't Hof committed
246
                              .shuffle(bedRecords))
247 248 249 250 251 252 253 254 255
      yield
        Future {
          val chunkStats = for (intervals <- intervals.grouped(25)) yield {
            val binStats = for (interval <- intervals.par) yield {
              val reader = new VCFFileReader(cmdArgs.inputFile, true)
              var chunkCounter = 0
              val stats = createStats
              logger.info("Starting on: " + interval)

Peter van 't Hof's avatar
Peter van 't Hof committed
256 257
              val samInterval = interval.toSamInterval

Peter van 't Hof's avatar
Peter van 't Hof committed
258 259
              val query =
                reader.query(samInterval.getContig, samInterval.getStart, samInterval.getEnd)
260 261 262 263 264 265 266
              if (!query.hasNext) {
                Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
                for (sample <- samples) yield {
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats,
                                            fillGenotype(adGenotypeTags))
                }
                chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
267
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
268

Peter van 't Hof's avatar
Peter van 't Hof committed
269
              for (record <- query if record.getStart <= samInterval.getEnd) {
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
                Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
                for (sample1 <- samples) yield {
                  val genotype = record.getGenotype(sample1)
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
                                            checkGenotype(record, genotype, adGenotypeTags))
                  for (sample2 <- samples) {
                    val genotype2 = record.getGenotype(sample2)
                    if (genotype.getAlleles == genotype2.getAlleles)
                      stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                    stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils
                      .alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
                  }
                }
                chunkCounter += 1
              }
              reader.close()

              if (cmdArgs.writeBinStats) {
                val binOutputDir =
Peter van 't Hof's avatar
Peter van 't Hof committed
289
                  new File(cmdArgs.outputDir, "bins" + File.separator + samInterval.getContig)
290 291 292 293 294

                stats.writeGenotypeField(
                  samples,
                  "general",
                  binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
295
                  prefix = "genotype-" + samInterval.getStart + "-" + samInterval.getEnd)
296 297
                stats.writeField("general",
                                 binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
298
                                 prefix = samInterval.getStart + "-" + samInterval.getEnd)
299
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
300

Peter van 't Hof's avatar
Peter van 't Hof committed
301
              status(chunkCounter, samInterval)
302 303 304
              stats
            }
            binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
305
          }
306
          chunkStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
307
        }
308 309
    val stats = statsFutures.foldLeft(createStats) {
      case (a, b) => a += Await.result(b, Duration.Inf)
310
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
311

Peter van 't Hof's avatar
Peter van 't Hof committed
312
    logger.info("Done reading vcf records")
313

Peter van 't Hof's avatar
Rename  
Peter van 't Hof committed
314
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
315 316 317 318 319 320 321
    val json = ConfigUtils.mapToJson(
      stats.getAllStats(
        FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList,
        samples,
        adGenotypeTags,
        adInfoTags,
        sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
322
    allWriter.println(json.nospaces)
323
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
324 325

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
326 327
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
328
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
329 330
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
331
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
332
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
333 334
      val file = new File(cmdArgs.outputDir,
                          "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
335
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
336 337
    }

338 339 340 341 342 343 344 345
    writeOverlap(stats,
                 _.genotypeOverlap,
                 cmdArgs.outputDir + "/sample_compare/genotype_overlap",
                 samples)
    writeOverlap(stats,
                 _.alleleOverlap,
                 cmdArgs.outputDir + "/sample_compare/allele_overlap",
                 samples)
346 347

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
348 349
  }

350
  //FIXME: does only work correct for reference and not with a bed file
351 352 353 354 355 356 357
  protected def writeWiggle(intervals: List[Interval],
                            row: String,
                            column: String,
                            outputFile: File,
                            genotype: Boolean): Unit = {
    val groupedIntervals =
      intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
358
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
359 360
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
361
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
362 363 364
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
365
        val file = {
366 367 368 369 370 371 372 373
          if (genotype)
            new File(
              cmdArgs.outputDir,
              "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else
            new File(
              cmdArgs.outputDir,
              "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
374 375
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
376 377 378 379 380
      }
    }
    writer.close()
  }

381
  /**
382 383 384 385 386 387
    * Gets single value from a tsv file
    * @param file Input tsv file
    * @param row Row id
    * @param column column id
    * @return value
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
388 389 390 391 392 393
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
394
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
395 396

    value.collect { case x => x.split("\t")(index) }
397 398
  }

399 400
  protected[tools] def fillGeneral(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
401 402 403 404 405 406 407 408
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
409
    addToBuffer("QUAL", "not set", found = false)
410

Sander Bollen's avatar
Sander Bollen committed
411 412 413 414 415 416 417 418 419 420 421 422
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
423

Peter van 't Hof's avatar
Peter van 't Hof committed
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Biallelic", found = false)
    addToBuffer("general", "ComplexIndel", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "FullyDecoded", found = false)
    addToBuffer("general", "Indel", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "MNP", found = false)
    addToBuffer("general", "MonomorphicInSamples", found = false)
    addToBuffer("general", "NotFiltered", found = false)
    addToBuffer("general", "PointEvent", found = false)
    addToBuffer("general", "PolymorphicInSamples", found = false)
    addToBuffer("general", "SimpleDeletion", found = false)
    addToBuffer("general", "SimpleInsertion", found = false)
    addToBuffer("general", "SNP", found = false)
    addToBuffer("general", "StructuralIndel", found = false)
    addToBuffer("general", "Symbolic", found = false)
    addToBuffer("general", "SymbolicOrSV", found = false)
    addToBuffer("general", "Variant", found = false)
443 444 445 446

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
447
      addToBuffer(tag, "not set", found = false)
448 449 450 451 452
    }

    Map("total" -> buffer.toMap)
  }

453
  /** Function to check all general stats, all info expect sample/genotype specific stats */
454 455 456
  protected[tools] def checkGeneral(
      record: VariantContext,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
457 458
    val buffer = mutable.Map[String, Map[Any, Int]]()

459
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
460
      val map = buffer.getOrElse(key, Map())
461
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
462
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
463 464
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
465
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
466

467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
    addToBuffer("SampleDistribution-Het",
                record.getGenotypes.count(genotype => genotype.isHet),
                found = true)
    addToBuffer("SampleDistribution-HetNonRef",
                record.getGenotypes.count(genotype => genotype.isHetNonRef),
                found = true)
    addToBuffer("SampleDistribution-Hom",
                record.getGenotypes.count(genotype => genotype.isHom),
                found = true)
    addToBuffer("SampleDistribution-HomRef",
                record.getGenotypes.count(genotype => genotype.isHomRef),
                found = true)
    addToBuffer("SampleDistribution-HomVar",
                record.getGenotypes.count(genotype => genotype.isHomVar),
                found = true)
    addToBuffer("SampleDistribution-Mixed",
                record.getGenotypes.count(genotype => genotype.isMixed),
                found = true)
    addToBuffer("SampleDistribution-NoCall",
                record.getGenotypes.count(genotype => genotype.isNoCall),
                found = true)
    addToBuffer("SampleDistribution-NonInformative",
                record.getGenotypes.count(genotype => genotype.isNonInformative),
                found = true)
    addToBuffer("SampleDistribution-Available",
                record.getGenotypes.count(genotype => genotype.isAvailable),
                found = true)
    addToBuffer("SampleDistribution-Called",
                record.getGenotypes.count(genotype => genotype.isCalled),
                found = true)
    addToBuffer("SampleDistribution-Filtered",
                record.getGenotypes.count(genotype => genotype.isFiltered),
                found = true)
    addToBuffer("SampleDistribution-Variant",
                record.getGenotypes.count(genotype =>
                  genotype.isHetNonRef || genotype.isHet || genotype.isHomVar),
                found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
504

Peter van 't Hof's avatar
Peter van 't Hof committed
505
    addToBuffer("general", "Total", found = true)
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
525 526 527
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
528
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
529 530
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
531 532
    }

533
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
534 535
  }

536 537
  protected[tools] def fillGenotype(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
538 539 540 541 542 543 544 545
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561
    addToBuffer("DP", "not set", found = false)
    addToBuffer("GQ", "not set", found = false)

    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Het", found = false)
    addToBuffer("general", "HetNonRef", found = false)
    addToBuffer("general", "Hom", found = false)
    addToBuffer("general", "HomRef", found = false)
    addToBuffer("general", "HomVar", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "NoCall", found = false)
    addToBuffer("general", "NonInformative", found = false)
    addToBuffer("general", "Available", found = false)
    addToBuffer("general", "Called", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "Variant", found = false)
562 563 564 565 566 567 568 569 570 571 572

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

573
  /** Function to check sample/genotype specific stats */
574 575 576 577
  protected[tools] def checkGenotype(
      record: VariantContext,
      genotype: Genotype,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
578
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
579

580
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
581
      val map = buffer.getOrElse(key, Map())
582
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
583
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
584 585
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
586 587 588
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

589 590
    val usedAlleles =
      (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
591

Peter van 't Hof's avatar
Peter van 't Hof committed
592
    addToBuffer("general", "Total", found = true)
593 594 595 596 597 598 599 600 601 602 603 604
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
605

Peter van 't Hof's avatar
Peter van 't Hof committed
606 607 608
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
609 610 611 612 613
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
614 615
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
616

Peter van 't Hof's avatar
Peter van 't Hof committed
617 618 619
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
620
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
621 622
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
623 624
    }

625
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
626 627
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
628
  /** Function to write sample to sample compare tsv's / heatmaps */
629 630 631 632
  def writeOverlap(stats: Stats,
                   function: SampleToSampleStats => Int,
                   prefix: String,
                   samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
633 634 635 636 637 638 639
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
640 641 642 643

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
644 645
      val values = for (sample2 <- samples)
        yield function(stats.samplesStats(sample1).sampleToSample(sample2))
Peter van 't Hof's avatar
Peter van 't Hof committed
646

647 648
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
649
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
650 651 652 653
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
654 655

    plotHeatmap(relFile)
656 657
  }

658
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
659
  def plotHeatmap(file: File) {
660 661 662 663 664 665 666 667 668
    executeRscript(
      "plotHeatmap.R",
      Array(
        file.getAbsolutePath,
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"
      )
    )
Peter van 't Hof's avatar
Peter van 't Hof committed
669 670
  }

671
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
672
  def plotLine(file: File) {
673 674 675
    executeRscript(
      "plotXY.R",
      Array(file.getAbsolutePath, file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
676 677
  }

678
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
679 680 681
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
682
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
683 684 685 686 687 688
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

689
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
690 691 692 693 694 695 696 697 698
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
Peter van 't Hof's avatar
Peter van 't Hof committed
699 700
        logger.warn("Failed: " + command)
        logger.debug(e)
701
    }
702 703
  }
}