VcfStats.scala 28.4 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

17
import java.io.{File, FileOutputStream, IOException, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
23
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
32
import scala.concurrent.{Await, Future}
Peter van 't Hof's avatar
Peter van 't Hof committed
33

34
/**
35 36 37 38
  * This tool will generate statistics from a vcf file
  *
  * Created by pjvan_thof on 1/10/15.
  */
39
object VcfStats extends ToolCommand {
40

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  /** Commandline argument */
42 43
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
44
                  referenceFile: File = null,
45 46 47 48
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
                  genotypeWiggle: List[String] = Nil)
      extends AbstractArgs

  private val generalWiggleOptions = List(
    "Total",
    "Biallelic",
    "ComplexIndel",
    "Filtered",
    "FullyDecoded",
    "Indel",
    "Mixed",
    "MNP",
    "MonomorphicInSamples",
    "NotFiltered",
    "PointEvent",
    "PolymorphicInSamples",
    "SimpleDeletion",
    "SimpleInsertion",
    "SNP",
    "StructuralIndel",
    "Symbolic",
    "SymbolicOrSV",
    "Variant"
  )

  private val genotypeWiggleOptions = List("Total",
                                           "Het",
                                           "HetNonRef",
                                           "Hom",
                                           "HomRef",
                                           "HomVar",
                                           "Mixed",
                                           "NoCall",
                                           "NonInformative",
                                           "Available",
                                           "Called",
                                           "Filtered",
                                           "Variant")
91

Peter van 't Hof's avatar
Peter van 't Hof committed
92
  /** Parsing commandline arguments */
93
  class OptParser extends AbstractOptParser {
94 95 96 97 98
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(inputFile = x)
    } validate { x =>
      if (x.exists) success else failure("Input VCF required")
99
    } text "Input VCF file (required)"
100 101 102 103 104
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(referenceFile = x)
    } validate { x =>
      if (x.exists) success else failure("Reference file required")
105
    } text "Fasta reference which was used to call input VCF (required)"
106 107 108 109 110 111 112
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action {
      (x, c) =>
        c.copy(outputDir = x)
    } validate { x =>
      if (x == null) failure("Valid output directory required")
      else if (x.exists) success
      else failure(s"Output directory does not exist: $x")
113
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
114
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
115
      c.copy(intervals = Some(x))
116
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
117
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
118
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
119
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
120
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
121
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
122
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
123 124
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
125
    } text "Summarize all info tags. Default false"
126 127
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
128
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
129 130
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
131
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
132 133
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
134
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
135 136
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
137 138
    } validate { x =>
      if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
139 140
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
141 142
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
143 144
    } validate { x =>
      if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
147 148
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
149
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
150

151 152
  val defaultGenotypeFields =
    List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
153

154
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
155

156 157 158 159 160 161 162 163 164 165 166 167
  val sampleDistributions = List("Het",
                                 "HetNonRef",
                                 "Hom",
                                 "HomRef",
                                 "HomVar",
                                 "Mixed",
                                 "NoCall",
                                 "NonInformative",
                                 "Available",
                                 "Called",
                                 "Filtered",
                                 "Variant")
Peter van 't Hof's avatar
Peter van 't Hof committed
168

169
  /**
170 171
    * @param args the command line arguments
    */
172 173 174
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
175
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
176

Peter van 't Hof's avatar
Peter van 't Hof committed
177
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
178 179 180
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
181 182
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
183
    val adInfoTags = {
184 185 186
      (for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
        require(header.getInfoHeaderLine(infoTag) != null,
                "Info tag '" + infoTag + "' not found in header of vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
187
        infoTag
188 189 190
      }) ::: (for (line <- header.getInfoHeaderLines if cmdArgs.allInfoTags
                   if !defaultInfoFields.contains(line.getID)
                   if !cmdArgs.infoTags.contains(line.getID)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
191 192 193
        line.getID
      }).toList ::: defaultInfoFields
    }
194

195 196 197 198
    val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
                               if !defaultGenotypeFields.contains(genotypeTag)) yield {
      require(header.getFormatHeaderLine(genotypeTag) != null,
              "Format tag '" + genotypeTag + "' not found in header of vcf file")
199
      genotypeTag
200 201 202 203
    }) ::: (for (line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags
                 if !defaultGenotypeFields.contains(line.getID)
                 if !cmdArgs.genotypeTags.contains(line.getID)
                 if line.getID != "PL") yield {
204
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
205
    }).toList ::: defaultGenotypeFields
206

Peter van 't Hof's avatar
Peter van 't Hof committed
207
    val bedRecords = (cmdArgs.intervals match {
208 209 210
      case Some(intervals) =>
        BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
      case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
211
    }).combineOverlap.scatter(cmdArgs.binSize)
212 213

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
214

215
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
216

Peter van 't Hof's avatar
Peter van 't Hof committed
217 218
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
219 220 221 222 223 224

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
225
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
226
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
227 228
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
229 230 231 232 233
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
234

Peter van 't Hof's avatar
Peter van 't Hof committed
235
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
236 237 238 239 240
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
241
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
242

Peter van 't Hof's avatar
Peter van 't Hof committed
243
    // Triple for loop to not keep all bins in memory
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
    val statsFutures = for (intervals <- Random
                              .shuffle(intervals)
                              .grouped(intervals.size / (if (intervals.size > 10) 4 else 1))
                              .toList)
      yield
        Future {
          val chunkStats = for (intervals <- intervals.grouped(25)) yield {
            val binStats = for (interval <- intervals.par) yield {
              val reader = new VCFFileReader(cmdArgs.inputFile, true)
              var chunkCounter = 0
              val stats = createStats
              logger.info("Starting on: " + interval)

              val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
              if (!query.hasNext) {
                Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
                for (sample <- samples) yield {
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats,
                                            fillGenotype(adGenotypeTags))
                }
                chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
265
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
266

267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
              for (record <- query if record.getStart <= interval.getEnd) {
                Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
                for (sample1 <- samples) yield {
                  val genotype = record.getGenotype(sample1)
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
                                            checkGenotype(record, genotype, adGenotypeTags))
                  for (sample2 <- samples) {
                    val genotype2 = record.getGenotype(sample2)
                    if (genotype.getAlleles == genotype2.getAlleles)
                      stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                    stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils
                      .alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
                  }
                }
                chunkCounter += 1
              }
              reader.close()

              if (cmdArgs.writeBinStats) {
                val binOutputDir =
                  new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)

                stats.writeGenotypeField(
                  samples,
                  "general",
                  binOutputDir,
                  prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
                stats.writeField("general",
                                 binOutputDir,
                                 prefix = interval.getStart + "-" + interval.getEnd)
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
298

299 300 301 302
              status(chunkCounter, interval)
              stats
            }
            binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
303
          }
304
          chunkStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
305
        }
306 307
    val stats = statsFutures.foldLeft(createStats) {
      case (a, b) => a += Await.result(b, Duration.Inf)
308
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
309

Peter van 't Hof's avatar
Peter van 't Hof committed
310
    logger.info("Done reading vcf records")
311

Peter van 't Hof's avatar
Rename  
Peter van 't Hof committed
312
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
313 314 315 316 317 318 319
    val json = ConfigUtils.mapToJson(
      stats.getAllStats(
        FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList,
        samples,
        adGenotypeTags,
        adInfoTags,
        sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
320
    allWriter.println(json.nospaces)
321
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
322 323

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
324 325
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
326
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
327 328
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
329
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
330
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
331 332
      val file = new File(cmdArgs.outputDir,
                          "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
333
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
334 335
    }

336 337 338 339 340 341 342 343
    writeOverlap(stats,
                 _.genotypeOverlap,
                 cmdArgs.outputDir + "/sample_compare/genotype_overlap",
                 samples)
    writeOverlap(stats,
                 _.alleleOverlap,
                 cmdArgs.outputDir + "/sample_compare/allele_overlap",
                 samples)
344 345

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
346 347
  }

348
  //FIXME: does only work correct for reference and not with a bed file
349 350 351 352 353 354 355
  protected def writeWiggle(intervals: List[Interval],
                            row: String,
                            column: String,
                            outputFile: File,
                            genotype: Boolean): Unit = {
    val groupedIntervals =
      intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
356
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
357 358
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
359
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
360 361 362
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
363
        val file = {
364 365 366 367 368 369 370 371
          if (genotype)
            new File(
              cmdArgs.outputDir,
              "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else
            new File(
              cmdArgs.outputDir,
              "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
372 373
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
374 375 376 377 378
      }
    }
    writer.close()
  }

379
  /**
380 381 382 383 384 385
    * Gets single value from a tsv file
    * @param file Input tsv file
    * @param row Row id
    * @param column column id
    * @return value
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
386 387 388 389 390 391
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
392
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
393 394

    value.collect { case x => x.split("\t")(index) }
395 396
  }

397 398
  protected[tools] def fillGeneral(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
399 400 401 402 403 404 405 406
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
407
    addToBuffer("QUAL", "not set", false)
408

Sander Bollen's avatar
Sander Bollen committed
409 410 411 412 413 414 415 416 417 418 419 420
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
421

Peter van 't Hof's avatar
Peter van 't Hof committed
422
    addToBuffer("general", "Total", false)
423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
445
      addToBuffer(tag, "not set", found = false)
446 447 448 449 450
    }

    Map("total" -> buffer.toMap)
  }

451
  /** Function to check all general stats, all info expect sample/genotype specific stats */
452 453 454
  protected[tools] def checkGeneral(
      record: VariantContext,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
455 456
    val buffer = mutable.Map[String, Map[Any, Int]]()

457
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
458
      val map = buffer.getOrElse(key, Map())
459
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
460
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
461 462
    }

463
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), true)
Peter van 't Hof's avatar
Peter van 't Hof committed
464

465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
    addToBuffer("SampleDistribution-Het",
                record.getGenotypes.count(genotype => genotype.isHet),
                found = true)
    addToBuffer("SampleDistribution-HetNonRef",
                record.getGenotypes.count(genotype => genotype.isHetNonRef),
                found = true)
    addToBuffer("SampleDistribution-Hom",
                record.getGenotypes.count(genotype => genotype.isHom),
                found = true)
    addToBuffer("SampleDistribution-HomRef",
                record.getGenotypes.count(genotype => genotype.isHomRef),
                found = true)
    addToBuffer("SampleDistribution-HomVar",
                record.getGenotypes.count(genotype => genotype.isHomVar),
                found = true)
    addToBuffer("SampleDistribution-Mixed",
                record.getGenotypes.count(genotype => genotype.isMixed),
                found = true)
    addToBuffer("SampleDistribution-NoCall",
                record.getGenotypes.count(genotype => genotype.isNoCall),
                found = true)
    addToBuffer("SampleDistribution-NonInformative",
                record.getGenotypes.count(genotype => genotype.isNonInformative),
                found = true)
    addToBuffer("SampleDistribution-Available",
                record.getGenotypes.count(genotype => genotype.isAvailable),
                found = true)
    addToBuffer("SampleDistribution-Called",
                record.getGenotypes.count(genotype => genotype.isCalled),
                found = true)
    addToBuffer("SampleDistribution-Filtered",
                record.getGenotypes.count(genotype => genotype.isFiltered),
                found = true)
    addToBuffer("SampleDistribution-Variant",
                record.getGenotypes.count(genotype =>
                  genotype.isHetNonRef || genotype.isHet || genotype.isHomVar),
                found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
502

503
    addToBuffer("general", "Total", true)
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
523 524 525
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
526
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
527 528
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
529 530
    }

531
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
532 533
  }

534 535
  protected[tools] def fillGenotype(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
536 537 538 539 540 541 542 543
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
544 545
    addToBuffer("DP", "not set", false)
    addToBuffer("GQ", "not set", false)
546

Peter van 't Hof's avatar
Peter van 't Hof committed
547
    addToBuffer("general", "Total", false)
548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

571
  /** Function to check sample/genotype specific stats */
572 573 574 575
  protected[tools] def checkGenotype(
      record: VariantContext,
      genotype: Genotype,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
576
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
577

578
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
579
      val map = buffer.getOrElse(key, Map())
580
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
581
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
582 583
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
584 585 586
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

587 588
    val usedAlleles =
      (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
589

Peter van 't Hof's avatar
Peter van 't Hof committed
590
    addToBuffer("general", "Total", found = true)
591 592 593 594 595 596 597 598 599 600 601 602
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
603

Peter van 't Hof's avatar
Peter van 't Hof committed
604 605 606
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
607 608 609 610 611
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
612 613
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
614

Peter van 't Hof's avatar
Peter van 't Hof committed
615 616 617
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
618
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
619 620
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
621 622
    }

623
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
624 625
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
626
  /** Function to write sample to sample compare tsv's / heatmaps */
627 628 629 630
  def writeOverlap(stats: Stats,
                   function: SampleToSampleStats => Int,
                   prefix: String,
                   samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
631 632 633 634 635 636 637
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
638 639 640 641

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
642 643
      val values = for (sample2 <- samples)
        yield function(stats.samplesStats(sample1).sampleToSample(sample2))
Peter van 't Hof's avatar
Peter van 't Hof committed
644

645 646
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
647
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
648 649 650 651
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
652 653

    plotHeatmap(relFile)
654 655
  }

656
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
657
  def plotHeatmap(file: File) {
658 659 660 661 662 663 664 665 666
    executeRscript(
      "plotHeatmap.R",
      Array(
        file.getAbsolutePath,
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"
      )
    )
Peter van 't Hof's avatar
Peter van 't Hof committed
667 668
  }

669
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
670
  def plotLine(file: File) {
671 672 673
    executeRscript(
      "plotXY.R",
      Array(file.getAbsolutePath, file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
674 675
  }

676
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
677 678 679
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
680
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
681 682 683 684 685 686
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

687
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
688 689 690 691 692 693 694 695 696
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
Peter van 't Hof's avatar
Peter van 't Hof committed
697 698
        logger.warn("Failed: " + command)
        logger.debug(e)
699
    }
700 701
  }
}