VcfStats.scala 25.1 KB
Newer Older
bow's avatar
bow committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

17
import java.io.{File, FileOutputStream, IOException, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
23
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import scala.concurrent.duration._
31
import scala.concurrent.{Await, Future}
32
import scala.concurrent.ExecutionContext.Implicits.global
Peter van 't Hof's avatar
Peter van 't Hof committed
33 34 35
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

36
/**
37 38 39 40
  * This tool will generate statistics from a vcf file
  *
  * Created by pjvan_thof on 1/10/15.
  */
41
object VcfStats extends ToolCommand {
42

43
  val generalWiggleOptions = List(
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
    "Total",
    "Biallelic",
    "ComplexIndel",
    "Filtered",
    "FullyDecoded",
    "Indel",
    "Mixed",
    "MNP",
    "MonomorphicInSamples",
    "NotFiltered",
    "PointEvent",
    "PolymorphicInSamples",
    "SimpleDeletion",
    "SimpleInsertion",
    "SNP",
    "StructuralIndel",
    "Symbolic",
    "SymbolicOrSV",
    "Variant"
  )

65
  val genotypeWiggleOptions = List("Total",
66 67 68 69 70 71 72 73 74 75 76 77
                                   "Het",
                                   "HetNonRef",
                                   "Hom",
                                   "HomRef",
                                   "HomVar",
                                   "Mixed",
                                   "NoCall",
                                   "NonInformative",
                                   "Available",
                                   "Called",
                                   "Filtered",
                                   "Variant")
78

79 80
  val defaultGenotypeFields =
    List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
81

82
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
83

84 85 86 87 88 89 90 91 92 93 94 95
  val sampleDistributions = List("Het",
                                 "HetNonRef",
                                 "Hom",
                                 "HomRef",
                                 "HomVar",
                                 "Mixed",
                                 "NoCall",
                                 "NonInformative",
                                 "Available",
                                 "Called",
                                 "Filtered",
                                 "Variant")
Peter van 't Hof's avatar
Peter van 't Hof committed
96

97
  /**
98 99
    * @param args the command line arguments
    */
100 101
  def main(args: Array[String]): Unit = {
    logger.info("Started")
102 103
    val argsParser = new VcfStatsOptParser(commandName)
    val cmdArgs = argsParser.parse(args, VcfStatsArgs()) getOrElse (throw new IllegalArgumentException)
104

Peter van 't Hof's avatar
Peter van 't Hof committed
105 106 107
    logger.info("Init spark context")

    val conf = new SparkConf()
108
      .setAppName(commandName)
Peter van 't Hof's avatar
Peter van 't Hof committed
109 110 111 112 113
      .setMaster(cmdArgs.sparkMaster.getOrElse(s"local[${cmdArgs.localThreads}]"))
    val sparkContext = new SparkContext(conf)

    logger.info("Spark context is up")

Peter van 't Hof's avatar
Peter van 't Hof committed
114
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
115 116 117
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
118 119
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
120
    val adInfoTags = {
121 122 123
      (for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
        require(header.getInfoHeaderLine(infoTag) != null,
                "Info tag '" + infoTag + "' not found in header of vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
124
        infoTag
125 126 127
      }) ::: (for (line <- header.getInfoHeaderLines if cmdArgs.allInfoTags
                   if !defaultInfoFields.contains(line.getID)
                   if !cmdArgs.infoTags.contains(line.getID)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
128 129 130
        line.getID
      }).toList ::: defaultInfoFields
    }
131

132 133 134 135
    val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
                               if !defaultGenotypeFields.contains(genotypeTag)) yield {
      require(header.getFormatHeaderLine(genotypeTag) != null,
              "Format tag '" + genotypeTag + "' not found in header of vcf file")
136
      genotypeTag
137 138 139 140
    }) ::: (for (line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags
                 if !defaultGenotypeFields.contains(line.getID)
                 if !cmdArgs.genotypeTags.contains(line.getID)
                 if line.getID != "PL") yield {
141
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
142
    }).toList ::: defaultGenotypeFields
143

Peter van 't Hof's avatar
Peter van 't Hof committed
144
    val bedRecords = (cmdArgs.intervals match {
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146
      case Some(i) =>
        BedRecordList.fromFile(i).validateContigs(cmdArgs.referenceFile)
147
      case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
148
    }).combineOverlap.scatter(cmdArgs.binSize)
149

Peter van 't Hof's avatar
Peter van 't Hof committed
150 151
    val intervals: List[Interval] =
      BedRecordList.fromList(bedRecords.flatten).toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
152

153
    val totalBases = bedRecords.flatten.map(_.length.toLong).sum
Peter van 't Hof's avatar
Peter van 't Hof committed
154

Peter van 't Hof's avatar
Peter van 't Hof committed
155 156
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
157 158 159

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
160

Peter van 't Hof's avatar
Peter van 't Hof committed
161
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
162 163 164 165 166
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
167
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
168

Peter van 't Hof's avatar
Peter van 't Hof committed
169
    // Triple for loop to not keep all bins in memory
170
    val statsFutures = for (intervals <- Random
Peter van 't Hof's avatar
Peter van 't Hof committed
171
                              .shuffle(bedRecords))
172 173 174 175 176 177
      yield
        Future {
          val chunkStats = for (intervals <- intervals.grouped(25)) yield {
            val binStats = for (interval <- intervals.par) yield {
              val reader = new VCFFileReader(cmdArgs.inputFile, true)
              var chunkCounter = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
178
              val stats = Stats.emptyStats(samples)
179 180
              logger.info("Starting on: " + interval)

Peter van 't Hof's avatar
Peter van 't Hof committed
181 182
              val samInterval = interval.toSamInterval

Peter van 't Hof's avatar
Peter van 't Hof committed
183 184
              val query =
                reader.query(samInterval.getContig, samInterval.getStart, samInterval.getEnd)
185 186 187 188 189 190 191
              if (!query.hasNext) {
                Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
                for (sample <- samples) yield {
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats,
                                            fillGenotype(adGenotypeTags))
                }
                chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
192
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
193

Peter van 't Hof's avatar
Peter van 't Hof committed
194
              for (record <- query if record.getStart <= samInterval.getEnd) {
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
                Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
                for (sample1 <- samples) yield {
                  val genotype = record.getGenotype(sample1)
                  Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
                                            checkGenotype(record, genotype, adGenotypeTags))
                  for (sample2 <- samples) {
                    val genotype2 = record.getGenotype(sample2)
                    if (genotype.getAlleles == genotype2.getAlleles)
                      stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                    stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils
                      .alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
                  }
                }
                chunkCounter += 1
              }
              reader.close()

              if (cmdArgs.writeBinStats) {
                val binOutputDir =
Peter van 't Hof's avatar
Peter van 't Hof committed
214
                  new File(cmdArgs.outputDir, "bins" + File.separator + samInterval.getContig)
215 216 217 218 219

                stats.writeGenotypeField(
                  samples,
                  "general",
                  binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
220
                  prefix = "genotype-" + samInterval.getStart + "-" + samInterval.getEnd)
221 222
                stats.writeField("general",
                                 binOutputDir,
Peter van 't Hof's avatar
Peter van 't Hof committed
223
                                 prefix = samInterval.getStart + "-" + samInterval.getEnd)
224
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
225

Peter van 't Hof's avatar
Peter van 't Hof committed
226
              status(chunkCounter, samInterval)
227 228
              stats
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
229
            binStats.toList.fold(Stats.emptyStats(samples))(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
230
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
231
          chunkStats.toList.fold(Stats.emptyStats(samples))(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
232
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
233
    val stats = statsFutures.foldLeft(Stats.emptyStats(samples)) {
234
      case (a, b) => a += Await.result(b, Duration.Inf)
235
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
236

Peter van 't Hof's avatar
Peter van 't Hof committed
237
    logger.info("Done reading vcf records")
238

Peter van 't Hof's avatar
Rename  
Peter van 't Hof committed
239
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
240 241 242 243 244 245 246
    val json = ConfigUtils.mapToJson(
      stats.getAllStats(
        FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList,
        samples,
        adGenotypeTags,
        adInfoTags,
        sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
247
    allWriter.println(json.nospaces)
248
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
249 250

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
251 252
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
253
      writeWiggle(intervals, field, "count", file, genotype = false, cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
254 255
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
256
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
257
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
258 259
      val file = new File(cmdArgs.outputDir,
                          "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
260
      writeWiggle(intervals, field, sample, file, genotype = true, cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
261 262
    }

263 264 265 266 267 268 269 270
    writeOverlap(stats,
                 _.genotypeOverlap,
                 cmdArgs.outputDir + "/sample_compare/genotype_overlap",
                 samples)
    writeOverlap(stats,
                 _.alleleOverlap,
                 cmdArgs.outputDir + "/sample_compare/allele_overlap",
                 samples)
271

Peter van 't Hof's avatar
Peter van 't Hof committed
272
    sparkContext.stop()
273
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
274 275
  }

276
  //FIXME: does only work correct for reference and not with a bed file
277 278 279 280
  protected def writeWiggle(intervals: List[Interval],
                            row: String,
                            column: String,
                            outputFile: File,
281 282
                            genotype: Boolean,
                            outputDir: File): Unit = {
283 284
    val groupedIntervals =
      intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
285
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
286 287
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
288
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
289 290 291
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
292
        val file = {
293 294
          if (genotype)
            new File(
295
              outputDir,
296 297 298
              "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else
            new File(
299
              outputDir,
300
              "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
301 302
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
303 304 305 306 307
      }
    }
    writer.close()
  }

308
  /**
309 310 311 312 313 314
    * Gets single value from a tsv file
    * @param file Input tsv file
    * @param row Row id
    * @param column column id
    * @return value
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
315 316 317 318 319 320
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
321
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
322 323

    value.collect { case x => x.split("\t")(index) }
324 325
  }

326 327
  protected[tools] def fillGeneral(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
328 329 330 331 332 333 334 335
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
336
    addToBuffer("QUAL", "not set", found = false)
337

Sander Bollen's avatar
Sander Bollen committed
338 339 340 341 342 343 344 345 346 347 348 349
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
350

Peter van 't Hof's avatar
Peter van 't Hof committed
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Biallelic", found = false)
    addToBuffer("general", "ComplexIndel", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "FullyDecoded", found = false)
    addToBuffer("general", "Indel", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "MNP", found = false)
    addToBuffer("general", "MonomorphicInSamples", found = false)
    addToBuffer("general", "NotFiltered", found = false)
    addToBuffer("general", "PointEvent", found = false)
    addToBuffer("general", "PolymorphicInSamples", found = false)
    addToBuffer("general", "SimpleDeletion", found = false)
    addToBuffer("general", "SimpleInsertion", found = false)
    addToBuffer("general", "SNP", found = false)
    addToBuffer("general", "StructuralIndel", found = false)
    addToBuffer("general", "Symbolic", found = false)
    addToBuffer("general", "SymbolicOrSV", found = false)
    addToBuffer("general", "Variant", found = false)
370 371 372 373

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
374
      addToBuffer(tag, "not set", found = false)
375 376 377 378 379
    }

    Map("total" -> buffer.toMap)
  }

380
  /** Function to check all general stats, all info expect sample/genotype specific stats */
381 382 383
  protected[tools] def checkGeneral(
      record: VariantContext,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
384 385
    val buffer = mutable.Map[String, Map[Any, Int]]()

386
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
387
      val map = buffer.getOrElse(key, Map())
388
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
389
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
390 391
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
392
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
393

394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
    addToBuffer("SampleDistribution-Het",
                record.getGenotypes.count(genotype => genotype.isHet),
                found = true)
    addToBuffer("SampleDistribution-HetNonRef",
                record.getGenotypes.count(genotype => genotype.isHetNonRef),
                found = true)
    addToBuffer("SampleDistribution-Hom",
                record.getGenotypes.count(genotype => genotype.isHom),
                found = true)
    addToBuffer("SampleDistribution-HomRef",
                record.getGenotypes.count(genotype => genotype.isHomRef),
                found = true)
    addToBuffer("SampleDistribution-HomVar",
                record.getGenotypes.count(genotype => genotype.isHomVar),
                found = true)
    addToBuffer("SampleDistribution-Mixed",
                record.getGenotypes.count(genotype => genotype.isMixed),
                found = true)
    addToBuffer("SampleDistribution-NoCall",
                record.getGenotypes.count(genotype => genotype.isNoCall),
                found = true)
    addToBuffer("SampleDistribution-NonInformative",
                record.getGenotypes.count(genotype => genotype.isNonInformative),
                found = true)
    addToBuffer("SampleDistribution-Available",
                record.getGenotypes.count(genotype => genotype.isAvailable),
                found = true)
    addToBuffer("SampleDistribution-Called",
                record.getGenotypes.count(genotype => genotype.isCalled),
                found = true)
    addToBuffer("SampleDistribution-Filtered",
                record.getGenotypes.count(genotype => genotype.isFiltered),
                found = true)
    addToBuffer("SampleDistribution-Variant",
                record.getGenotypes.count(genotype =>
                  genotype.isHetNonRef || genotype.isHet || genotype.isHomVar),
                found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
431

Peter van 't Hof's avatar
Peter van 't Hof committed
432
    addToBuffer("general", "Total", found = true)
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
452 453 454
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
455
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
456 457
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
458 459
    }

460
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
461 462
  }

463 464
  protected[tools] def fillGenotype(
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
465 466 467 468 469 470 471 472
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488
    addToBuffer("DP", "not set", found = false)
    addToBuffer("GQ", "not set", found = false)

    addToBuffer("general", "Total", found = false)
    addToBuffer("general", "Het", found = false)
    addToBuffer("general", "HetNonRef", found = false)
    addToBuffer("general", "Hom", found = false)
    addToBuffer("general", "HomRef", found = false)
    addToBuffer("general", "HomVar", found = false)
    addToBuffer("general", "Mixed", found = false)
    addToBuffer("general", "NoCall", found = false)
    addToBuffer("general", "NonInformative", found = false)
    addToBuffer("general", "Available", found = false)
    addToBuffer("general", "Called", found = false)
    addToBuffer("general", "Filtered", found = false)
    addToBuffer("general", "Variant", found = false)
489 490 491 492 493 494 495 496 497 498 499

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

500
  /** Function to check sample/genotype specific stats */
501 502 503 504
  protected[tools] def checkGenotype(
      record: VariantContext,
      genotype: Genotype,
      additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
505
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
506

507
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
508
      val map = buffer.getOrElse(key, Map())
509
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
510
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
511 512
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
513 514 515
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

516 517
    val usedAlleles =
      (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
518

Peter van 't Hof's avatar
Peter van 't Hof committed
519
    addToBuffer("general", "Total", found = true)
520 521 522 523 524 525 526 527 528 529 530 531
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
532

Peter van 't Hof's avatar
Peter van 't Hof committed
533 534 535
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
536 537 538 539 540
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
541 542
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
543

Peter van 't Hof's avatar
Peter van 't Hof committed
544 545 546
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
547
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
548 549
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
550 551
    }

552
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
553 554
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
555
  /** Function to write sample to sample compare tsv's / heatmaps */
556 557 558
  def writeOverlap(stats: Stats,
                   function: SampleToSampleStats => Int,
                   prefix: String,
559 560
                   samples: List[String],
                   plots: Boolean = true): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
561 562 563 564 565 566 567
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
568 569 570 571

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
572 573
      val values = for (sample2 <- samples)
        yield function(stats.samplesStats(sample1).sampleToSample(sample2))
Peter van 't Hof's avatar
Peter van 't Hof committed
574

575 576
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
577
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
578 579 580 581
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
582

583
    if (plots) plotHeatmap(relFile)
584 585
  }

586
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
587
  def plotHeatmap(file: File) {
588 589 590 591 592 593 594 595 596
    executeRscript(
      "plotHeatmap.R",
      Array(
        file.getAbsolutePath,
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
        file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"
      )
    )
Peter van 't Hof's avatar
Peter van 't Hof committed
597 598
  }

599
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
600
  def plotLine(file: File) {
601 602 603
    executeRscript(
      "plotXY.R",
      Array(file.getAbsolutePath, file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
604 605
  }

606
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
607 608 609
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
610
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
611 612 613 614 615 616
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

617
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
618 619 620 621 622 623 624 625 626
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
Peter van 't Hof's avatar
Peter van 't Hof committed
627 628
        logger.warn("Failed: " + command)
        logger.debug(e)
629
    }
630 631
  }
}