VcfStats.scala 25.9 KB
Newer Older
bow's avatar
bow committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{ File, FileOutputStream, IOException, PrintWriter }
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.{ Genotype, VariantContext }
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, FastaUtils, ToolCommand, VcfUtils }
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25
26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.sys.process.{ Process, ProcessLogger }
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
Peter van 't Hof's avatar
Peter van 't Hof committed
32
import scala.concurrent.{ Await, Future }
Peter van 't Hof's avatar
Peter van 't Hof committed
33

34
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
35
36
 * This tool will generate statistics from a vcf file
 *
37
38
39
 * Created by pjvan_thof on 1/10/15.
 */
object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
40
  /** Commandline argument */
41
42
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
43
                  referenceFile: File = null,
44
45
46
47
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
51
52
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
53

Peter van 't Hof's avatar
Peter van 't Hof committed
54
  private val generalWiggleOptions = List("Total", "Biallelic", "ComplexIndel", "Filtered", "FullyDecoded", "Indel", "Mixed",
55
56
57
58
    "MNP", "MonomorphicInSamples", "NotFiltered", "PointEvent", "PolymorphicInSamples",
    "SimpleDeletion", "SimpleInsertion", "SNP", "StructuralIndel", "Symbolic",
    "SymbolicOrSV", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  private val genotypeWiggleOptions = List("Total", "Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall", "NonInformative",
60
61
    "Available", "Called", "Filtered", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
62
  /** Parsing commandline arguments */
63
  class OptParser extends AbstractOptParser {
64
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
65
      c.copy(inputFile = x)
66
67
68
69
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
70
      c.copy(referenceFile = x)
71
72
73
74
    } validate {
      x => if (x.exists) success else failure("Reference file required")
    } text "Fasta reference which was used to call input VCF (required)"
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
75
      c.copy(outputDir = x)
76
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
77
      x => if (x == null) failure("Valid output directory required") else if (x.exists) success else failure(s"Output directory does not exist: $x")
78
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
79
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
80
      c.copy(intervals = Some(x))
81
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
82
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
83
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
84
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
85
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
86
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
88
89
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
90
    } text "Summarize all info tags. Default false"
91
92
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
93
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
96
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
97
98
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
99
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
100
101
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
102
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
103
      x => if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
106
107
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
108
109
    } validate {
      x => if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
112
113
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
114
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
115

116
117
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

118
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
119

Peter van 't Hof's avatar
Peter van 't Hof committed
120
121
122
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

123
124
125
126
127
128
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
129
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
130

Peter van 't Hof's avatar
Peter van 't Hof committed
131
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
132
133
134
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
135
136
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
137
138
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
139
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
140
141
142
143
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
144
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
145
146
147
148
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
149

150
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
151
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
152
    ) yield {
153
154
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
155
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
156
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
157
    ) yield {
158
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
159
    }).toList ::: defaultGenotypeFields
160

Peter van 't Hof's avatar
Peter van 't Hof committed
161
162
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
163
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
164
    }).combineOverlap.scatter(cmdArgs.binSize)
165
166

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
167

168
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
169

Peter van 't Hof's avatar
Peter van 't Hof committed
170
171
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
172
173
174
175
176
177

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
178
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
179
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
180
181
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
182
183
184
185
186
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
187

Peter van 't Hof's avatar
Peter van 't Hof committed
188
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
189
190
191
192
193
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
194
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
195

Peter van 't Hof's avatar
Peter van 't Hof committed
196
    // Triple for loop to not keep all bins in memory
197
    val statsFutures = for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList) yield Future {
Peter van 't Hof's avatar
Peter van 't Hof committed
198
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
199
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
200
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
201
202
203
204
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

205
206
          val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
          if (!query.hasNext) {
Peter van 't Hof's avatar
Peter van 't Hof committed
207
            Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
208
            for (sample <- samples) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
209
              Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
210
211
212
213
            }
            chunkCounter += 1
          }

Peter van 't Hof's avatar
Peter van 't Hof committed
214
          for (
215
            record <- query if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
216
          ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
217
            Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
218
219
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
Peter van 't Hof's avatar
Peter van 't Hof committed
220
              Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
221
222
223
224
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
225
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils.alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
226
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
227
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
228
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
229
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
230
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
231

Peter van 't Hof's avatar
Peter van 't Hof committed
232
233
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
234

Peter van 't Hof's avatar
Peter van 't Hof committed
235
236
            stats.writeGenotypeField(samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            stats.writeField("general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
237
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
238

Peter van 't Hof's avatar
Peter van 't Hof committed
239
240
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
241
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
243
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
244
      chunkStats.toList.fold(createStats)(_ += _)
245
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
246
    val stats = statsFutures.foldLeft(createStats) { case (a, b) => a += Await.result(b, Duration.Inf) }
Peter van 't Hof's avatar
Peter van 't Hof committed
247

Peter van 't Hof's avatar
Peter van 't Hof committed
248
    logger.info("Done reading vcf records")
249

Peter van 't Hof's avatar
Rename    
Peter van 't Hof committed
250
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
251
    val json = ConfigUtils.mapToJson(stats.getAllStats(FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList, samples, adGenotypeTags, adInfoTags, sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
252
    allWriter.println(json.nospaces)
253
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
254
255

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
256
257
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
258
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
259
260
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
261
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
262
263
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
264
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
265
266
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
267
268
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
269
270

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
271
272
  }

273
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
274
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
275
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
276
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
277
278
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
279
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
280
281
282
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
283
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
284
285
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
286
287
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
288
289
290
291
292
      }
    }
    writer.close()
  }

293
294
295
296
297
298
299
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
300
301
302
303
304
305
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
306
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
307
308

    value.collect { case x => x.split("\t")(index) }
309
310
  }

311
312
313
314
315
316
317
318
319
  protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
320
    addToBuffer("QUAL", "not set", false)
321

Sander Bollen's avatar
Sander Bollen committed
322
323
324
325
326
327
328
329
330
331
332
333
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
334

Peter van 't Hof's avatar
Peter van 't Hof committed
335
    addToBuffer("general", "Total", false)
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
358
      addToBuffer(tag, "not set", found = false)
359
360
361
362
363
    }

    Map("total" -> buffer.toMap)
  }

364
  /** Function to check all general stats, all info expect sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
365
  protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
366
367
    val buffer = mutable.Map[String, Map[Any, Int]]()

368
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
369
      val map = buffer.getOrElse(key, Map())
370
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
371
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
372
373
    }

374
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), true)
Peter van 't Hof's avatar
Peter van 't Hof committed
375

Peter van 't Hof's avatar
Peter van 't Hof committed
376
377
378
379
380
381
382
383
384
385
386
387
388
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

389
    addToBuffer("general", "Total", true)
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
409
410
411
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
412
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
413
414
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
415
416
    }

417
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
418
419
  }

420
421
422
423
424
425
426
427
428
  protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
429
430
    addToBuffer("DP", "not set", false)
    addToBuffer("GQ", "not set", false)
431

Peter van 't Hof's avatar
Peter van 't Hof committed
432
    addToBuffer("general", "Total", false)
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

456
  /** Function to check sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
457
  protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
458
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
459

460
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
461
      val map = buffer.getOrElse(key, Map())
462
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
463
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
464
465
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
466
467
468
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
469
470
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
471
    addToBuffer("general", "Total", found = true)
472
473
474
475
476
477
478
479
480
481
482
483
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
484

Peter van 't Hof's avatar
Peter van 't Hof committed
485
486
487
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
488
489
490
491
492
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
493
494
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
495

Peter van 't Hof's avatar
Peter van 't Hof committed
496
497
498
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
499
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
500
501
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
502
503
    }

504
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
505
506
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
507
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
508
509
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
510
511
512
513
514
515
516
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
517
518
519
520

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
521
522
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

523
524
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
525
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
526
527
528
529
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
530
531

    plotHeatmap(relFile)
532
533
  }

534
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
535
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
536
537
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
538
539
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
540
541
  }

542
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
543
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
544
545
546
547
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

548
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
549
550
551
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
552
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
553
554
555
556
557
558
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

559
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
560
561
562
563
564
565
566
567
568
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
Peter van 't Hof's avatar
Peter van 't Hof committed
569
570
        logger.warn("Failed: " + command)
        logger.debug(e)
571
    }
572
573
  }
}