VcfStats.scala 29.3 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{ File, FileOutputStream, PrintWriter }
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19 20
import htsjdk.samtools.util.Interval
import htsjdk.variant.variantcontext.{ Allele, Genotype, VariantContext }
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.{ FastaUtils, ToolCommand }
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.sys.process.{ Process, ProcessLogger }
Peter van 't Hof's avatar
Peter van 't Hof committed
29 30
import scala.util.Random

31
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
32 33
 * This tool will generate statistics from a vcf file
 *
34 35 36
 * Created by pjvan_thof on 1/10/15.
 */
object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
37
  /** Commandline argument */
38 39
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
40
                  referenceFile: File = null,
41 42 43 44
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46 47 48 49
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
50

Peter van 't Hof's avatar
Peter van 't Hof committed
51
  private val generalWiggleOptions = List("Total", "Biallelic", "ComplexIndel", "Filtered", "FullyDecoded", "Indel", "Mixed",
52 53 54 55
    "MNP", "MonomorphicInSamples", "NotFiltered", "PointEvent", "PolymorphicInSamples",
    "SimpleDeletion", "SimpleInsertion", "SNP", "StructuralIndel", "Symbolic",
    "SymbolicOrSV", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
56
  private val genotypeWiggleOptions = List("Total", "Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall", "NonInformative",
57 58
    "Available", "Called", "Filtered", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  /** Parsing commandline arguments */
60
  class OptParser extends AbstractOptParser {
61
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
62
      c.copy(inputFile = x)
63 64 65 66
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
67
      c.copy(referenceFile = x)
68 69 70 71
    } validate {
      x => if (x.exists) success else failure("Reference file required")
    } text "Fasta reference which was used to call input VCF (required)"
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
72
      c.copy(outputDir = x)
73 74 75
    } validate {
      x => if (x == null) failure("Output directory required") else success
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
76 77
    opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(intervals = Some(x))
78
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
79
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
80
      c.copy(infoTags = x :: c.infoTags)
81
    } text "Summarize these info tags. Default is all tags"
Peter van 't Hof's avatar
Peter van 't Hof committed
82
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
83
      c.copy(genotypeTags = x :: c.genotypeTags)
84
    } text "Summarize these genotype tags. Default is all tags"
85 86
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
87
    } text "Summarize all info tags. Default false"
88 89
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
90
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
91 92
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
93
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
94 95
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
96
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
97 98
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
99
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
100
      x => if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
101 102
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
103 104
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
105 106
    } validate {
      x => if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
107 108
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
109 110
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
111
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
112

113 114
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

115
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
116

Peter van 't Hof's avatar
Peter van 't Hof committed
117 118 119
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

120 121 122 123 124 125
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
126
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
127

Peter van 't Hof's avatar
Peter van 't Hof committed
128
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
129 130 131
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
132 133
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
134 135
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
136
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
137 138 139 140
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
141
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
142 143 144 145
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
146

147
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
148
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
149
    ) yield {
150 151
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
152
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
153
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
154
    ) yield {
155
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
156
    }).toList ::: defaultGenotypeFields
157

Peter van 't Hof's avatar
Peter van 't Hof committed
158 159
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
160
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
161
    }).combineOverlap.scatter(cmdArgs.binSize)
162 163

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
164

165
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
166

Peter van 't Hof's avatar
Peter van 't Hof committed
167 168
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
169 170 171 172 173 174

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
175
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
176
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
177 178
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
179 180 181 182 183
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
184

Peter van 't Hof's avatar
Peter van 't Hof committed
185
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
186 187 188 189 190
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
191
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
192

Peter van 't Hof's avatar
Peter van 't Hof committed
193
    // Triple for loop to not keep all bins in memory
Peter van 't Hof's avatar
Peter van 't Hof committed
194
    val stats = (for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
195
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
196
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
197
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
198 199 200 201
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

202 203
          val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
          if (!query.hasNext) {
Peter van 't Hof's avatar
Peter van 't Hof committed
204
            Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
205
            for (sample <- samples) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
206
              Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
207 208 209 210
            }
            chunkCounter += 1
          }

Peter van 't Hof's avatar
Peter van 't Hof committed
211
          for (
212
            record <- query if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
213
          ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
214
            Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
215 216
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
Peter van 't Hof's avatar
Peter van 't Hof committed
217
              Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
218 219 220 221 222 223
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
224
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
225
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
226
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
227
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
228

Peter van 't Hof's avatar
Peter van 't Hof committed
229 230
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
231

Peter van 't Hof's avatar
Peter van 't Hof committed
232 233
            writeGenotypeField(stats, samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            writeField(stats, "general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
234
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
235

Peter van 't Hof's avatar
Peter van 't Hof committed
236 237
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
238
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
239
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
240
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
241
      chunkStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
242
    }).toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
243

Peter van 't Hof's avatar
Peter van 't Hof committed
244
    logger.info("Done reading vcf records")
245

Peter van 't Hof's avatar
Peter van 't Hof committed
246
    // Writing info fields to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
247 248
    val infoOutputDir = new File(cmdArgs.outputDir, "infotags")
    writeField(stats, "general", cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
249
    for (field <- adInfoTags.distinct.par) {
250
      writeField(stats, field, infoOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
251
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
252
        val chr = line.getSequenceName
253 254 255 256
        writeField(stats, field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr)
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
257
    // Write genotype field to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
258 259
    val genotypeOutputDir = new File(cmdArgs.outputDir, "genotypetags")
    writeGenotypeField(stats, samples, "general", cmdArgs.outputDir, prefix = "genotype")
Peter van 't Hof's avatar
Peter van 't Hof committed
260
    for (field <- adGenotypeTags.distinct.par) {
261
      writeGenotypeField(stats, samples, field, genotypeOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
262
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
263
        val chr = line.getSequenceName
264 265 266 267
        writeGenotypeField(stats, samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr)
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
268
    // Write sample distributions to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
269
    val sampleDistributionsOutputDir = new File(cmdArgs.outputDir, "sample_distributions")
Peter van 't Hof's avatar
Peter van 't Hof committed
270 271 272 273 274
    for (field <- sampleDistributions) {
      writeField(stats, "SampleDistribution-" + field, sampleDistributionsOutputDir)
    }

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
275 276
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
277
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
278 279
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
280
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
281 282
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
283
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
284 285
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
286 287
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
288 289

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
290 291
  }

292
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
293
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
294
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
295
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
296 297
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
298
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
299 300 301
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
302
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
303 304
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
305 306
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
307 308 309 310 311
      }
    }
    writer.close()
  }

312 313 314 315 316 317 318
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
319 320 321 322 323 324
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
325
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
326 327

    value.collect { case x => x.split("\t")(index) }
328 329
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
330
  /** Give back the number of alleles that overlap */
331 332 333 334 335 336 337 338 339 340 341
  def alleleOverlap(g1: List[Allele], g2: List[Allele], start: Int = 0): Int = {
    if (g1.isEmpty) start
    else {
      val found = g2.contains(g1.head)
      val g2tail = if (found) {
        val index = g2.indexOf(g1.head)
        g2.drop(index + 1) ++ g2.take(index)
      } else g2

      alleleOverlap(g1.tail, g2tail, if (found) start + 1 else start)
    }
342 343
  }

344 345 346 347 348 349 350 351 352
  protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Sander Bollen's avatar
Sander Bollen committed
353
    buffer += "QUAL" -> Map("not set" -> 1)
354

Sander Bollen's avatar
Sander Bollen committed
355 356 357 358 359 360 361 362 363 364 365 366
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396

    addToBuffer("general", "Total", found = true)
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)
  }

397
  /** Function to check all general stats, all info expect sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
398
  protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
399 400
    val buffer = mutable.Map[String, Map[Any, Int]]()

401
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
402
      val map = buffer.getOrElse(key, Map())
403
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
404
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
405 406
    }

Sander Bollen's avatar
Sander Bollen committed
407
    buffer += "QUAL" -> Map(Math.round(record.getPhredScaledQual) -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
408

Peter van 't Hof's avatar
Peter van 't Hof committed
409 410 411 412 413 414 415 416 417 418 419 420 421 422
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

    addToBuffer("general", "Total", found = true)
423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
442 443 444
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
445
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
446 447
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
448 449
    }

450
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
451 452
  }

453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488
  protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

    buffer += "DP" -> Map("not set" -> 1)
    buffer += "GQ" -> Map("not set" -> 1)

    addToBuffer("general", "Total", found = true)
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

489
  /** Function to check sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
490
  protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
491
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
492

493
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
494
      val map = buffer.getOrElse(key, Map())
495
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
496
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
497 498
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
499 500 501
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
502 503
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
504
    addToBuffer("general", "Total", found = true)
505 506 507 508 509 510 511 512 513 514 515 516
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
517

Peter van 't Hof's avatar
Peter van 't Hof committed
518 519 520
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
521 522 523 524 525
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
526 527
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
528

Peter van 't Hof's avatar
Peter van 't Hof committed
529 530 531
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
532
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
533 534
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
535 536
    }

537
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
538 539 540
  }

  /** Function to write 1 specific genotype field */
541 542 543 544
  protected def writeGenotypeField(stats: Stats, samples: List[String], field: String, outputDir: File,
                                   prefix: String = "", chr: String = "total"): Unit = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
545 546 547
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
548 549
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
550 551 552
    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println(samples.mkString(field + "\t", "\t", ""))
553
    val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
Peter van 't Hof's avatar
Peter van 't Hof committed
554
    for (key <- keySet.toList.sortWith(sortAnyAny)) {
555
      val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
556
      writer.println(values.mkString(key + "\t", "\t", ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
557
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
558
    writer.close()
559 560 561

    //FIXME: plotting of thise value is broken
    //plotLine(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
562 563
  }

564
  /** Function to write 1 specific general field */
565 566 567
  protected def writeField(stats: Stats, field: String, outputDir: File, prefix: String = "", chr: String = "total"): File = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
568 569 570
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
571 572 573 574
    }

    val data = stats.generalStats.getOrElse(chr, mutable.Map[String, mutable.Map[Any, Int]]()).getOrElse(field, mutable.Map[Any, Int]())

Peter van 't Hof's avatar
Peter van 't Hof committed
575 576
    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
577
    writer.println("value\tcount")
Peter van 't Hof's avatar
Peter van 't Hof committed
578
    for (key <- data.keySet.toList.sortWith(sortAnyAny)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
579 580 581 582 583 584
      writer.println(key + "\t" + data(key))
    }
    writer.close()
    file
  }

585
  /** Function to sort Any values */
Peter van 't Hof's avatar
Peter van 't Hof committed
586 587
  def sortAnyAny(a: Any, b: Any): Boolean = {
    a match {
Peter van 't Hof's avatar
Peter van 't Hof committed
588
      case ai: Int =>
Peter van 't Hof's avatar
Peter van 't Hof committed
589
        b match {
Peter van 't Hof's avatar
Peter van 't Hof committed
590 591 592
          case bi: Int    => ai < bi
          case bi: Double => ai < bi
          case _          => a.toString < b.toString
Peter van 't Hof's avatar
Peter van 't Hof committed
593 594 595 596 597
        }
      case _ => a.toString < b.toString
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
598
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
599 600
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
601 602 603 604 605 606 607
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
608 609 610 611

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
612 613
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

614 615
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
616
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
617 618 619 620
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
621 622

    plotHeatmap(relFile)
623 624
  }

625
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
626
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
627 628
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
629 630
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
631 632
  }

633
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
634
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
635 636 637 638
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

639
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
640 641 642
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
643
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
644 645 646 647 648 649
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

650 651
    logger.info("Starting: " + command)
    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
Peter van 't Hof's avatar
Peter van 't Hof committed
652
    if (process.exitValue() == 0) logger.info("Done: " + command)
653
    else {
Peter van 't Hof's avatar
Peter van 't Hof committed
654
      logger.warn("Failed: " + command)
655 656
      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
    }
657 658
  }
}