VcfStats.scala 27 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{File, FileOutputStream, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.{Allele, Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.{FastaUtils, ToolCommand}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29 30
import scala.util.Random

Peter van 't Hof's avatar
Peter van 't Hof committed
31 32 33 34
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{ Await, Future }

35
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
36 37
 * This tool will generate statistics from a vcf file
 *
38 39 40
 * Created by pjvan_thof on 1/10/15.
 */
object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
41
  /** Commandline argument */
42 43
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
44
                  referenceFile: File = null,
45 46 47 48
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52 53
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
54

Peter van 't Hof's avatar
Peter van 't Hof committed
55
  private val generalWiggleOptions = List("Total", "Biallelic", "ComplexIndel", "Filtered", "FullyDecoded", "Indel", "Mixed",
56 57 58 59
    "MNP", "MonomorphicInSamples", "NotFiltered", "PointEvent", "PolymorphicInSamples",
    "SimpleDeletion", "SimpleInsertion", "SNP", "StructuralIndel", "Symbolic",
    "SymbolicOrSV", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
60
  private val genotypeWiggleOptions = List("Total", "Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall", "NonInformative",
61 62
    "Available", "Called", "Filtered", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
63
  /** Parsing commandline arguments */
64
  class OptParser extends AbstractOptParser {
65
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
66
      c.copy(inputFile = x)
67 68 69 70
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
71
      c.copy(referenceFile = x)
72 73 74 75
    } validate {
      x => if (x.exists) success else failure("Reference file required")
    } text "Fasta reference which was used to call input VCF (required)"
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
76
      c.copy(outputDir = x)
77 78 79
    } validate {
      x => if (x == null) failure("Output directory required") else success
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
81
      c.copy(intervals = Some(x))
82
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
83
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
84
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
85
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
86
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
87
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
88
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
89 90
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
91
    } text "Summarize all info tags. Default false"
92 93
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
94
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
95 96
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
97
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
98 99
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
100
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
101 102
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
103
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
104
      x => if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
105 106
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
107 108
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
109 110
    } validate {
      x => if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
111 112
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
113 114
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
115
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
116

117 118
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

119
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
120

Peter van 't Hof's avatar
Peter van 't Hof committed
121 122 123
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

124 125 126 127 128 129
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
130
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
131

Peter van 't Hof's avatar
Peter van 't Hof committed
132
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
133 134 135
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
136 137
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
138 139
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
140
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
141 142 143 144
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
145
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
146 147 148 149
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
150

151
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
152
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
153
    ) yield {
154 155
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
156
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
157
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
158
    ) yield {
159
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
160
    }).toList ::: defaultGenotypeFields
161

Peter van 't Hof's avatar
Peter van 't Hof committed
162 163
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
164
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
165
    }).combineOverlap.scatter(cmdArgs.binSize)
166 167

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
168

169
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
170

Peter van 't Hof's avatar
Peter van 't Hof committed
171 172
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
173 174 175 176 177 178

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
179
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
180
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
181 182
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
183 184 185 186 187
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
188

Peter van 't Hof's avatar
Peter van 't Hof committed
189
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
190 191 192 193 194
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
195
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
196

Peter van 't Hof's avatar
Peter van 't Hof committed
197
    // Triple for loop to not keep all bins in memory
Peter van 't Hof's avatar
Peter van 't Hof committed
198
    val statsFutures = (for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList) yield Future {
Peter van 't Hof's avatar
Peter van 't Hof committed
199
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
200
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
201
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
202 203 204 205
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

206 207
          val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
          if (!query.hasNext) {
Peter van 't Hof's avatar
Peter van 't Hof committed
208
            Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
209
            for (sample <- samples) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
210
              Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
211 212 213 214
            }
            chunkCounter += 1
          }

Peter van 't Hof's avatar
Peter van 't Hof committed
215
          for (
216
            record <- query if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
217
          ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
218
            Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
219 220
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
Peter van 't Hof's avatar
Peter van 't Hof committed
221
              Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
222 223 224 225 226 227
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
228
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
229
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
230
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
231
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
232

Peter van 't Hof's avatar
Peter van 't Hof committed
233 234
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
235

Peter van 't Hof's avatar
Peter van 't Hof committed
236 237
            stats.writeGenotypeField(samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            stats.writeField("general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
238
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
239

Peter van 't Hof's avatar
Peter van 't Hof committed
240 241
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
243
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
244
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
245
      chunkStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
246 247
    })
    val stats = statsFutures.foldLeft(createStats) { case (a,b) => a += Await.result(b, Duration.Inf) }
Peter van 't Hof's avatar
Peter van 't Hof committed
248

Peter van 't Hof's avatar
Peter van 't Hof committed
249
    logger.info("Done reading vcf records")
250

Peter van 't Hof's avatar
Peter van 't Hof committed
251
    // Writing info fields to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
252
    val infoOutputDir = new File(cmdArgs.outputDir, "infotags")
Peter van 't Hof's avatar
Peter van 't Hof committed
253
    stats.writeField("general", cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
254
    for (field <- adInfoTags.distinct.par) {
Peter van 't Hof's avatar
Peter van 't Hof committed
255
      stats.writeField(field, infoOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
256
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
257
        val chr = line.getSequenceName
Peter van 't Hof's avatar
Peter van 't Hof committed
258
        stats.writeField(field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr)
259 260 261
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
262
    // Write genotype field to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
263
    val genotypeOutputDir = new File(cmdArgs.outputDir, "genotypetags")
Peter van 't Hof's avatar
Peter van 't Hof committed
264
    stats.writeGenotypeField(samples, "general", cmdArgs.outputDir, prefix = "genotype")
Peter van 't Hof's avatar
Peter van 't Hof committed
265
    for (field <- adGenotypeTags.distinct.par) {
Peter van 't Hof's avatar
Peter van 't Hof committed
266
      stats.writeGenotypeField(samples, field, genotypeOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
267
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
268
        val chr = line.getSequenceName
Peter van 't Hof's avatar
Peter van 't Hof committed
269
        stats.writeGenotypeField(samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr)
270 271 272
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
273
    // Write sample distributions to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
274
    val sampleDistributionsOutputDir = new File(cmdArgs.outputDir, "sample_distributions")
Peter van 't Hof's avatar
Peter van 't Hof committed
275
    for (field <- sampleDistributions) {
Peter van 't Hof's avatar
Peter van 't Hof committed
276
      stats.writeField("SampleDistribution-" + field, sampleDistributionsOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
277 278 279
    }

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
280 281
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
282
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
283 284
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
285
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
286 287
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
288
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
289 290
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
291 292
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
293 294

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
295 296
  }

297
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
298
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
299
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
300
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
301 302
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
303
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
304 305 306
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
307
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
308 309
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
310 311
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
312 313 314 315 316
      }
    }
    writer.close()
  }

317 318 319 320 321 322 323
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
324 325 326 327 328 329
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
330
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
331 332

    value.collect { case x => x.split("\t")(index) }
333 334
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
335
  /** Give back the number of alleles that overlap */
336 337 338 339 340 341 342 343 344 345 346
  def alleleOverlap(g1: List[Allele], g2: List[Allele], start: Int = 0): Int = {
    if (g1.isEmpty) start
    else {
      val found = g2.contains(g1.head)
      val g2tail = if (found) {
        val index = g2.indexOf(g1.head)
        g2.drop(index + 1) ++ g2.take(index)
      } else g2

      alleleOverlap(g1.tail, g2tail, if (found) start + 1 else start)
    }
347 348
  }

349 350 351 352 353 354 355 356 357
  protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Sander Bollen's avatar
Sander Bollen committed
358
    buffer += "QUAL" -> Map("not set" -> 1)
359

Sander Bollen's avatar
Sander Bollen committed
360 361 362 363 364 365 366 367 368 369 370 371
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401

    addToBuffer("general", "Total", found = true)
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)
  }

402
  /** Function to check all general stats, all info expect sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
403
  protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
404 405
    val buffer = mutable.Map[String, Map[Any, Int]]()

406
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
407
      val map = buffer.getOrElse(key, Map())
408
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
409
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
410 411
    }

Sander Bollen's avatar
Sander Bollen committed
412
    buffer += "QUAL" -> Map(Math.round(record.getPhredScaledQual) -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
413

Peter van 't Hof's avatar
Peter van 't Hof committed
414 415 416 417 418 419 420 421 422 423 424 425 426 427
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

    addToBuffer("general", "Total", found = true)
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
447 448 449
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
450
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
451 452
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
453 454
    }

455
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
456 457
  }

458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493
  protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

    buffer += "DP" -> Map("not set" -> 1)
    buffer += "GQ" -> Map("not set" -> 1)

    addToBuffer("general", "Total", found = true)
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

494
  /** Function to check sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
495
  protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
496
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
497

498
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
499
      val map = buffer.getOrElse(key, Map())
500
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
501
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
502 503
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
504 505 506
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
507 508
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
509
    addToBuffer("general", "Total", found = true)
510 511 512 513 514 515 516 517 518 519 520 521
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
522

Peter van 't Hof's avatar
Peter van 't Hof committed
523 524 525
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
526 527 528 529 530
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
531 532
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
533

Peter van 't Hof's avatar
Peter van 't Hof committed
534 535 536
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
537
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
538 539
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
540 541
    }

542
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
543 544
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
545
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
546 547
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
548 549 550 551 552 553 554
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
555 556 557 558

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
559 560
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

561 562
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
563
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
564 565 566 567
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
568 569

    plotHeatmap(relFile)
570 571
  }

572
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
573
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
574 575
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
576 577
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
578 579
  }

580
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
581
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
582 583 584 585
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

586
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
587 588 589
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
590
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
591 592 593 594 595 596
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

597 598
    logger.info("Starting: " + command)
    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
Peter van 't Hof's avatar
Peter van 't Hof committed
599
    if (process.exitValue() == 0) logger.info("Done: " + command)
600
    else {
Peter van 't Hof's avatar
Peter van 't Hof committed
601
      logger.warn("Failed: " + command)
602 603
      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
    }
604 605
  }
}