VcfStats.scala 29.4 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16 17
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.io.{ File, FileOutputStream, PrintWriter }
19

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.samtools.reference.FastaSequenceFile
Peter van 't Hof's avatar
Peter van 't Hof committed
21 22
import htsjdk.samtools.util.Interval
import htsjdk.variant.variantcontext.{ Allele, Genotype, VariantContext }
23
import htsjdk.variant.vcf.VCFFileReader
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
25 26
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
import nl.lumc.sasc.biopet.core.{ Reference, ToolCommand, ToolCommandFuntion }
27
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
28 29
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }

30 31
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
32
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
33
import scala.sys.process.{ Process, ProcessLogger }
Peter van 't Hof's avatar
Peter van 't Hof committed
34 35
import scala.util.Random

36
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
37 38
 * This tool will generate statistics from a vcf file
 *
39 40
 * Created by pjvan_thof on 1/10/15.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
41
class VcfStats(val root: Configurable) extends ToolCommandFuntion with Summarizable with Reference {
Peter van 't Hof's avatar
Peter van 't Hof committed
42 43 44 45 46
  javaMainClass = getClass.getName

  @Input(doc = "Input fastq", shortName = "I", required = true)
  var input: File = _

Peter van 't Hof's avatar
Peter van 't Hof committed
47 48 49
  @Input
  protected var index: File = null

Peter van 't Hof's avatar
Peter van 't Hof committed
50 51 52 53 54 55
  @Output
  protected var generalStats: File = null

  @Output
  protected var genotypeStats: File = null

Peter van 't Hof's avatar
Peter van 't Hof committed
56 57
  override def defaultCoreMemory = 3.0
  override def defaultThreads = 3
Peter van 't Hof's avatar
Peter van 't Hof committed
58

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  protected var outputDir: File = _
60

61 62 63 64
  var infoTags: List[String] = Nil
  var genotypeTags: List[String] = Nil
  var allInfoTags = false
  var allGenotypeTags = false
Peter van 't Hof's avatar
Peter van 't Hof committed
65
  var reference: File = _
66

Peter van 't Hof's avatar
Peter van 't Hof committed
67
  override def beforeGraph(): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
68
    reference = referenceFasta()
Peter van 't Hof's avatar
Peter van 't Hof committed
69 70 71
    index = new File(input.getAbsolutePath + ".tbi")
  }

72
  /** Set output dir and a output file */
Peter van 't Hof's avatar
Peter van 't Hof committed
73
  def setOutputDir(dir: File): Unit = {
74
    outputDir = dir
Peter van 't Hof's avatar
Peter van 't Hof committed
75
    generalStats = new File(dir, "general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
76
    genotypeStats = new File(dir, "genotype-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
77
    jobOutputFile = new File(dir, ".vcfstats.out")
78
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
79

80
  /** Creates command to execute extension */
Peter van 't Hof's avatar
Peter van 't Hof committed
81 82
  override def commandLine = super.commandLine +
    required("-I", input) +
83 84 85 86
    required("-o", outputDir) +
    repeat("--infoTag", infoTags) +
    repeat("--genotypeTag", genotypeTags) +
    conditional(allInfoTags, "--allInfoTags") +
Peter van 't Hof's avatar
Peter van 't Hof committed
87 88
    conditional(allGenotypeTags, "--allGenotypeTags") +
    required("-R", reference)
Peter van 't Hof's avatar
Peter van 't Hof committed
89

90
  /** Returns general stats to the summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
91
  def summaryStats: Map[String, Any] = {
92
    Map("info" -> (for (
93
      line <- Source.fromFile(generalStats).getLines().toList.tail;
Peter van 't Hof's avatar
Peter van 't Hof committed
94 95
      values = line.split("\t") if values.size >= 2 && !values(0).isEmpty
    ) yield values(0) -> values(1).toInt
96
    ).toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
97 98
  }

99
  /** return only general files to summary */
Peter van 't Hof's avatar
Peter van 't Hof committed
100 101 102 103
  def summaryFiles: Map[String, File] = Map(
    "general_stats" -> generalStats,
    "genotype_stats" -> genotypeStats
  )
104 105 106 107 108 109

  override def addToQscriptSummary(qscript: SummaryQScript, name: String): Unit = {
    val data = Source.fromFile(genotypeStats).getLines().map(_.split("\t")).toArray

    for (s <- 1 until data(0).size) {
      val sample = data(0)(s)
Peter van 't Hof's avatar
Peter van 't Hof committed
110
      val stats = Map("genotype" -> (for (f <- 1 until data.length) yield {
111 112 113 114 115 116 117 118 119 120 121
        data(f)(0) -> data(f)(s)
      }).toMap)

      val sum = new Summarizable {
        override def summaryFiles: Map[String, File] = Map()
        override def summaryStats: Map[String, Any] = stats
      }

      qscript.addSummarizable(sum, name, Some(sample))
    }
  }
122 123 124
}

object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
125
  /** Commandline argument */
126 127
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
128
                  referenceFile: File = null,
129 130 131 132
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
133 134 135 136 137
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
138

Peter van 't Hof's avatar
Peter van 't Hof committed
139
  /** Parsing commandline arguments */
140
  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
141
    opt[File]('I', "inputFile") required () unbounded () valueName "<file>" action { (x, c) =>
142 143
      c.copy(inputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
144
    opt[File]('R', "referenceFile") required () unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146
      c.copy(referenceFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
147
    opt[File]('o', "outputDir") required () unbounded () valueName "<file>" action { (x, c) =>
148 149
      c.copy(outputDir = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
150 151 152
    opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(intervals = Some(x))
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
153
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
154 155
      c.copy(infoTags = x :: c.infoTags)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
156
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
157 158 159 160 161 162 163 164
      c.copy(genotypeTags = x :: c.genotypeTags)
    }
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
    }
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
165 166 167 168 169 170 171 172 173 174 175 176
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
    }
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
    }
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
    }
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
    }
177 178
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
179 180 181 182 183 184 185
  /**
   * Class to store sample to sample compare stats
   * @param genotypeOverlap Number of genotypes match with other sample
   * @param alleleOverlap Number of alleles also found in other sample
   */
  case class SampleToSampleStats(var genotypeOverlap: Int = 0,
                                 var alleleOverlap: Int = 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
186
    /** Add an other class */
Peter van 't Hof's avatar
Peter van 't Hof committed
187 188 189 190 191 192
    def +=(other: SampleToSampleStats) {
      this.genotypeOverlap += other.genotypeOverlap
      this.alleleOverlap += other.alleleOverlap
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
193 194 195 196 197
  /**
   * class to store all sample relative stats
   * @param genotypeStats Stores all genotype relative stats
   * @param sampleToSample Stores sample to sample compare stats
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
198 199
  case class SampleStats(genotypeStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
                         sampleToSample: mutable.Map[String, SampleToSampleStats] = mutable.Map()) {
Peter van 't Hof's avatar
Peter van 't Hof committed
200
    /** Add an other class */
Peter van 't Hof's avatar
Peter van 't Hof committed
201 202 203 204 205
    def +=(other: SampleStats): Unit = {
      for ((key, value) <- other.sampleToSample) {
        if (this.sampleToSample.contains(key)) this.sampleToSample(key) += value
        else this.sampleToSample(key) = value
      }
206
      for ((chr, chrMap) <- other.genotypeStats; (field, fieldMap) <- chrMap) {
207
        if (!this.genotypeStats.contains(chr)) genotypeStats += (chr -> mutable.Map[String, mutable.Map[Any, Int]]())
208
        val thisField = this.genotypeStats(chr).get(field)
Peter van 't Hof's avatar
Peter van 't Hof committed
209
        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
210
        else this.genotypeStats(chr) += field -> fieldMap
Peter van 't Hof's avatar
Peter van 't Hof committed
211 212 213 214
      }
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
215 216 217 218 219
  /**
   * General stats class to store vcf stats
   * @param generalStats Stores are general stats
   * @param samplesStats Stores all sample/genotype specific stats
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
220 221
  case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]] = mutable.Map(),
                   samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
Peter van 't Hof's avatar
Peter van 't Hof committed
222
    /** Add an other class */
Peter van 't Hof's avatar
Peter van 't Hof committed
223
    def +=(other: Stats): Stats = {
Peter van 't Hof's avatar
Peter van 't Hof committed
224 225 226 227
      for ((key, value) <- other.samplesStats) {
        if (this.samplesStats.contains(key)) this.samplesStats(key) += value
        else this.samplesStats(key) = value
      }
228
      for ((chr, chrMap) <- other.generalStats; (field, fieldMap) <- chrMap) {
229
        if (!this.generalStats.contains(chr)) generalStats += (chr -> mutable.Map[String, mutable.Map[Any, Int]]())
230
        val thisField = this.generalStats(chr).get(field)
Peter van 't Hof's avatar
Peter van 't Hof committed
231
        if (thisField.isDefined) mergeStatsMap(thisField.get, fieldMap)
232
        else this.generalStats(chr) += field -> fieldMap
Peter van 't Hof's avatar
Peter van 't Hof committed
233
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
234
      this
Peter van 't Hof's avatar
Peter van 't Hof committed
235 236 237
    }
  }

238
  /** Merge m2 into m1 */
Peter van 't Hof's avatar
Peter van 't Hof committed
239 240 241 242 243
  def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
    for (key <- m2.keySet)
      m1(key) = m1.getOrElse(key, 0) + m2(key)
  }

244 245 246 247 248 249 250 251 252 253 254 255
  /** Merge m2 into m1 */
  def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[String, mutable.Map[Any, Int]]],
                          m2: Map[String, Map[String, Map[Any, Int]]]): Unit = {
    for ((chr, chrMap) <- m2; (field, fieldMap) <- chrMap) {
      if (m1.contains(chr)) {
        if (m1(chr).contains(field)) {
          for ((key, value) <- fieldMap) {
            if (m1(chr)(field).contains(key)) m1(chr)(field)(key) += value
            else m1(chr)(field)(key) = value
          }
        } else m1(chr)(field) = mutable.Map(fieldMap.toList: _*)
      } else m1(chr) = mutable.Map(field -> mutable.Map(fieldMap.toList: _*))
Peter van 't Hof's avatar
Peter van 't Hof committed
256 257
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
258

Peter van 't Hof's avatar
Peter van 't Hof committed
259
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
260

261 262
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

263
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
264

Peter van 't Hof's avatar
Peter van 't Hof committed
265 266 267
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

268 269 270 271 272 273
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
274
    cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
275

Peter van 't Hof's avatar
Peter van 't Hof committed
276
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
277 278 279
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
280 281
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
282 283
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
284
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
285 286 287 288
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
289
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
290 291 292 293
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
294

295
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
296
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
297
    ) yield {
298 299
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
300
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
301
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
302
    ) yield {
303
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
304
    }).toList ::: defaultGenotypeFields
305

Peter van 't Hof's avatar
Peter van 't Hof committed
306 307
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
308
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
309
    }).combineOverlap.scatter(cmdArgs.binSize)
310 311

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
312

313
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
314

Peter van 't Hof's avatar
Peter van 't Hof committed
315 316
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
317 318 319 320 321 322

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
323
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
324
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
325 326
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
327 328 329 330 331
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
332

Peter van 't Hof's avatar
Peter van 't Hof committed
333
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
334 335 336 337 338
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
339
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
340

Peter van 't Hof's avatar
Peter van 't Hof committed
341
    // Triple for loop to not keep all bins in memory
Peter van 't Hof's avatar
Peter van 't Hof committed
342
    val stats = (for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
343
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
344
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
345
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
346 347 348 349 350
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

          for (
351
            record <- reader.query(interval.getContig, interval.getStart, interval.getEnd) if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
352 353 354 355 356 357 358 359 360 361 362
          ) {
            mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
              mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
363
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
364
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
365
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
366
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
367

Peter van 't Hof's avatar
Peter van 't Hof committed
368 369
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
370

Peter van 't Hof's avatar
Peter van 't Hof committed
371 372
            writeGenotypeField(stats, samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            writeField(stats, "general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
373
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
374

Peter van 't Hof's avatar
Peter van 't Hof committed
375 376
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
377
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
378
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
379
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
380
      chunkStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
381
    }).toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
382

Peter van 't Hof's avatar
Peter van 't Hof committed
383
    logger.info("Done reading vcf records")
384

Peter van 't Hof's avatar
Peter van 't Hof committed
385
    // Writing info fields to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
386 387
    val infoOutputDir = new File(cmdArgs.outputDir, "infotags")
    writeField(stats, "general", cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
388
    for (field <- adInfoTags.distinct.par) {
389
      writeField(stats, field, infoOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
390
      for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
391
        val chr = line.getSequenceName
392 393 394 395
        writeField(stats, field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr)
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
396
    // Write genotype field to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
397 398
    val genotypeOutputDir = new File(cmdArgs.outputDir, "genotypetags")
    writeGenotypeField(stats, samples, "general", cmdArgs.outputDir, prefix = "genotype")
Peter van 't Hof's avatar
Peter van 't Hof committed
399
    for (field <- adGenotypeTags.distinct.par) {
400
      writeGenotypeField(stats, samples, field, genotypeOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
401
      for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
402
        val chr = line.getSequenceName
403 404 405 406
        writeGenotypeField(stats, samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr)
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
407
    // Write sample distributions to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
408
    val sampleDistributionsOutputDir = new File(cmdArgs.outputDir, "sample_distributions")
Peter van 't Hof's avatar
Peter van 't Hof committed
409 410 411 412 413
    for (field <- sampleDistributions) {
      writeField(stats, "SampleDistribution-" + field, sampleDistributionsOutputDir)
    }

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
414 415
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
416
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
417 418
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
419
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
420 421
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
422
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
423 424
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
425 426
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
427 428

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
429 430
  }

431
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
432
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
433
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
434
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
435 436
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
437
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
438 439 440
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
441
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
442 443
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
444 445
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
446 447 448 449 450
      }
    }
    writer.close()
  }

451 452 453 454 455 456 457
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
458 459 460 461 462 463
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
464
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
465 466

    value.collect { case x => x.split("\t")(index) }
467 468
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
469
  /** Give back the number of alleles that overlap */
470 471 472 473 474 475 476 477 478 479 480
  def alleleOverlap(g1: List[Allele], g2: List[Allele], start: Int = 0): Int = {
    if (g1.isEmpty) start
    else {
      val found = g2.contains(g1.head)
      val g2tail = if (found) {
        val index = g2.indexOf(g1.head)
        g2.drop(index + 1) ++ g2.take(index)
      } else g2

      alleleOverlap(g1.tail, g2tail, if (found) start + 1 else start)
    }
481 482
  }

483
  /** Function to check all general stats, all info expect sample/genotype specific stats */
484
  protected def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
485 486
    val buffer = mutable.Map[String, Map[Any, Int]]()

487
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
488
      val map = buffer.getOrElse(key, Map())
489
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
490
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
491 492
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
493
    buffer += "QUAL" -> Map(record.getPhredScaledQual -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
494

Peter van 't Hof's avatar
Peter van 't Hof committed
495 496 497 498 499 500 501 502 503 504 505 506 507 508
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

    addToBuffer("general", "Total", found = true)
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
528 529 530
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
531
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
532 533
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
534 535
    }

536
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
537 538
  }

539
  /** Function to check sample/genotype specific stats */
540
  protected def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
541
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
542

543
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
544
      val map = buffer.getOrElse(key, Map())
545
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
546
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
547 548
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
549 550 551
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
552 553
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
554
    addToBuffer("general", "Total", found = true)
555 556 557 558 559 560 561 562 563 564 565 566
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
567

Peter van 't Hof's avatar
Peter van 't Hof committed
568 569 570
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
571 572 573 574 575
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
576 577
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
578

Peter van 't Hof's avatar
Peter van 't Hof committed
579 580 581
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
582
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
583 584
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
585 586
    }

587
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
588 589 590
  }

  /** Function to write 1 specific genotype field */
591 592 593 594
  protected def writeGenotypeField(stats: Stats, samples: List[String], field: String, outputDir: File,
                                   prefix: String = "", chr: String = "total"): Unit = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
595 596 597
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
598 599
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
600 601 602
    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println(samples.mkString(field + "\t", "\t", ""))
603
    val keySet = (for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).keySet).fold(Set[Any]())(_ ++ _)
Peter van 't Hof's avatar
Peter van 't Hof committed
604
    for (key <- keySet.toList.sortWith(sortAnyAny)) {
605
      val values = for (sample <- samples) yield stats.samplesStats(sample).genotypeStats.getOrElse(chr, Map[String, Map[Any, Int]]()).getOrElse(field, Map[Any, Int]()).getOrElse(key, 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
606
      writer.println(values.mkString(key + "\t", "\t", ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
607
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
608
    writer.close()
609 610 611

    //FIXME: plotting of thise value is broken
    //plotLine(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
612 613
  }

614
  /** Function to write 1 specific general field */
615 616 617
  protected def writeField(stats: Stats, field: String, outputDir: File, prefix: String = "", chr: String = "total"): File = {
    val file = (prefix, chr) match {
      case ("", "total") => new File(outputDir, field + ".tsv")
618 619 620
      case (_, "total")  => new File(outputDir, prefix + "-" + field + ".tsv")
      case ("", _)       => new File(outputDir, chr + "-" + field + ".tsv")
      case _             => new File(outputDir, prefix + "-" + chr + "-" + field + ".tsv")
621 622 623 624
    }

    val data = stats.generalStats.getOrElse(chr, mutable.Map[String, mutable.Map[Any, Int]]()).getOrElse(field, mutable.Map[Any, Int]())

Peter van 't Hof's avatar
Peter van 't Hof committed
625 626
    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
627
    writer.println("value\tcount")
Peter van 't Hof's avatar
Peter van 't Hof committed
628
    for (key <- data.keySet.toList.sortWith(sortAnyAny)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
629 630 631 632 633 634
      writer.println(key + "\t" + data(key))
    }
    writer.close()
    file
  }

635
  /** Function to sort Any values */
Peter van 't Hof's avatar
Peter van 't Hof committed
636 637
  def sortAnyAny(a: Any, b: Any): Boolean = {
    a match {
Peter van 't Hof's avatar
Peter van 't Hof committed
638
      case ai: Int =>
Peter van 't Hof's avatar
Peter van 't Hof committed
639
        b match {
Peter van 't Hof's avatar
Peter van 't Hof committed
640 641 642
          case bi: Int    => ai < bi
          case bi: Double => ai < bi
          case _          => a.toString < b.toString
Peter van 't Hof's avatar
Peter van 't Hof committed
643 644 645 646 647
        }
      case _ => a.toString < b.toString
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
648
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
649 650
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
651 652 653 654 655 656 657
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
658 659 660 661

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
662 663
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

664 665
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
666
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
667 668 669 670
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
671 672

    plotHeatmap(relFile)
673 674
  }

675
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
676
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
677 678
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
679 680
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
681 682
  }

683
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
684
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
685 686 687 688
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

689
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
690 691 692 693 694 695 696 697 698
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

699 700
    logger.info("Starting: " + command)
    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
Peter van 't Hof's avatar
Peter van 't Hof committed
701
    if (process.exitValue() == 0) logger.info("Done: " + command)
702
    else {
Peter van 't Hof's avatar
Peter van 't Hof committed
703
      logger.warn("Failed: " + command)
704 705
      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
    }
706 707
  }
}