VcfStats.scala 26.6 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{File, FileOutputStream, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.{FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29 30
import scala.util.Random

Peter van 't Hof's avatar
Peter van 't Hof committed
31 32
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
Peter van 't Hof's avatar
Peter van 't Hof committed
33
import scala.concurrent.{Await, Future}
Peter van 't Hof's avatar
Peter van 't Hof committed
34

35
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
36 37
 * This tool will generate statistics from a vcf file
 *
38 39 40
 * Created by pjvan_thof on 1/10/15.
 */
object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
41
  /** Commandline argument */
42 43
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
44
                  referenceFile: File = null,
45 46 47 48
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52 53
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
54

Peter van 't Hof's avatar
Peter van 't Hof committed
55
  private val generalWiggleOptions = List("Total", "Biallelic", "ComplexIndel", "Filtered", "FullyDecoded", "Indel", "Mixed",
56 57 58 59
    "MNP", "MonomorphicInSamples", "NotFiltered", "PointEvent", "PolymorphicInSamples",
    "SimpleDeletion", "SimpleInsertion", "SNP", "StructuralIndel", "Symbolic",
    "SymbolicOrSV", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
60
  private val genotypeWiggleOptions = List("Total", "Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall", "NonInformative",
61 62
    "Available", "Called", "Filtered", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
63
  /** Parsing commandline arguments */
64
  class OptParser extends AbstractOptParser {
65
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
66
      c.copy(inputFile = x)
67 68 69 70
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
71
      c.copy(referenceFile = x)
72 73 74 75
    } validate {
      x => if (x.exists) success else failure("Reference file required")
    } text "Fasta reference which was used to call input VCF (required)"
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
76
      c.copy(outputDir = x)
77 78 79
    } validate {
      x => if (x == null) failure("Output directory required") else success
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
81
      c.copy(intervals = Some(x))
82
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
83
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
84
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
85
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
86
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
87
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
88
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
89 90
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
91
    } text "Summarize all info tags. Default false"
92 93
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
94
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
95 96
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
97
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
98 99
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
100
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
101 102
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
103
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
104
      x => if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
105 106
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
107 108
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
109 110
    } validate {
      x => if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
111 112
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
113 114
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
115
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
116

117 118
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

119
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
120

Peter van 't Hof's avatar
Peter van 't Hof committed
121 122 123
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

124 125 126 127 128 129
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
130
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
131

Peter van 't Hof's avatar
Peter van 't Hof committed
132
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
133 134 135
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
136 137
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
138 139
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
140
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
141 142 143 144
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
145
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
146 147 148 149
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
150

151
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
152
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
153
    ) yield {
154 155
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
156
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
157
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
158
    ) yield {
159
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
160
    }).toList ::: defaultGenotypeFields
161

Peter van 't Hof's avatar
Peter van 't Hof committed
162 163
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
164
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
165
    }).combineOverlap.scatter(cmdArgs.binSize)
166 167

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
168

169
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
170

Peter van 't Hof's avatar
Peter van 't Hof committed
171 172
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
173 174 175 176 177 178

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
179
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
180
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
181 182
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
183 184 185 186 187
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
188

Peter van 't Hof's avatar
Peter van 't Hof committed
189
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
190 191 192 193 194
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
195
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
196

Peter van 't Hof's avatar
Peter van 't Hof committed
197
    // Triple for loop to not keep all bins in memory
198
    val statsFutures = for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList) yield Future {
Peter van 't Hof's avatar
Peter van 't Hof committed
199
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
200
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
201
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
202 203 204 205
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

206 207
          val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
          if (!query.hasNext) {
Peter van 't Hof's avatar
Peter van 't Hof committed
208
            Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
209
            for (sample <- samples) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
210
              Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
211 212 213 214
            }
            chunkCounter += 1
          }

Peter van 't Hof's avatar
Peter van 't Hof committed
215
          for (
216
            record <- query if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
217
          ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
218
            Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
219 220
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
Peter van 't Hof's avatar
Peter van 't Hof committed
221
              Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
222 223 224 225
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
226
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils.alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
227
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
228
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
229
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
230
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
231
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
232

Peter van 't Hof's avatar
Peter van 't Hof committed
233 234
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
235

Peter van 't Hof's avatar
Peter van 't Hof committed
236 237
            stats.writeGenotypeField(samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            stats.writeField("general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
238
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
239

Peter van 't Hof's avatar
Peter van 't Hof committed
240 241
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
243
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
244
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
245
      chunkStats.toList.fold(createStats)(_ += _)
246
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
247
    val stats = statsFutures.foldLeft(createStats) { case (a,b) => a += Await.result(b, Duration.Inf) }
Peter van 't Hof's avatar
Peter van 't Hof committed
248

Peter van 't Hof's avatar
Peter van 't Hof committed
249
    logger.info("Done reading vcf records")
250

Peter van 't Hof's avatar
Peter van 't Hof committed
251
    // Writing info fields to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
252
    val infoOutputDir = new File(cmdArgs.outputDir, "infotags")
Peter van 't Hof's avatar
Peter van 't Hof committed
253
    stats.writeField("general", cmdArgs.outputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
254
    for (field <- adInfoTags.distinct.par) {
Peter van 't Hof's avatar
Peter van 't Hof committed
255
      stats.writeField(field, infoOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
256
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
257
        val chr = line.getSequenceName
Peter van 't Hof's avatar
Peter van 't Hof committed
258
        stats.writeField(field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr)
259 260 261
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
262
    // Write genotype field to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
263
    val genotypeOutputDir = new File(cmdArgs.outputDir, "genotypetags")
Peter van 't Hof's avatar
Peter van 't Hof committed
264
    stats.writeGenotypeField(samples, "general", cmdArgs.outputDir, prefix = "genotype")
Peter van 't Hof's avatar
Peter van 't Hof committed
265
    for (field <- adGenotypeTags.distinct.par) {
Peter van 't Hof's avatar
Peter van 't Hof committed
266
      stats.writeGenotypeField(samples, field, genotypeOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
267
      for (line <- FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences) {
Peter van 't Hof's avatar
Peter van 't Hof committed
268
        val chr = line.getSequenceName
Peter van 't Hof's avatar
Peter van 't Hof committed
269
        stats.writeGenotypeField(samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr)
270 271 272
      }
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
273
    // Write sample distributions to tsv files
Peter van 't Hof's avatar
Peter van 't Hof committed
274
    val sampleDistributionsOutputDir = new File(cmdArgs.outputDir, "sample_distributions")
Peter van 't Hof's avatar
Peter van 't Hof committed
275
    for (field <- sampleDistributions) {
Peter van 't Hof's avatar
Peter van 't Hof committed
276
      stats.writeField("SampleDistribution-" + field, sampleDistributionsOutputDir)
Peter van 't Hof's avatar
Peter van 't Hof committed
277 278 279
    }

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
280 281
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
282
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
283 284
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
285
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
286 287
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
288
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
289 290
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
291 292
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
293 294

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
295 296
  }

297
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
298
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
299
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
300
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
301 302
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
303
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
304 305 306
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
307
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
308 309
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
310 311
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
312 313 314 315 316
      }
    }
    writer.close()
  }

317 318 319 320 321 322 323
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
324 325 326 327 328 329
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
330
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
331 332

    value.collect { case x => x.split("\t")(index) }
333 334
  }

335 336 337 338 339 340 341 342 343
  protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
344
    addToBuffer("QUAL", "not set", false)
345

Sander Bollen's avatar
Sander Bollen committed
346 347 348 349 350 351 352 353 354 355 356 357
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
358

Peter van 't Hof's avatar
Peter van 't Hof committed
359
    addToBuffer("general", "Total", false)
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)
  }

388
  /** Function to check all general stats, all info expect sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
389
  protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
390 391
    val buffer = mutable.Map[String, Map[Any, Int]]()

392
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
393
      val map = buffer.getOrElse(key, Map())
394
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
395
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
396 397
    }

Sander Bollen's avatar
Sander Bollen committed
398
    buffer += "QUAL" -> Map(Math.round(record.getPhredScaledQual) -> 1)
Peter van 't Hof's avatar
Peter van 't Hof committed
399

Peter van 't Hof's avatar
Peter van 't Hof committed
400 401 402 403 404 405 406 407 408 409 410 411 412 413
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

    addToBuffer("general", "Total", found = true)
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
433 434 435
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
436
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
437 438
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
439 440
    }

441
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
442 443
  }

444 445 446 447 448 449 450 451 452
  protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
453 454
    addToBuffer("DP", "not set", false)
    addToBuffer("GQ", "not set", false)
455

Peter van 't Hof's avatar
Peter van 't Hof committed
456
    addToBuffer("general", "Total", false)
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

480
  /** Function to check sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
481
  protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
482
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
483

484
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
485
      val map = buffer.getOrElse(key, Map())
486
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
487
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
488 489
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
490 491 492
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
493 494
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
495
    addToBuffer("general", "Total", found = true)
496 497 498 499 500 501 502 503 504 505 506 507
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
508

Peter van 't Hof's avatar
Peter van 't Hof committed
509 510 511
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
512 513 514 515 516
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
517 518
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
519

Peter van 't Hof's avatar
Peter van 't Hof committed
520 521 522
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
523
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
524 525
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
526 527
    }

528
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
529 530
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
531
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
532 533
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
534 535 536 537 538 539 540
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
541 542 543 544

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
545 546
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

547 548
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
549
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
550 551 552 553
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
554 555

    plotHeatmap(relFile)
556 557
  }

558
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
559
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
560 561
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
562 563
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
564 565
  }

566
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
567
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
568 569 570 571
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

572
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
573 574 575
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
576
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
577 578 579 580 581 582
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

583 584
    logger.info("Starting: " + command)
    val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
Peter van 't Hof's avatar
Peter van 't Hof committed
585
    if (process.exitValue() == 0) logger.info("Done: " + command)
586
    else {
Peter van 't Hof's avatar
Peter van 't Hof committed
587
      logger.warn("Failed: " + command)
588 589
      if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
    }
590 591
  }
}