VcfStats.scala 25.7 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
bow's avatar
bow committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.tools.vcfstats
16

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{File, FileOutputStream, IOException, PrintWriter}
18

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.util.Interval
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.{Genotype, VariantContext}
21
import htsjdk.variant.vcf.VCFFileReader
22
import nl.lumc.sasc.biopet.utils.intervals.BedRecordList
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.{ConfigUtils, FastaUtils, ToolCommand, VcfUtils}
Peter van 't Hof's avatar
Peter van 't Hof committed
24

25 26
import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
27
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
28
import scala.sys.process.{Process, ProcessLogger}
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.util.Random
Peter van 't Hof's avatar
Peter van 't Hof committed
30 31
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
Peter van 't Hof's avatar
Peter van 't Hof committed
32
import scala.concurrent.{Await, Future}
Peter van 't Hof's avatar
Peter van 't Hof committed
33

34
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
35 36
 * This tool will generate statistics from a vcf file
 *
37 38 39
 * Created by pjvan_thof on 1/10/15.
 */
object VcfStats extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
40
  /** Commandline argument */
41 42
  case class Args(inputFile: File = null,
                  outputDir: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
43
                  referenceFile: File = null,
44 45 46 47
                  intervals: Option[File] = None,
                  infoTags: List[String] = Nil,
                  genotypeTags: List[String] = Nil,
                  allInfoTags: Boolean = false,
Peter van 't Hof's avatar
Peter van 't Hof committed
48 49 50 51 52
                  allGenotypeTags: Boolean = false,
                  binSize: Int = 10000000,
                  writeBinStats: Boolean = false,
                  generalWiggle: List[String] = Nil,
                  genotypeWiggle: List[String] = Nil) extends AbstractArgs
53

Peter van 't Hof's avatar
Peter van 't Hof committed
54
  private val generalWiggleOptions = List("Total", "Biallelic", "ComplexIndel", "Filtered", "FullyDecoded", "Indel", "Mixed",
55 56 57 58
    "MNP", "MonomorphicInSamples", "NotFiltered", "PointEvent", "PolymorphicInSamples",
    "SimpleDeletion", "SimpleInsertion", "SNP", "StructuralIndel", "Symbolic",
    "SymbolicOrSV", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  private val genotypeWiggleOptions = List("Total", "Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall", "NonInformative",
60 61
    "Available", "Called", "Filtered", "Variant")

Peter van 't Hof's avatar
Peter van 't Hof committed
62
  /** Parsing commandline arguments */
63
  class OptParser extends AbstractOptParser {
64
    opt[File]('I', "inputFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
65
      c.copy(inputFile = x)
66 67 68 69
    } validate {
      x => if (x.exists) success else failure("Input VCF required")
    } text "Input VCF file (required)"
    opt[File]('R', "referenceFile") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
70
      c.copy(referenceFile = x)
71 72 73 74
    } validate {
      x => if (x.exists) success else failure("Reference file required")
    } text "Fasta reference which was used to call input VCF (required)"
    opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
75
      c.copy(outputDir = x)
76 77 78
    } validate {
      x => if (x == null) failure("Output directory required") else success
    } text "Path to directory for output (required)"
Peter van 't Hof's avatar
Peter van 't Hof committed
79
    opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
80
      c.copy(intervals = Some(x))
81
    } text "Path to interval (BED) file (optional)"
Peter van 't Hof's avatar
Peter van 't Hof committed
82
    opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) =>
83
      c.copy(infoTags = x :: c.infoTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
84
    } text s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})"
Peter van 't Hof's avatar
Peter van 't Hof committed
85
    opt[String]("genotypeTag") unbounded () valueName "<tag>" action { (x, c) =>
86
      c.copy(genotypeTags = x :: c.genotypeTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    } text s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})"
88 89
    opt[Unit]("allInfoTags") unbounded () action { (x, c) =>
      c.copy(allInfoTags = true)
90
    } text "Summarize all info tags. Default false"
91 92
    opt[Unit]("allGenotypeTags") unbounded () action { (x, c) =>
      c.copy(allGenotypeTags = true)
93
    } text "Summarize all genotype tags. Default false"
Peter van 't Hof's avatar
Peter van 't Hof committed
94 95
    opt[Int]("binSize") unbounded () action { (x, c) =>
      c.copy(binSize = x)
96
    } text "Binsize in estimated base pairs"
Peter van 't Hof's avatar
Peter van 't Hof committed
97 98
    opt[Unit]("writeBinStats") unbounded () action { (x, c) =>
      c.copy(writeBinStats = true)
99
    } text "Write bin statistics. Default False"
Peter van 't Hof's avatar
Peter van 't Hof committed
100 101
    opt[String]("generalWiggle") unbounded () action { (x, c) =>
      c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true)
102
    } validate {
Peter van 't Hof's avatar
Peter van 't Hof committed
103
      x => if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
104 105
    } text s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
        |${generalWiggleOptions.mkString(", ")}""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107
    opt[String]("genotypeWiggle") unbounded () action { (x, c) =>
      c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true)
108 109
    } validate {
      x => if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
Peter van 't Hof's avatar
Peter van 't Hof committed
110 111
    } text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
        |${genotypeWiggleOptions.mkString(", ")}""".stripMargin
112 113
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
114
  protected var cmdArgs: Args = _
Peter van 't Hof's avatar
Peter van 't Hof committed
115

116 117
  val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

118
  val defaultInfoFields = List("QUAL", "general", "AC", "AF", "AN", "DP")
119

Peter van 't Hof's avatar
Peter van 't Hof committed
120 121 122
  val sampleDistributions = List("Het", "HetNonRef", "Hom", "HomRef", "HomVar", "Mixed", "NoCall",
    "NonInformative", "Available", "Called", "Filtered", "Variant")

123 124 125 126 127 128
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    logger.info("Started")
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
129
    cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
130

Peter van 't Hof's avatar
Peter van 't Hof committed
131
    val reader = new VCFFileReader(cmdArgs.inputFile, true)
132 133 134
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder.toList

Peter van 't Hof's avatar
Peter van 't Hof committed
135 136
    reader.close()

Peter van 't Hof's avatar
Peter van 't Hof committed
137 138
    val adInfoTags = {
      (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
139
        infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)
Peter van 't Hof's avatar
Peter van 't Hof committed
140 141 142 143
      ) yield {
        require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file")
        infoTag
      }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
144
        line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID)
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146 147 148
      ) yield {
        line.getID
      }).toList ::: defaultInfoFields
    }
149

150
    val adGenotypeTags = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
151
      genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag)
152
    ) yield {
153 154
      require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file")
      genotypeTag
155
    }) ::: (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
156
      line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL"
157
    ) yield {
158
      line.getID
Peter van 't Hof's avatar
Peter van 't Hof committed
159
    }).toList ::: defaultGenotypeFields
160

Peter van 't Hof's avatar
Peter van 't Hof committed
161 162
    val bedRecords = (cmdArgs.intervals match {
      case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
163
      case _               => BedRecordList.fromReference(cmdArgs.referenceFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
164
    }).combineOverlap.scatter(cmdArgs.binSize)
165 166

    val intervals: List[Interval] = bedRecords.toSamIntervals.toList
Peter van 't Hof's avatar
Peter van 't Hof committed
167

168
    val totalBases = bedRecords.length
Peter van 't Hof's avatar
Peter van 't Hof committed
169

Peter van 't Hof's avatar
Peter van 't Hof committed
170 171
    // Reading vcf records
    logger.info("Start reading vcf records")
Peter van 't Hof's avatar
Peter van 't Hof committed
172 173 174 175 176 177

    def createStats: Stats = {
      val stats = new Stats
      //init stats
      for (sample1 <- samples) {
        stats.samplesStats += sample1 -> new SampleStats
178
        for (sample2 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
179
          stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
180 181
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
182 183 184 185 186
      stats
    }

    var variantCounter = 0L
    var baseCounter = 0L
Peter van 't Hof's avatar
Peter van 't Hof committed
187

Peter van 't Hof's avatar
Peter van 't Hof committed
188
    def status(count: Int, interval: Interval): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
189 190 191 192 193
      variantCounter += count
      baseCounter += interval.length()
      val fraction = baseCounter.toFloat / totalBases * 100
      logger.info(interval + " done, " + count + " rows processed")
      logger.info("total: " + variantCounter + " rows processed, " + fraction + "% done")
194
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
195

Peter van 't Hof's avatar
Peter van 't Hof committed
196
    // Triple for loop to not keep all bins in memory
197
    val statsFutures = for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList) yield Future {
Peter van 't Hof's avatar
Peter van 't Hof committed
198
      val chunkStats = for (intervals <- intervals.grouped(25)) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
199
        val binStats = for (interval <- intervals.par) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
200
          val reader = new VCFFileReader(cmdArgs.inputFile, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
201 202 203 204
          var chunkCounter = 0
          val stats = createStats
          logger.info("Starting on: " + interval)

205 206
          val query = reader.query(interval.getContig, interval.getStart, interval.getEnd)
          if (!query.hasNext) {
Peter van 't Hof's avatar
Peter van 't Hof committed
207
            Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
208
            for (sample <- samples) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
209
              Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats, fillGenotype(adGenotypeTags))
210 211 212 213
            }
            chunkCounter += 1
          }

Peter van 't Hof's avatar
Peter van 't Hof committed
214
          for (
215
            record <- query if record.getStart <= interval.getEnd
Peter van 't Hof's avatar
Peter van 't Hof committed
216
          ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
217
            Stats.mergeNestedStatsMap(stats.generalStats, checkGeneral(record, adInfoTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
218 219
            for (sample1 <- samples) yield {
              val genotype = record.getGenotype(sample1)
Peter van 't Hof's avatar
Peter van 't Hof committed
220
              Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats, checkGenotype(record, genotype, adGenotypeTags))
Peter van 't Hof's avatar
Peter van 't Hof committed
221 222 223 224
              for (sample2 <- samples) {
                val genotype2 = record.getGenotype(sample2)
                if (genotype.getAlleles == genotype2.getAlleles)
                  stats.samplesStats(sample1).sampleToSample(sample2).genotypeOverlap += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
225
                stats.samplesStats(sample1).sampleToSample(sample2).alleleOverlap += VcfUtils.alleleOverlap(genotype.getAlleles.toList, genotype2.getAlleles.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
226
              }
Peter van 't Hof's avatar
Peter van 't Hof committed
227
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
228
            chunkCounter += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
229
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
230
          reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
231

Peter van 't Hof's avatar
Peter van 't Hof committed
232 233
          if (cmdArgs.writeBinStats) {
            val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig)
Peter van 't Hof's avatar
Peter van 't Hof committed
234

Peter van 't Hof's avatar
Peter van 't Hof committed
235 236
            stats.writeGenotypeField(samples, "general", binOutputDir, prefix = "genotype-" + interval.getStart + "-" + interval.getEnd)
            stats.writeField("general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd)
Peter van 't Hof's avatar
Peter van 't Hof committed
237
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
238

Peter van 't Hof's avatar
Peter van 't Hof committed
239 240
          status(chunkCounter, interval)
          stats
Peter van 't Hof's avatar
Peter van 't Hof committed
241
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
242
        binStats.toList.fold(createStats)(_ += _)
Peter van 't Hof's avatar
Peter van 't Hof committed
243
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
244
      chunkStats.toList.fold(createStats)(_ += _)
245
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
246
    val stats = statsFutures.foldLeft(createStats) { case (a, b) => a += Await.result(b, Duration.Inf) }
Peter van 't Hof's avatar
Peter van 't Hof committed
247

Peter van 't Hof's avatar
Peter van 't Hof committed
248
    logger.info("Done reading vcf records")
249

Peter van 't Hof's avatar
Rename  
Peter van 't Hof committed
250
    val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
251
    val json = ConfigUtils.mapToJson(stats.getAllStats(FastaUtils.getCachedDict(cmdArgs.referenceFile).getSequences.map(_.getSequenceName).toList, samples, adGenotypeTags, adInfoTags, sampleDistributions))
Peter van 't Hof's avatar
Peter van 't Hof committed
252
    allWriter.println(json.nospaces)
253
    allWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
254 255

    // Write general wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
256 257
    for (field <- cmdArgs.generalWiggle) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
258
      writeWiggle(intervals, field, "count", file, genotype = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
259 260
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
261
    // Write sample wiggle tracks
Peter van 't Hof's avatar
Peter van 't Hof committed
262 263
    for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
      val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
Peter van 't Hof's avatar
Peter van 't Hof committed
264
      writeWiggle(intervals, field, sample, file, genotype = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
265 266
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
267 268
    writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples)
    writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples)
269 270

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
271 272
  }

273
  //FIXME: does only work correct for reference and not with a bed file
Peter van 't Hof's avatar
Peter van 't Hof committed
274
  protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = {
275
    val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
Peter van 't Hof's avatar
Peter van 't Hof committed
276
    outputFile.getParentFile.mkdirs()
Peter van 't Hof's avatar
Peter van 't Hof committed
277 278
    val writer = new PrintWriter(outputFile)
    writer.println("track type=wiggle_0")
279
    for ((chr, intervals) <- groupedIntervals) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
280 281 282
      val length = intervals.head.length()
      writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length")
      for (interval <- intervals) {
Peter van 't Hof's avatar
Peter van 't Hof committed
283
        val file = {
Peter van 't Hof's avatar
Peter van 't Hof committed
284 285
          if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
          else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
286 287
        }
        writer.println(valueFromTsv(file, row, column).getOrElse(0))
Peter van 't Hof's avatar
Peter van 't Hof committed
288 289 290 291 292
      }
    }
    writer.close()
  }

293 294 295 296 297 298 299
  /**
   * Gets single value from a tsv file
   * @param file Input tsv file
   * @param row Row id
   * @param column column id
   * @return value
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
300 301 302 303 304 305
  def valueFromTsv(file: File, row: String, column: String): Option[String] = {
    val reader = Source.fromFile(file)
    val it = reader.getLines()
    val index = it.next().split("\t").indexOf(column)

    val value = it.find(_.startsWith(row))
306
    reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
307 308

    value.collect { case x => x.split("\t")(index) }
309 310
  }

311 312 313 314 315 316 317 318 319
  protected[tools] def fillGeneral(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
320
    addToBuffer("QUAL", "not set", false)
321

Sander Bollen's avatar
Sander Bollen committed
322 323 324 325 326 327 328 329 330 331 332 333
    addToBuffer("SampleDistribution-Het", "not set", found = false)
    addToBuffer("SampleDistribution-HetNonRef", "not set", found = false)
    addToBuffer("SampleDistribution-Hom", "not set", found = false)
    addToBuffer("SampleDistribution-HomRef", "not set", found = false)
    addToBuffer("SampleDistribution-HomVar", "not set", found = false)
    addToBuffer("SampleDistribution-Mixed", "not set", found = false)
    addToBuffer("SampleDistribution-NoCall", "not set", found = false)
    addToBuffer("SampleDistribution-NonInformative", "not set", found = false)
    addToBuffer("SampleDistribution-Available", "not set", found = false)
    addToBuffer("SampleDistribution-Called", "not set", found = false)
    addToBuffer("SampleDistribution-Filtered", "not set", found = false)
    addToBuffer("SampleDistribution-Variant", "not set", found = false)
334

Peter van 't Hof's avatar
Peter van 't Hof committed
335
    addToBuffer("general", "Total", false)
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357
    addToBuffer("general", "Biallelic", false)
    addToBuffer("general", "ComplexIndel", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "FullyDecoded", false)
    addToBuffer("general", "Indel", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "MNP", false)
    addToBuffer("general", "MonomorphicInSamples", false)
    addToBuffer("general", "NotFiltered", false)
    addToBuffer("general", "PointEvent", false)
    addToBuffer("general", "PolymorphicInSamples", false)
    addToBuffer("general", "SimpleDeletion", false)
    addToBuffer("general", "SimpleInsertion", false)
    addToBuffer("general", "SNP", false)
    addToBuffer("general", "StructuralIndel", false)
    addToBuffer("general", "Symbolic", false)
    addToBuffer("general", "SymbolicOrSV", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
358
      addToBuffer(tag, "not set", found = false)
359 360 361 362 363
    }

    Map("total" -> buffer.toMap)
  }

364
  /** Function to check all general stats, all info expect sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
365
  protected[tools] def checkGeneral(record: VariantContext, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
366 367
    val buffer = mutable.Map[String, Map[Any, Int]]()

368
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
369
      val map = buffer.getOrElse(key, Map())
370
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
371
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
372 373
    }

374
    addToBuffer("QUAL", Math.round(record.getPhredScaledQual), true)
Peter van 't Hof's avatar
Peter van 't Hof committed
375

Peter van 't Hof's avatar
Peter van 't Hof committed
376 377 378 379 380 381 382 383 384 385 386 387 388
    addToBuffer("SampleDistribution-Het", record.getGenotypes.count(genotype => genotype.isHet), found = true)
    addToBuffer("SampleDistribution-HetNonRef", record.getGenotypes.count(genotype => genotype.isHetNonRef), found = true)
    addToBuffer("SampleDistribution-Hom", record.getGenotypes.count(genotype => genotype.isHom), found = true)
    addToBuffer("SampleDistribution-HomRef", record.getGenotypes.count(genotype => genotype.isHomRef), found = true)
    addToBuffer("SampleDistribution-HomVar", record.getGenotypes.count(genotype => genotype.isHomVar), found = true)
    addToBuffer("SampleDistribution-Mixed", record.getGenotypes.count(genotype => genotype.isMixed), found = true)
    addToBuffer("SampleDistribution-NoCall", record.getGenotypes.count(genotype => genotype.isNoCall), found = true)
    addToBuffer("SampleDistribution-NonInformative", record.getGenotypes.count(genotype => genotype.isNonInformative), found = true)
    addToBuffer("SampleDistribution-Available", record.getGenotypes.count(genotype => genotype.isAvailable), found = true)
    addToBuffer("SampleDistribution-Called", record.getGenotypes.count(genotype => genotype.isCalled), found = true)
    addToBuffer("SampleDistribution-Filtered", record.getGenotypes.count(genotype => genotype.isFiltered), found = true)
    addToBuffer("SampleDistribution-Variant", record.getGenotypes.count(genotype => genotype.isHetNonRef || genotype.isHet || genotype.isHomVar), found = true)

389
    addToBuffer("general", "Total", true)
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
    addToBuffer("general", "Biallelic", record.isBiallelic)
    addToBuffer("general", "ComplexIndel", record.isComplexIndel)
    addToBuffer("general", "Filtered", record.isFiltered)
    addToBuffer("general", "FullyDecoded", record.isFullyDecoded)
    addToBuffer("general", "Indel", record.isIndel)
    addToBuffer("general", "Mixed", record.isMixed)
    addToBuffer("general", "MNP", record.isMNP)
    addToBuffer("general", "MonomorphicInSamples", record.isMonomorphicInSamples)
    addToBuffer("general", "NotFiltered", record.isNotFiltered)
    addToBuffer("general", "PointEvent", record.isPointEvent)
    addToBuffer("general", "PolymorphicInSamples", record.isPolymorphicInSamples)
    addToBuffer("general", "SimpleDeletion", record.isSimpleDeletion)
    addToBuffer("general", "SimpleInsertion", record.isSimpleInsertion)
    addToBuffer("general", "SNP", record.isSNP)
    addToBuffer("general", "StructuralIndel", record.isStructuralIndel)
    addToBuffer("general", "Symbolic", record.isSymbolic)
    addToBuffer("general", "SymbolicOrSV", record.isSymbolicOrSV)
    addToBuffer("general", "Variant", record.isVariant)

Peter van 't Hof's avatar
Peter van 't Hof committed
409 410 411
    val skipTags = List("QUAL", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
412
      val value = record.getAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
413 414
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
415 416
    }

417
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
Peter van 't Hof's avatar
Peter van 't Hof committed
418 419
  }

420 421 422 423 424 425 426 427 428
  protected[tools] def fillGenotype(additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
    val buffer = mutable.Map[String, Map[Any, Int]]()

    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
      val map = buffer.getOrElse(key, Map())
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
429 430
    addToBuffer("DP", "not set", false)
    addToBuffer("GQ", "not set", false)
431

Peter van 't Hof's avatar
Peter van 't Hof committed
432
    addToBuffer("general", "Total", false)
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
    addToBuffer("general", "Het", false)
    addToBuffer("general", "HetNonRef", false)
    addToBuffer("general", "Hom", false)
    addToBuffer("general", "HomRef", false)
    addToBuffer("general", "HomVar", false)
    addToBuffer("general", "Mixed", false)
    addToBuffer("general", "NoCall", false)
    addToBuffer("general", "NonInformative", false)
    addToBuffer("general", "Available", false)
    addToBuffer("general", "Called", false)
    addToBuffer("general", "Filtered", false)
    addToBuffer("general", "Variant", false)

    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
      addToBuffer(tag, 0, found = false)
    }

    Map("total" -> buffer.toMap)

  }

456
  /** Function to check sample/genotype specific stats */
Sander Bollen's avatar
Sander Bollen committed
457
  protected[tools] def checkGenotype(record: VariantContext, genotype: Genotype, additionalTags: List[String]): Map[String, Map[String, Map[Any, Int]]] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
458
    val buffer = mutable.Map[String, Map[Any, Int]]()
Peter van 't Hof's avatar
Peter van 't Hof committed
459

460
    def addToBuffer(key: String, value: Any, found: Boolean): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
461
      val map = buffer.getOrElse(key, Map())
462
      if (found) buffer += key -> (map + (value -> (map.getOrElse(value, 0) + 1)))
Peter van 't Hof's avatar
Peter van 't Hof committed
463
      else buffer += key -> (map + (value -> map.getOrElse(value, 0)))
Peter van 't Hof's avatar
Peter van 't Hof committed
464 465
    }

Peter van 't Hof's avatar
Peter van 't Hof committed
466 467 468
    buffer += "DP" -> Map((if (genotype.hasDP) genotype.getDP else "not set") -> 1)
    buffer += "GQ" -> Map((if (genotype.hasGQ) genotype.getGQ else "not set") -> 1)

Peter van 't Hof's avatar
Peter van 't Hof committed
469 470
    val usedAlleles = (for (allele <- genotype.getAlleles) yield record.getAlleleIndex(allele)).toList

Peter van 't Hof's avatar
Peter van 't Hof committed
471
    addToBuffer("general", "Total", found = true)
472 473 474 475 476 477 478 479 480 481 482 483
    addToBuffer("general", "Het", genotype.isHet)
    addToBuffer("general", "HetNonRef", genotype.isHetNonRef)
    addToBuffer("general", "Hom", genotype.isHom)
    addToBuffer("general", "HomRef", genotype.isHomRef)
    addToBuffer("general", "HomVar", genotype.isHomVar)
    addToBuffer("general", "Mixed", genotype.isMixed)
    addToBuffer("general", "NoCall", genotype.isNoCall)
    addToBuffer("general", "NonInformative", genotype.isNonInformative)
    addToBuffer("general", "Available", genotype.isAvailable)
    addToBuffer("general", "Called", genotype.isCalled)
    addToBuffer("general", "Filtered", genotype.isFiltered)
    addToBuffer("general", "Variant", genotype.isHetNonRef || genotype.isHet || genotype.isHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
484

Peter van 't Hof's avatar
Peter van 't Hof committed
485 486 487
    if (genotype.hasAD) {
      val ad = genotype.getAD
      for (i <- 0 until ad.size if ad(i) > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
488 489 490 491 492
        addToBuffer("AD", ad(i), found = true)
        if (i == 0) addToBuffer("AD-ref", ad(i), found = true)
        if (i > 0) addToBuffer("AD-alt", ad(i), found = true)
        if (usedAlleles.contains(i)) addToBuffer("AD-used", ad(i), found = true)
        else addToBuffer("AD-not_used", ad(i), found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
493 494
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
495

Peter van 't Hof's avatar
Peter van 't Hof committed
496 497 498
    val skipTags = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")

    for (tag <- additionalTags if !skipTags.contains(tag)) {
499
      val value = genotype.getAnyAttribute(tag)
Peter van 't Hof's avatar
Peter van 't Hof committed
500 501
      if (value == null) addToBuffer(tag, "notset", found = true)
      else addToBuffer(tag, value, found = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
502 503
    }

504
    Map(record.getContig -> buffer.toMap, "total" -> buffer.toMap)
505 506
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
507
  /** Function to write sample to sample compare tsv's / heatmaps */
Peter van 't Hof's avatar
Peter van 't Hof committed
508 509
  def writeOverlap(stats: Stats, function: SampleToSampleStats => Int,
                   prefix: String, samples: List[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
510 511 512 513 514 515 516
    val absFile = new File(prefix + ".abs.tsv")
    val relFile = new File(prefix + ".rel.tsv")

    absFile.getParentFile.mkdirs()

    val absWriter = new PrintWriter(absFile)
    val relWriter = new PrintWriter(relFile)
517 518 519 520

    absWriter.println(samples.mkString("\t", "\t", ""))
    relWriter.println(samples.mkString("\t", "\t", ""))
    for (sample1 <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
521 522
      val values = for (sample2 <- samples) yield function(stats.samplesStats(sample1).sampleToSample(sample2))

523 524
      absWriter.println(values.mkString(sample1 + "\t", "\t", ""))

Peter van 't Hof's avatar
Peter van 't Hof committed
525
      val total = function(stats.samplesStats(sample1).sampleToSample(sample1))
526 527 528 529
      relWriter.println(values.map(_.toFloat / total).mkString(sample1 + "\t", "\t", ""))
    }
    absWriter.close()
    relWriter.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
530 531

    plotHeatmap(relFile)
532 533
  }

534
  /** Plots heatmaps on target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
535
  def plotHeatmap(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
536 537
    executeRscript("plotHeatmap.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.png",
Peter van 't Hof's avatar
Peter van 't Hof committed
538 539
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.clustering.png",
      file.getAbsolutePath.stripSuffix(".tsv") + ".heatmap.dendrogram.png"))
Peter van 't Hof's avatar
Peter van 't Hof committed
540 541
  }

542
  /** Plots line graph with target tsv file */
Peter van 't Hof's avatar
Peter van 't Hof committed
543
  def plotLine(file: File) {
Peter van 't Hof's avatar
Peter van 't Hof committed
544 545 546 547
    executeRscript("plotXY.R", Array(file.getAbsolutePath,
      file.getAbsolutePath.stripSuffix(".tsv") + ".xy.png"))
  }

548
  /** Function to execute Rscript as subproces */
Peter van 't Hof's avatar
Peter van 't Hof committed
549 550 551
  def executeRscript(resource: String, args: Array[String]): Unit = {
    val is = getClass.getResourceAsStream(resource)
    val file = File.createTempFile("script.", "." + resource)
552
    file.deleteOnExit()
Peter van 't Hof's avatar
Peter van 't Hof committed
553 554 555 556 557 558
    val os = new FileOutputStream(file)
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()

    val command: String = "Rscript " + file + " " + args.mkString(" ")

559
    logger.info("Starting: " + command)
Peter van 't Hof's avatar
Peter van 't Hof committed
560 561 562 563 564 565 566 567 568
    try {
      val process = Process(command).run(ProcessLogger(x => logger.debug(x), x => logger.debug(x)))
      if (process.exitValue() == 0) logger.info("Done: " + command)
      else {
        logger.warn("Failed: " + command)
        if (!logger.isDebugEnabled) logger.warn("Use -l debug for more info")
      }
    } catch {
      case e: IOException =>
569
    }
570 571
  }
}