RegionAfCount.scala 6.29 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.io.{ PrintWriter, InputStream, File }
Peter van 't Hof's avatar
Peter van 't Hof committed
19
import java.util
Peter van 't Hof's avatar
Peter van 't Hof committed
20

Peter van 't Hof's avatar
Peter van 't Hof committed
21
import htsjdk.variant.vcf.VCFFileReader
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.utils.ToolCommand
23
import nl.lumc.sasc.biopet.utils.rscript.ScatterPlot
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
Peter van 't Hof's avatar
Peter van 't Hof committed
25 26 27 28 29 30

import scala.collection.JavaConversions._
import scala.collection.mutable

object RegionAfCount extends ToolCommand {
  case class Args(bedFile: File = null,
31 32
                  outputPrefix: String = null,
                  scatterpPlot: Boolean = false,
33
                  vcfFiles: List[File] = Nil) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
34 35

  class OptParser extends AbstractOptParser {
36
    opt[File]('b', "bedFile") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
37 38
      c.copy(bedFile = x)
    }
39 40
    opt[String]('o', "outputPrefix") unbounded () required () maxOccurs 1 valueName "<file prefix>" action { (x, c) =>
      c.copy(outputPrefix = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
41
    }
42 43
    opt[Unit]('s', "scatterPlot") unbounded () action { (x, c) =>
      c.copy(scatterpPlot = true)
44
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
45
    opt[File]('V', "vcfFile") unbounded () minOccurs 1 action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
46
      c.copy(vcfFiles = c.vcfFiles ::: x :: Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
47 48 49 50 51
    }
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
52
    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse(throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
53

Peter van 't Hof's avatar
Peter van 't Hof committed
54
    logger.info("Start")
55
    logger.info("Reading bed file")
Peter van 't Hof's avatar
Peter van 't Hof committed
56

57
    val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
Peter van 't Hof's avatar
Peter van 't Hof committed
58 59 60

    logger.info(s"Combine ${bedRecords.allRecords.size} bed records")

61
    val combinedBedRecords = bedRecords.combineOverlap
Peter van 't Hof's avatar
Peter van 't Hof committed
62 63

    logger.info(s"${combinedBedRecords.allRecords.size} left")
64
    logger.info(s"${combinedBedRecords.allRecords.size * cmdArgs.vcfFiles.size} query's to do")
65
    logger.info("Reading vcf files")
Peter van 't Hof's avatar
Peter van 't Hof committed
66

67 68 69 70 71 72
    case class AfCounts(var names: Double = 0,
                        var namesExons: Double = 0,
                        var namesIntrons: Double = 0,
                        var namesCoding: Double = 0,
                        var utr: Double = 0,
                        var utr5: Double = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
73
                        var utr3: Double = 0)
74

75
    var c = 0
76 77 78 79 80 81
    val afCounts = (for (vcfFile <- cmdArgs.vcfFiles.par) yield vcfFile -> {
      val reader = new VCFFileReader(vcfFile, true)
      val afCounts: mutable.Map[String, AfCounts] = mutable.Map()
      for (region <- combinedBedRecords.allRecords) yield {
        val originals = region.originals()
        for (variant <- reader.query(region.chr, region.start, region.end)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
82 83
          val sum = (variant.getAttribute("AF", 0) match {
            case a: util.ArrayList[_] => a.map(_.toString.toDouble).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
84
            case s                    => Array(s.toString.toDouble)
85
          }).sum
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
          val interval = BedRecord(variant.getContig, variant.getStart, variant.getEnd)
          originals.foreach { x =>
            val name = x.name.getOrElse(s"${x.chr}:${x.start}-${x.end}")
            if (!afCounts.contains(name)) afCounts += name -> AfCounts()
            afCounts(name).names += sum
            val exons = x.exons.getOrElse(Seq()).filter(_.overlapWith(interval))
            val introns = x.introns.getOrElse(Seq()).filter(_.overlapWith(interval))
            val utr5 = x.utr5.map(_.overlapWith(interval))
            val utr3 = x.utr3.map(_.overlapWith(interval))
            if (exons.nonEmpty) {
              afCounts(name).namesExons += sum
              if (!utr5.getOrElse(false) && !utr3.getOrElse(false)) afCounts(name).namesCoding += sum
            }
            if (introns.nonEmpty) afCounts(name).namesIntrons += sum
            if (utr5.getOrElse(false) || utr3.getOrElse(false)) afCounts(name).utr += sum
            if (utr5.getOrElse(false)) afCounts(name).utr5 += sum
            if (utr3.getOrElse(false)) afCounts(name).utr3 += sum
          }
104
        }
105 106 107 108 109
        c += 1
        if (c % 100 == 0) logger.info(s"$c regions done")
      }
      afCounts.toMap
    }).toMap
Peter van 't Hof's avatar
Peter van 't Hof committed
110

111
    logger.info(s"Done reading, ${c} regions")
Peter van 't Hof's avatar
Peter van 't Hof committed
112

113
    logger.info("Writing output files")
114

115 116 117 118 119 120
    def writeOutput(tsvFile: File, function: AfCounts => Double): Unit = {
      val writer = new PrintWriter(tsvFile)
      writer.println("\t" + cmdArgs.vcfFiles.map(_.getName).mkString("\t"))
      for (r <- cmdArgs.vcfFiles.foldLeft(Set[String]())((a, b) => a ++ afCounts(b).keySet)) {
        writer.print(r + "\t")
        writer.println(cmdArgs.vcfFiles.map(x => function(afCounts(x).getOrElse(r, AfCounts()))).mkString("\t"))
121
      }
122
      writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
123

124
      if (cmdArgs.scatterpPlot) generatePlot(tsvFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
125
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
126

127 128
    def generatePlot(tsvFile: File): Unit = {
      logger.info(s"Generate plot for $tsvFile")
Peter van 't Hof's avatar
Peter van 't Hof committed
129

130
      val scatterPlot = new ScatterPlot(null)
131 132
      scatterPlot.input = tsvFile
      scatterPlot.output = new File(tsvFile.getAbsolutePath.stripSuffix(".tsv") + ".png")
133 134 135 136 137
      scatterPlot.ylabel = Some("Sum of AFs")
      scatterPlot.width = Some(1200)
      scatterPlot.height = Some(1000)
      scatterPlot.runLocal()
    }
138 139 140 141 142 143 144 145 146 147 148
    for (
      arg <- List[(File, AfCounts => Double)](
        (new File(cmdArgs.outputPrefix + ".names.tsv"), _.names),
        (new File(cmdArgs.outputPrefix + ".names.exons_only.tsv"), _.namesExons),
        (new File(cmdArgs.outputPrefix + ".names.introns_only.tsv"), _.namesIntrons),
        (new File(cmdArgs.outputPrefix + ".names.coding.tsv"), _.namesCoding),
        (new File(cmdArgs.outputPrefix + ".names.utr.tsv"), _.utr),
        (new File(cmdArgs.outputPrefix + ".names.utr5.tsv"), _.utr5),
        (new File(cmdArgs.outputPrefix + ".names.utr3.tsv"), _.utr3)
      ).par
    ) writeOutput(arg._1, arg._2)
Peter van 't Hof's avatar
Peter van 't Hof committed
149

Peter van 't Hof's avatar
Peter van 't Hof committed
150
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
151 152
  }
}