RegionAfCount.scala 6.37 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12
13
14
15
16
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
package nl.lumc.sasc.biopet.tools

17
import java.io.{ File, PrintWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.util
Peter van 't Hof's avatar
Peter van 't Hof committed
19

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.vcf.VCFFileReader
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.utils.ToolCommand
22
import nl.lumc.sasc.biopet.utils.rscript.ScatterPlot
23
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
25
26

import scala.collection.JavaConversions._
import scala.collection.mutable
Peter van 't Hof's avatar
WIP    
Peter van 't Hof committed
27
28
import scala.concurrent.ExecutionContext
import scala.concurrent.ExecutionContext.Implicits.global
Peter van 't Hof's avatar
Peter van 't Hof committed
29
30
31

object RegionAfCount extends ToolCommand {
  case class Args(bedFile: File = null,
32
33
                  outputPrefix: String = null,
                  scatterpPlot: Boolean = false,
34
                  vcfFiles: List[File] = Nil) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
35
36

  class OptParser extends AbstractOptParser {
37
    opt[File]('b', "bedFile") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
38
39
      c.copy(bedFile = x)
    }
40
41
    opt[String]('o', "outputPrefix") unbounded () required () maxOccurs 1 valueName "<file prefix>" action { (x, c) =>
      c.copy(outputPrefix = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    }
43
44
    opt[Unit]('s', "scatterPlot") unbounded () action { (x, c) =>
      c.copy(scatterpPlot = true)
45
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
46
    opt[File]('V', "vcfFile") unbounded () minOccurs 1 action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
47
      c.copy(vcfFiles = c.vcfFiles ::: x :: Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
51
52
    }
  }

  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
54

Peter van 't Hof's avatar
Peter van 't Hof committed
55
    logger.info("Start")
56
    logger.info("Reading bed file")
Peter van 't Hof's avatar
Peter van 't Hof committed
57

58
    val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61

    logger.info(s"Combine ${bedRecords.allRecords.size} bed records")

62
    val combinedBedRecords = bedRecords.combineOverlap
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64

    logger.info(s"${combinedBedRecords.allRecords.size} left")
65
    logger.info(s"${combinedBedRecords.allRecords.size * cmdArgs.vcfFiles.size} query's to do")
66
    logger.info("Reading vcf files")
Peter van 't Hof's avatar
Peter van 't Hof committed
67

68
69
70
71
72
73
    case class AfCounts(var names: Double = 0,
                        var namesExons: Double = 0,
                        var namesIntrons: Double = 0,
                        var namesCoding: Double = 0,
                        var utr: Double = 0,
                        var utr5: Double = 0,
Peter van 't Hof's avatar
Peter van 't Hof committed
74
                        var utr3: Double = 0)
75

76
    var c = 0
77
78
79
80
81
82
    val afCounts = (for (vcfFile <- cmdArgs.vcfFiles.par) yield vcfFile -> {
      val reader = new VCFFileReader(vcfFile, true)
      val afCounts: mutable.Map[String, AfCounts] = mutable.Map()
      for (region <- combinedBedRecords.allRecords) yield {
        val originals = region.originals()
        for (variant <- reader.query(region.chr, region.start, region.end)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
          val sum = (variant.getAttribute("AF", 0) match {
            case a: util.ArrayList[_] => a.map(_.toString.toDouble).toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
85
            case s                    => Array(s.toString.toDouble)
86
          }).sum
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
          val interval = BedRecord(variant.getContig, variant.getStart, variant.getEnd)
          originals.foreach { x =>
            val name = x.name.getOrElse(s"${x.chr}:${x.start}-${x.end}")
            if (!afCounts.contains(name)) afCounts += name -> AfCounts()
            afCounts(name).names += sum
            val exons = x.exons.getOrElse(Seq()).filter(_.overlapWith(interval))
            val introns = x.introns.getOrElse(Seq()).filter(_.overlapWith(interval))
            val utr5 = x.utr5.map(_.overlapWith(interval))
            val utr3 = x.utr3.map(_.overlapWith(interval))
            if (exons.nonEmpty) {
              afCounts(name).namesExons += sum
              if (!utr5.getOrElse(false) && !utr3.getOrElse(false)) afCounts(name).namesCoding += sum
            }
            if (introns.nonEmpty) afCounts(name).namesIntrons += sum
            if (utr5.getOrElse(false) || utr3.getOrElse(false)) afCounts(name).utr += sum
            if (utr5.getOrElse(false)) afCounts(name).utr5 += sum
            if (utr3.getOrElse(false)) afCounts(name).utr3 += sum
          }
105
        }
106
107
108
109
110
        c += 1
        if (c % 100 == 0) logger.info(s"$c regions done")
      }
      afCounts.toMap
    }).toMap
Peter van 't Hof's avatar
Peter van 't Hof committed
111

112
    logger.info(s"Done reading, ${c} regions")
Peter van 't Hof's avatar
Peter van 't Hof committed
113

114
    logger.info("Writing output files")
115

116
117
118
119
120
121
    def writeOutput(tsvFile: File, function: AfCounts => Double): Unit = {
      val writer = new PrintWriter(tsvFile)
      writer.println("\t" + cmdArgs.vcfFiles.map(_.getName).mkString("\t"))
      for (r <- cmdArgs.vcfFiles.foldLeft(Set[String]())((a, b) => a ++ afCounts(b).keySet)) {
        writer.print(r + "\t")
        writer.println(cmdArgs.vcfFiles.map(x => function(afCounts(x).getOrElse(r, AfCounts()))).mkString("\t"))
122
      }
123
      writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
124

125
      if (cmdArgs.scatterpPlot) generatePlot(tsvFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
126
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
127

Peter van 't Hof's avatar
WIP    
Peter van 't Hof committed
128
    def generatePlot(tsvFile: File)(implicit ec: ExecutionContext): Unit = {
129
      logger.info(s"Generate plot for $tsvFile")
Peter van 't Hof's avatar
Peter van 't Hof committed
130

131
      val scatterPlot = new ScatterPlot(null)
132
133
      scatterPlot.input = tsvFile
      scatterPlot.output = new File(tsvFile.getAbsolutePath.stripSuffix(".tsv") + ".png")
134
135
136
137
138
      scatterPlot.ylabel = Some("Sum of AFs")
      scatterPlot.width = Some(1200)
      scatterPlot.height = Some(1000)
      scatterPlot.runLocal()
    }
139
140
141
142
143
144
145
146
147
148
149
    for (
      arg <- List[(File, AfCounts => Double)](
        (new File(cmdArgs.outputPrefix + ".names.tsv"), _.names),
        (new File(cmdArgs.outputPrefix + ".names.exons_only.tsv"), _.namesExons),
        (new File(cmdArgs.outputPrefix + ".names.introns_only.tsv"), _.namesIntrons),
        (new File(cmdArgs.outputPrefix + ".names.coding.tsv"), _.namesCoding),
        (new File(cmdArgs.outputPrefix + ".names.utr.tsv"), _.utr),
        (new File(cmdArgs.outputPrefix + ".names.utr5.tsv"), _.utr5),
        (new File(cmdArgs.outputPrefix + ".names.utr3.tsv"), _.utr3)
      ).par
    ) writeOutput(arg._1, arg._2)
Peter van 't Hof's avatar
Peter van 't Hof committed
150

Peter van 't Hof's avatar
Peter van 't Hof committed
151
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
152
153
  }
}