MpileupToVcf.scala 9.56 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18
19

import java.io.File
import java.io.PrintWriter
20
import htsjdk.samtools.SamReaderFactory
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsMpileup
25
import nl.lumc.sasc.biopet.utils.ConfigUtils
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
27
import scala.collection.mutable.ArrayBuffer
Peter van 't Hof's avatar
Peter van 't Hof committed
28
29
import scala.io.Source
import scala.math.round
Peter van 't Hof's avatar
Peter van 't Hof committed
30
import scala.math.floor
Peter van 't Hof's avatar
Peter van 't Hof committed
31
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
32
33
34

class MpileupToVcf(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName
Peter van 't Hof's avatar
Peter van 't Hof committed
35

Peter van 't Hof's avatar
Peter van 't Hof committed
36
  @Input(doc = "Input mpileup file", shortName = "mpileup", required = false)
37
  var inputMpileup: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
38

Peter van 't Hof's avatar
Peter van 't Hof committed
39
  @Input(doc = "Input bam file", shortName = "bam", required = false)
40
  var inputBam: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
41

Peter van 't Hof's avatar
Peter van 't Hof committed
42
43
  @Output(doc = "Output tag library", shortName = "output", required = true)
  var output: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
44
45
46
47
48

  var minDP: Option[Int] = config("min_dp")
  var minAP: Option[Int] = config("min_ap")
  var homoFraction: Option[Double] = config("homoFraction")
  var ploidy: Option[Int] = config("ploidy")
Peter van 't Hof's avatar
Peter van 't Hof committed
49
  var sample: String = _
Peter van 't Hof's avatar
Peter van 't Hof committed
50
  var reference: String = config("reference")
Peter van 't Hof's avatar
Peter van 't Hof committed
51

Peter van 't Hof's avatar
Peter van 't Hof committed
52
  override val defaultCoreMemory = 3.0
Peter van 't Hof's avatar
Peter van 't Hof committed
53

54
55
  override def defaults = ConfigUtils.mergeMaps(Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)),
    super.defaults)
Peter van 't Hof's avatar
Peter van 't Hof committed
56

57
58
  override def beforeGraph {
    super.beforeGraph
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
    val samtoolsMpileup = new SamtoolsMpileup(this)
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
61

62
63
64
65
66
67
68
69
70
71
  override def beforeCmd: Unit = {
    if (sample == null && inputBam.exists()) {
      val inputSam = SamReaderFactory.makeDefault.open(inputBam)
      val readGroups = inputSam.getFileHeader.getReadGroups
      val samples = readGroups.map(readGroup => readGroup.getSample).distinct
      sample = samples.head
      inputSam.close
    }
  }

72
73
74
  override def commandLine = {
    (if (inputMpileup == null) {
      val samtoolsMpileup = new SamtoolsMpileup(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
75
      samtoolsMpileup.input = List(inputBam)
Peter van 't Hof's avatar
Peter van 't Hof committed
76
      samtoolsMpileup.cmdPipe + " | "
Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
79
80
    } else "") +
      super.commandLine +
      required("-o", output) +
      optional("--minDP", minDP) +
Peter van 't Hof's avatar
Peter van 't Hof committed
81
82
83
      optional("--minAP", minAP) +
      optional("--homoFraction", homoFraction) +
      optional("--ploidy", ploidy) +
Peter van 't Hof's avatar
Peter van 't Hof committed
84
      required("--sample", sample) +
85
86
      (if (inputBam == null) required("-I", inputMpileup) else "")
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
}

Peter van 't Hof's avatar
Peter van 't Hof committed
89
object MpileupToVcf extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
90
91
  case class Args(input: File = null, output: File = null, sample: String = null, minDP: Int = 8, minAP: Int = 2,
                  homoFraction: Double = 0.8, ploidy: Int = 2) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
92
93

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95
96
97
98
99
100
101
102
    opt[File]('I', "input") valueName ("<file>") action { (x, c) =>
      c.copy(input = x)
    } text ("input, default is stdin")
    opt[File]('o', "output") required () valueName ("<file>") action { (x, c) =>
      c.copy(output = x)
    } text ("out is a required file property")
    opt[String]('s', "sample") required () action { (x, c) =>
      c.copy(sample = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
103
    opt[Int]("minDP") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
      c.copy(minDP = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
106
    opt[Int]("minAP") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
107
108
      c.copy(minAP = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
109
    opt[Double]("homoFraction") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
      c.copy(homoFraction = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
112
    opt[Int]("ploidy") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
113
114
      c.copy(ploidy = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
115
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
116

Peter van 't Hof's avatar
Peter van 't Hof committed
117
118
119
120
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
121
122
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
123

Peter van 't Hof's avatar
Peter van 't Hof committed
124
    import scala.collection.mutable.Map
Peter van 't Hof's avatar
Peter van 't Hof committed
125
    if (commandArgs.input != null && !commandArgs.input.exists) throw new IllegalStateException("Input file does not exist")
Peter van 't Hof's avatar
Peter van 't Hof committed
126

Peter van 't Hof's avatar
Peter van 't Hof committed
127
    val writer = new PrintWriter(commandArgs.output)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
    writer.println("##fileformat=VCFv4.1")
Peter van 't Hof's avatar
Peter van 't Hof committed
129
    writer.println("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">")
130
    writer.println("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency, for each ALT allele, in the same order as listed\">")
Peter van 't Hof's avatar
Peter van 't Hof committed
131
    writer.println("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">")
132
133
    writer.println("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Total Allele Depth\">")
    writer.println("##FORMAT=<ID=FREQ,Number=A,Type=Float,Description=\"Allele Frequency\">")
Peter van 't Hof's avatar
Peter van 't Hof committed
134
135
    writer.println("##FORMAT=<ID=RFC,Number=1,Type=Integer,Description=\"Reference Forward Reads\">")
    writer.println("##FORMAT=<ID=RRC,Number=1,Type=Integer,Description=\"Reference Reverse Reads\">")
136
137
    writer.println("##FORMAT=<ID=AFC,Number=A,Type=Integer,Description=\"Alternetive Forward Reads\">")
    writer.println("##FORMAT=<ID=ARC,Number=A,Type=Integer,Description=\"Alternetive Reverse Reads\">")
Peter van 't Hof's avatar
Peter van 't Hof committed
138
    writer.println("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")
Peter van 't Hof's avatar
Peter van 't Hof committed
139
    writer.println("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + commandArgs.sample)
140
141
142
143
144
145
    val inputStream = if (commandArgs.input != null) {
      Source.fromFile(commandArgs.input).getLines
    } else {
      logger.info("No input file as argument, waiting on stdin")
      Source.stdin.getLines
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
146
147
148
    class Counts(var forward: Int, var reverse: Int)
    for (
      line <- inputStream;
Peter van 't Hof's avatar
Peter van 't Hof committed
149
      values = line.split("\t");
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151
      if values.size > 5
    ) {
Peter van 't Hof's avatar
Peter van 't Hof committed
152
153
      val chr = values(0)
      val pos = values(1)
154
155
156
157
158
      val ref = values(2) match {
        case "A" | "T" | "G" | "C" => values(2)
        case "a" | "t" | "g" | "c" => values(2).toUpperCase
        case _                     => "N"
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
159
160
161
      val reads = values(3).toInt
      val mpileup = values(4)
      val qual = values(5)
Peter van 't Hof's avatar
Peter van 't Hof committed
162
163
164
165

      val counts: Map[String, Counts] = Map(ref.toUpperCase -> new Counts(0, 0))

      def addCount(s: String) {
Peter van 't Hof's avatar
Peter van 't Hof committed
166
        val upper = s.toUpperCase
Peter van 't Hof's avatar
Peter van 't Hof committed
167
        if (!counts.contains(upper)) counts += upper -> new Counts(0, 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
168
169
170
        if (s(0).isLower) counts(upper).reverse += 1
        else counts(upper).forward += 1
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
171

Peter van 't Hof's avatar
Peter van 't Hof committed
172
      var t = 0
173
      var dels = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
174
      while (t < mpileup.size) {
Peter van 't Hof's avatar
Peter van 't Hof committed
175
176
        mpileup(t) match {
          case ',' => {
Peter van 't Hof's avatar
Peter van 't Hof committed
177
178
            addCount(ref.toLowerCase)
            t += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
179
180
          }
          case '.' => {
Peter van 't Hof's avatar
Peter van 't Hof committed
181
182
            addCount(ref.toUpperCase)
            t += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
183
184
185
          }
          case '^' => t += 2
          case '$' => t += 1
186
          case '*' => {
Peter van 't Hof's avatar
Peter van 't Hof committed
187
188
            dels += 1
            t += 1
189
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
190
          case '+' | '-' => {
Peter van 't Hof's avatar
Peter van 't Hof committed
191
192
193
194
195
            t += 1
            var size = ""
            var insert = ""
            while (mpileup(t).isDigit) {
              size += mpileup(t)
Peter van 't Hof's avatar
Peter van 't Hof committed
196
              t += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
197
198
199
            }
            for (c <- t until t + size.toInt) insert = insert + mpileup(c)
            t += size.toInt
Peter van 't Hof's avatar
Peter van 't Hof committed
200
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
201
202
203
          case 'a' | 'c' | 't' | 'g' | 'A' | 'C' | 'T' | 'G' => {
            addCount(mpileup(t).toString)
            t += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
204
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
205
          case _ => t += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
206
207
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
208

209
210
211
212
213
214
      val info: ArrayBuffer[String] = ArrayBuffer("DP=" + reads)
      val format: Map[String, String] = Map("DP" -> reads.toString)
      val alt: ArrayBuffer[String] = new ArrayBuffer
      format += ("RFC" -> counts(ref.toUpperCase).forward.toString)
      format += ("RRC" -> counts(ref.toUpperCase).reverse.toString)
      format += ("AD" -> (counts(ref.toUpperCase).forward + counts(ref.toUpperCase).reverse).toString)
Peter van 't Hof's avatar
Peter van 't Hof committed
215
      if (reads >= commandArgs.minDP) for ((key, value) <- counts if key != ref.toUpperCase if value.forward + value.reverse >= commandArgs.minAP) {
216
217
        alt += key
        format += ("AD" -> (format("AD") + "," + (value.forward + value.reverse).toString))
Peter van 't Hof's avatar
Peter van 't Hof committed
218
219
220
221
        format += ("AFC" -> ((if (format.contains("AFC")) format("AFC") + "," else "") + value.forward))
        format += ("ARC" -> ((if (format.contains("ARC")) format("ARC") + "," else "") + value.reverse))
        format += ("FREQ" -> ((if (format.contains("FREQ")) format("FREQ") + "," else "") +
          round((value.forward + value.reverse).toDouble / reads * 1E4).toDouble / 1E2))
222
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
223

224
      if (alt.size > 0) {
Peter van 't Hof's avatar
Peter van 't Hof committed
225
        val ad = for (ad <- format("AD").split(",")) yield ad.toInt
226
        var left = reads - dels
Peter van 't Hof's avatar
Peter van 't Hof committed
227
        val gt = ArrayBuffer[Int]()
Peter van 't Hof's avatar
Peter van 't Hof committed
228

Peter van 't Hof's avatar
Peter van 't Hof committed
229
        for (p <- 0 to alt.size if gt.size < commandArgs.ploidy) {
Peter van 't Hof's avatar
Peter van 't Hof committed
230
231
          var max = -1
          for (a <- 0 until ad.length if ad(a) > (if (max >= 0) ad(max) else -1) && !gt.exists(_ == a)) max = a
Peter van 't Hof's avatar
Peter van 't Hof committed
232
          val f = ad(max).toDouble / left
Peter van 't Hof's avatar
Peter van 't Hof committed
233
234
235
          for (a <- 0 to floor(f).toInt if gt.size < commandArgs.ploidy) gt.append(max)
          if (f - floor(f) >= commandArgs.homoFraction) {
            for (b <- p to commandArgs.ploidy if gt.size < commandArgs.ploidy) gt.append(max)
Peter van 't Hof's avatar
Peter van 't Hof committed
236
237
238
          }
          left -= ad(max)
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
239
240
241
        writer.println(Array(chr, pos, ".", ref.toUpperCase, alt.mkString(","), ".", ".", info.mkString(";"),
          "GT:" + format.keys.mkString(":"), gt.sortWith(_ < _).mkString("/") + ":" + format.values.mkString(":")
        ).mkString("\t"))
Peter van 't Hof's avatar
Peter van 't Hof committed
242
243
244
245
246
      }
    }
    writer.close
  }
}