VcfFilter.scala 14.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18
19

import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
20
21
import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader }
import htsjdk.variant.variantcontext.VariantContext
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import java.util.ArrayList
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
29
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
32
33
34
35

class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf", shortName = "I", required = true)
  var inputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
36

Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
  @Output(doc = "Output vcf", shortName = "o", required = false)
  var outputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
39

Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
42
43
  var minSampleDepth: Option[Int] = config("min_sample_depth")
  var minTotalDepth: Option[Int] = config("min_total_depth")
  var minAlternateDepth: Option[Int] = config("min_alternate_depth")
  var minSamplesPass: Option[Int] = config("min_samples_pass")
Peter van 't Hof's avatar
Peter van 't Hof committed
44
  var filterRefCalls: Boolean = config("filter_ref_calls", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  override val defaultCoreMemory = 1.0
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48
49

  override def commandLine = super.commandLine +
    required("-I", inputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
50
    required("-o", outputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
51
52
    optional("--minSampleDepth", minSampleDepth) +
    optional("--minTotalDepth", minTotalDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    optional("--minAlternateDepth", minAlternateDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
54
55
    optional("--minSamplesPass", minSamplesPass) +
    conditional(filterRefCalls, "--filterRefCalls")
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
}

Peter van 't Hof's avatar
Peter van 't Hof committed
58
object VcfFilter extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
63
64
  case class Trio(child: String, father: String, mother: String) {
    def this(arg: String) = {
      this(arg.split(":")(0), arg.split(":")(1), arg.split(":")(2))
    }
  }

65
66
  case class Args(inputVcf: File = null,
                  outputVcf: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
67
                  invertedOutputVcf: Option[File] = None,
68
                  minQualScore: Option[Double] = None,
69
70
71
                  minSampleDepth: Int = -1,
                  minTotalDepth: Int = -1,
                  minAlternateDepth: Int = -1,
Peter van 't Hof's avatar
Peter van 't Hof committed
72
                  minSamplesPass: Int = 1,
73
74
                  minBamAlternateDepth: Int = 0,
                  mustHaveVariant: List[String] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
75
                  calledIn: List[String] = Nil,
76
                  deNovoInSample: String = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
77
                  resToDom: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
78
79
                  deNovoTrio: List[Trio] = Nil,
                  trioLossOfHet: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
80
                  diffGenotype: List[(String, String)] = Nil,
81
                  filterHetVarToHomVar: List[(String, String)] = Nil,
82
                  filterRefCalls: Boolean = false,
83
84
                  filterNoCalls: Boolean = false,
                  iDset: Set[String] = Set()) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
    opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
89
    } text ("Input vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
90
91
    opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
92
    } text ("Output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
95
    opt[File]("invertedOutputVcf") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(invertedOutputVcf = Some(x))
    } text ("inverted output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
96
    opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
97
      c.copy(minSampleDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
98
99
    } text ("Min value for DP in genotype fields")
    opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
100
      c.copy(minTotalDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
101
102
    } text ("Min value of DP field in INFO fields")
    opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
103
      c.copy(minAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
104
105
    } text ("Min value of AD field in genotype fields")
    opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
106
      c.copy(minSamplesPass = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
107
    } text ("Min number off samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth")
Peter van 't Hof's avatar
Peter van 't Hof committed
108
    opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
109
      c.copy(minBamAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
110
    } // TODO: Convert this to more generic filter
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
113
    opt[String]("resToDom") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(resToDom = new Trio(x) :: c.resToDom)
    } text ("Only show variants that contain unique alleles in complete set for given sample")
114
115
    opt[String]("deNovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) =>
      c.copy(deNovoInSample = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
116
    } text ("Only show variants that contain unique alleles in complete set for given sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
117
118
119
120
121
122
    opt[String]("deNovoTrio") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(deNovoTrio = new Trio(x) :: c.deNovoTrio)
    } text ("Only show variants that are denovo in the trio")
    opt[String]("trioLossOfHet") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(trioLossOfHet = new Trio(x) :: c.trioLossOfHet)
    } text ("Only show variants where a loss of hetrozygosity is detected")
Peter van 't Hof's avatar
Peter van 't Hof committed
123
    opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) =>
124
      c.copy(mustHaveVariant = x :: c.mustHaveVariant)
Peter van 't Hof's avatar
Peter van 't Hof committed
125
    } text ("Given sample must have 1 alternative allele")
Peter van 't Hof's avatar
Peter van 't Hof committed
126
127
128
    opt[String]("calledIn") unbounded () valueName ("<sample>") action { (x, c) =>
      c.copy(calledIn = x :: c.calledIn)
    } text ("Must be called in this sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
131
132
133
    opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) =>
      c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype)
    } validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample")
    } text ("Given samples must have a different genotype")
    opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) =>
134
      c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
135
    } validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
136
    } text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered")
Peter van 't Hof's avatar
Peter van 't Hof committed
137
138
    opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
      c.copy(filterRefCalls = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
139
    } text ("Filter when there are only ref calls")
140
141
142
    opt[Unit]("filterNoCalls") unbounded () action { (x, c) =>
      c.copy(filterNoCalls = true)
    } text ("Filter when there are only no calls")
143
144
    opt[Double]("minQualScore") unbounded () action { (x, c) =>
      c.copy(minQualScore = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
145
    } text ("Min qual score")
146
147
148
    opt[String]("id") unbounded () action { (x, c) =>
      c.copy(iDset = c.iDset + x)
    } text ("Id that may pass the filter")
149
    opt[File]("idFile") unbounded () action { (x, c) =>
150
151
      c.copy(iDset = c.iDset ++ Source.fromFile(x).getLines())
    } text ("File that contain list of IDs to get from vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
152
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
153

154
155
  var commandArgs: Args = _

Peter van 't Hof's avatar
Peter van 't Hof committed
156
157
158
159
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
160
    logger.info("Start")
Peter van 't Hof's avatar
Peter van 't Hof committed
161
    val argsParser = new OptParser
162
    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
163

Peter van 't Hof's avatar
Peter van 't Hof committed
164
    val reader = new VCFFileReader(commandArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
165
    val header = reader.getFileHeader
166
167
168
169
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputVcf).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Peter van 't Hof's avatar
Peter van 't Hof committed
170
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
171

Sander Bollen's avatar
Sander Bollen committed
172
173
174
175
176
177
    val invertedWriter = commandArgs.invertedOutputVcf.collect {
      case x => new VariantContextWriterBuilder().
        setOutputFile(x).
        setReferenceDictionary(header.getSequenceDictionary).
        build
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
178
    invertedWriter.foreach(_.writeHeader(header))
Peter van 't Hof's avatar
Peter van 't Hof committed
179

180
181
    var counterTotal = 0
    var counterLeft = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
182
    for (record <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
183
184
      if (minQualscore(record) &&
        filterRefCalls(record) &&
185
186
187
188
        filterNoCalls(record) &&
        minTotalDepth(record) &&
        minSampleDepth(record) &&
        minAlternateDepth(record) &&
189
190
        minBamAlternateDepth(record, header) &&
        mustHaveVariant(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
191
        called(record) &&
192
193
        notSameGenotype(record) &&
        filterHetVarToHomVar(record) &&
194
        denovoInSample(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
195
196
        denovoTrio(record, commandArgs.deNovoTrio) &&
        denovoTrio(record, commandArgs.trioLossOfHet, true) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
197
        resToDom(record, commandArgs.resToDom) &&
198
        inIdSet(record)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
199
        writer.add(record)
200
        counterLeft += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
201
202
      } else
        invertedWriter.foreach(_.add(record))
203
204
      counterTotal += 1
      if (counterTotal % 100000 == 0) logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
205
    }
206
    logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
207
208
    reader.close
    writer.close
Peter van 't Hof's avatar
Peter van 't Hof committed
209
    invertedWriter.foreach(_.close())
210
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
211
  }
212

Peter van 't Hof's avatar
Peter van 't Hof committed
213
214
215
216
217
  def called(record: VariantContext): Boolean = {
    if (!commandArgs.calledIn.forall(record.getGenotype(_).isCalled)) false
    else true
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
218
  def minQualscore(record: VariantContext): Boolean = {
219
220
    if (commandArgs.minQualScore.isEmpty) return true
    record.getPhredScaledQual >= commandArgs.minQualScore.get
Peter van 't Hof's avatar
Peter van 't Hof committed
221
222
  }

223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
  def filterRefCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef)
    else true
  }

  def filterNoCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall)
    else true
  }

  def minTotalDepth(record: VariantContext): Boolean = {
    record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth
  }

  def minSampleDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val DP = if (genotype.hasDP) genotype.getDP else -1
      DP >= commandArgs.minSampleDepth
    }) >= commandArgs.minSamplesPass
  }

  def minAlternateDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
      if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true
    }) >= commandArgs.minSamplesPass
  }

251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
  def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = {
    val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList

    val bamADvalues = (for (field <- bamADFields) yield {
      record.getAttribute(field, new ArrayList) match {
        case t: ArrayList[_] if t.length > 1 => {
          for (i <- 1 until t.size) yield {
            t(i) match {
              case a: Int    => a > commandArgs.minBamAlternateDepth
              case a: String => a.toInt > commandArgs.minBamAlternateDepth
              case _         => false
            }
          }
        }
        case _ => List(false)
      }
    }).flatten

    return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass
  }

  def mustHaveVariant(record: VariantContext): Boolean = {
    return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall)
  }

  def notSameGenotype(record: VariantContext): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
277
    for ((sample1, sample2) <- commandArgs.diffGenotype) {
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.sameGenotype(genotype2)) return false
    }
    return true
  }

  def filterHetVarToHomVar(record: VariantContext): Boolean = {
    for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) {
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
        for (allele <- genotype1.getAlleles if allele.isNonReference) {
          if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
        }
      }
    }
    return true
  }

  def denovoInSample(record: VariantContext): Boolean = {
299
300
    if (commandArgs.deNovoInSample == null) return true
    val genotype = record.getGenotype(commandArgs.deNovoInSample)
301
    for (allele <- genotype.getAlleles if allele.isNonReference) {
302
      for (g <- record.getGenotypes if g.getSampleName != commandArgs.deNovoInSample) {
303
304
305
306
307
        if (g.getAlleles.exists(_.basesMatch(allele))) return false
      }
    }
    return true
  }
308

Peter van 't Hof's avatar
Peter van 't Hof committed
309
310
311
312
313
314
315
316
317
318
319
320
  def resToDom(record: VariantContext, trios: List[Trio]): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)

      if (child.isHomVar && child.getAlleles.forall(allele => {
        record.getGenotype(trio.father).countAllele(allele) == 1 &&
          record.getGenotype(trio.mother).countAllele(allele) == 1
      })) return true
    }
    return trios.isEmpty
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
321
322
323
324
325
326
327
328
329
330
331
  def denovoTrio(record: VariantContext, trios: List[Trio], onlyLossHet: Boolean = false): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)
      val father = record.getGenotype(trio.father)
      val mother = record.getGenotype(trio.mother)

      for (allele <- child.getAlleles) {
        val childCount = child.countAllele(allele)
        val fatherCount = father.countAllele(allele)
        val motherCount = mother.countAllele(allele)

Peter van 't Hof's avatar
Peter van 't Hof committed
332
333
334
335
336
337
338
339
        if (onlyLossHet) {
          if (childCount == 2 && (
            (fatherCount == 2 && motherCount == 0) ||
            (fatherCount == 0 && motherCount == 2))) return true
        } else {
          if (childCount == 1 && fatherCount == 0 && motherCount == 0) return true
          else if (childCount == 2 && (fatherCount == 0 || motherCount == 0)) return true
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
340
341
342
343
344
      }
    }
    return trios.isEmpty
  }

345
346
347
348
  def inIdSet(record: VariantContext): Boolean = {
    if (commandArgs.iDset.isEmpty) true
    else record.getID.split(",").exists(commandArgs.iDset.contains(_))
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
349
}