VcfFilter.scala 15.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18
19

import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
20
21
import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader }
import htsjdk.variant.variantcontext.VariantContext
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import java.util.ArrayList
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
29
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
32
33
34
35

class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf", shortName = "I", required = true)
  var inputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
36

Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
  @Output(doc = "Output vcf", shortName = "o", required = false)
  var outputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
39

Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
42
43
  var minSampleDepth: Option[Int] = config("min_sample_depth")
  var minTotalDepth: Option[Int] = config("min_total_depth")
  var minAlternateDepth: Option[Int] = config("min_alternate_depth")
  var minSamplesPass: Option[Int] = config("min_samples_pass")
Peter van 't Hof's avatar
Peter van 't Hof committed
44
  var filterRefCalls: Boolean = config("filter_ref_calls", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  override val defaultCoreMemory = 1.0
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48
49

  override def commandLine = super.commandLine +
    required("-I", inputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
50
    required("-o", outputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
51
52
    optional("--minSampleDepth", minSampleDepth) +
    optional("--minTotalDepth", minTotalDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    optional("--minAlternateDepth", minAlternateDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
54
55
    optional("--minSamplesPass", minSamplesPass) +
    conditional(filterRefCalls, "--filterRefCalls")
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
}

Peter van 't Hof's avatar
Peter van 't Hof committed
58
object VcfFilter extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
63
64
  case class Trio(child: String, father: String, mother: String) {
    def this(arg: String) = {
      this(arg.split(":")(0), arg.split(":")(1), arg.split(":")(2))
    }
  }

65
66
  case class Args(inputVcf: File = null,
                  outputVcf: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
67
                  invertedOutputVcf: Option[File] = None,
68
                  minQualScore: Option[Double] = None,
69
70
71
                  minSampleDepth: Int = -1,
                  minTotalDepth: Int = -1,
                  minAlternateDepth: Int = -1,
Peter van 't Hof's avatar
Peter van 't Hof committed
72
                  minSamplesPass: Int = 1,
73
74
                  minBamAlternateDepth: Int = 0,
                  mustHaveVariant: List[String] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
75
                  calledIn: List[String] = Nil,
76
                  deNovoInSample: String = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
77
                  resToDom: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
78
                  trioCompound: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
79
80
                  deNovoTrio: List[Trio] = Nil,
                  trioLossOfHet: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
81
                  diffGenotype: List[(String, String)] = Nil,
82
                  filterHetVarToHomVar: List[(String, String)] = Nil,
83
                  filterRefCalls: Boolean = false,
84
85
                  filterNoCalls: Boolean = false,
                  iDset: Set[String] = Set()) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
86
87

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
88
89
    opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
90
    } text ("Input vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
91
92
    opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
93
    } text ("Output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95
96
    opt[File]("invertedOutputVcf") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(invertedOutputVcf = Some(x))
    } text ("inverted output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
97
    opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
98
      c.copy(minSampleDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
    } text ("Min value for DP in genotype fields")
    opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
101
      c.copy(minTotalDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
102
103
    } text ("Min value of DP field in INFO fields")
    opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
104
      c.copy(minAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
105
106
    } text ("Min value of AD field in genotype fields")
    opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
107
      c.copy(minSamplesPass = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
108
    } text ("Min number off samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth")
Peter van 't Hof's avatar
Peter van 't Hof committed
109
    opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
110
      c.copy(minBamAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
111
    } // TODO: Convert this to more generic filter
Peter van 't Hof's avatar
Peter van 't Hof committed
112
113
    opt[String]("resToDom") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(resToDom = new Trio(x) :: c.resToDom)
Peter van 't Hof's avatar
Peter van 't Hof committed
114
115
116
117
    } text ("Only shows variants where child is homozygous and both parants hetrozygous")
    opt[String]("trioCompound") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(trioCompound = new Trio(x) :: c.trioCompound)
    } text ("Only shows variants where child is a compound variant combined from both parants")
118
119
    opt[String]("deNovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) =>
      c.copy(deNovoInSample = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
120
    } text ("Only show variants that contain unique alleles in complete set for given sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
121
122
123
124
125
126
    opt[String]("deNovoTrio") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(deNovoTrio = new Trio(x) :: c.deNovoTrio)
    } text ("Only show variants that are denovo in the trio")
    opt[String]("trioLossOfHet") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(trioLossOfHet = new Trio(x) :: c.trioLossOfHet)
    } text ("Only show variants where a loss of hetrozygosity is detected")
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) =>
128
      c.copy(mustHaveVariant = x :: c.mustHaveVariant)
Peter van 't Hof's avatar
Peter van 't Hof committed
129
    } text ("Given sample must have 1 alternative allele")
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
    opt[String]("calledIn") unbounded () valueName ("<sample>") action { (x, c) =>
      c.copy(calledIn = x :: c.calledIn)
    } text ("Must be called in this sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
133
134
135
136
137
    opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) =>
      c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype)
    } validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample")
    } text ("Given samples must have a different genotype")
    opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) =>
138
      c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
139
    } validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
140
    } text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered")
Peter van 't Hof's avatar
Peter van 't Hof committed
141
142
    opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
      c.copy(filterRefCalls = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
    } text ("Filter when there are only ref calls")
144
145
146
    opt[Unit]("filterNoCalls") unbounded () action { (x, c) =>
      c.copy(filterNoCalls = true)
    } text ("Filter when there are only no calls")
147
148
    opt[Double]("minQualScore") unbounded () action { (x, c) =>
      c.copy(minQualScore = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
149
    } text ("Min qual score")
150
151
152
    opt[String]("id") unbounded () action { (x, c) =>
      c.copy(iDset = c.iDset + x)
    } text ("Id that may pass the filter")
153
    opt[File]("idFile") unbounded () action { (x, c) =>
154
155
      c.copy(iDset = c.iDset ++ Source.fromFile(x).getLines())
    } text ("File that contain list of IDs to get from vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
156
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
157

158
159
  var commandArgs: Args = _

Peter van 't Hof's avatar
Peter van 't Hof committed
160
161
162
163
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
164
    logger.info("Start")
Peter van 't Hof's avatar
Peter van 't Hof committed
165
    val argsParser = new OptParser
166
    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
167

Peter van 't Hof's avatar
Peter van 't Hof committed
168
    val reader = new VCFFileReader(commandArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
169
    val header = reader.getFileHeader
170
171
172
173
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputVcf).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Peter van 't Hof's avatar
Peter van 't Hof committed
174
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
175

Sander Bollen's avatar
Sander Bollen committed
176
177
178
179
180
181
    val invertedWriter = commandArgs.invertedOutputVcf.collect {
      case x => new VariantContextWriterBuilder().
        setOutputFile(x).
        setReferenceDictionary(header.getSequenceDictionary).
        build
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
182
    invertedWriter.foreach(_.writeHeader(header))
Peter van 't Hof's avatar
Peter van 't Hof committed
183

184
185
    var counterTotal = 0
    var counterLeft = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
186
    for (record <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
187
188
      if (minQualscore(record) &&
        filterRefCalls(record) &&
189
190
191
192
        filterNoCalls(record) &&
        minTotalDepth(record) &&
        minSampleDepth(record) &&
        minAlternateDepth(record) &&
193
194
        minBamAlternateDepth(record, header) &&
        mustHaveVariant(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
195
        called(record) &&
196
197
        notSameGenotype(record) &&
        filterHetVarToHomVar(record) &&
198
        denovoInSample(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
199
200
        denovoTrio(record, commandArgs.deNovoTrio) &&
        denovoTrio(record, commandArgs.trioLossOfHet, true) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
201
        resToDom(record, commandArgs.resToDom) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
202
        trioCompound(record, commandArgs.trioCompound) &&
203
        inIdSet(record)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
204
        writer.add(record)
205
        counterLeft += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
206
207
      } else
        invertedWriter.foreach(_.add(record))
208
209
      counterTotal += 1
      if (counterTotal % 100000 == 0) logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
210
    }
211
    logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
212
213
    reader.close
    writer.close
Peter van 't Hof's avatar
Peter van 't Hof committed
214
    invertedWriter.foreach(_.close())
215
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
216
  }
217

Peter van 't Hof's avatar
Peter van 't Hof committed
218
219
220
221
222
  def called(record: VariantContext): Boolean = {
    if (!commandArgs.calledIn.forall(record.getGenotype(_).isCalled)) false
    else true
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
223
  def minQualscore(record: VariantContext): Boolean = {
224
225
    if (commandArgs.minQualScore.isEmpty) return true
    record.getPhredScaledQual >= commandArgs.minQualScore.get
Peter van 't Hof's avatar
Peter van 't Hof committed
226
227
  }

228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
  def filterRefCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef)
    else true
  }

  def filterNoCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall)
    else true
  }

  def minTotalDepth(record: VariantContext): Boolean = {
    record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth
  }

  def minSampleDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val DP = if (genotype.hasDP) genotype.getDP else -1
      DP >= commandArgs.minSampleDepth
    }) >= commandArgs.minSamplesPass
  }

  def minAlternateDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
      if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true
    }) >= commandArgs.minSamplesPass
  }

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
  def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = {
    val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList

    val bamADvalues = (for (field <- bamADFields) yield {
      record.getAttribute(field, new ArrayList) match {
        case t: ArrayList[_] if t.length > 1 => {
          for (i <- 1 until t.size) yield {
            t(i) match {
              case a: Int    => a > commandArgs.minBamAlternateDepth
              case a: String => a.toInt > commandArgs.minBamAlternateDepth
              case _         => false
            }
          }
        }
        case _ => List(false)
      }
    }).flatten

    return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass
  }

  def mustHaveVariant(record: VariantContext): Boolean = {
    return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall)
  }

  def notSameGenotype(record: VariantContext): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
282
    for ((sample1, sample2) <- commandArgs.diffGenotype) {
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.sameGenotype(genotype2)) return false
    }
    return true
  }

  def filterHetVarToHomVar(record: VariantContext): Boolean = {
    for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) {
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
        for (allele <- genotype1.getAlleles if allele.isNonReference) {
          if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
        }
      }
    }
    return true
  }

  def denovoInSample(record: VariantContext): Boolean = {
304
305
    if (commandArgs.deNovoInSample == null) return true
    val genotype = record.getGenotype(commandArgs.deNovoInSample)
306
    for (allele <- genotype.getAlleles if allele.isNonReference) {
307
      for (g <- record.getGenotypes if g.getSampleName != commandArgs.deNovoInSample) {
308
309
310
311
312
        if (g.getAlleles.exists(_.basesMatch(allele))) return false
      }
    }
    return true
  }
313

Peter van 't Hof's avatar
Peter van 't Hof committed
314
315
316
317
318
319
320
321
322
323
324
325
  def resToDom(record: VariantContext, trios: List[Trio]): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)

      if (child.isHomVar && child.getAlleles.forall(allele => {
        record.getGenotype(trio.father).countAllele(allele) == 1 &&
          record.getGenotype(trio.mother).countAllele(allele) == 1
      })) return true
    }
    return trios.isEmpty
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
326
327
328
329
330
331
332
333
334
335
336
337
  def trioCompound(record: VariantContext, trios: List[Trio]): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)

      if (child.isHetNonRef && child.getAlleles.forall(allele => {
        record.getGenotype(trio.father).countAllele(allele) >= 1 &&
          record.getGenotype(trio.mother).countAllele(allele) >= 1
      })) return true
    }
    return trios.isEmpty
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
338
339
340
341
342
343
344
345
346
347
348
  def denovoTrio(record: VariantContext, trios: List[Trio], onlyLossHet: Boolean = false): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)
      val father = record.getGenotype(trio.father)
      val mother = record.getGenotype(trio.mother)

      for (allele <- child.getAlleles) {
        val childCount = child.countAllele(allele)
        val fatherCount = father.countAllele(allele)
        val motherCount = mother.countAllele(allele)

Peter van 't Hof's avatar
Peter van 't Hof committed
349
350
351
352
353
354
355
356
        if (onlyLossHet) {
          if (childCount == 2 && (
            (fatherCount == 2 && motherCount == 0) ||
            (fatherCount == 0 && motherCount == 2))) return true
        } else {
          if (childCount == 1 && fatherCount == 0 && motherCount == 0) return true
          else if (childCount == 2 && (fatherCount == 0 || motherCount == 0)) return true
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
357
358
359
360
361
      }
    }
    return trios.isEmpty
  }

362
363
364
365
  def inIdSet(record: VariantContext): Boolean = {
    if (commandArgs.iDset.isEmpty) true
    else record.getID.split(",").exists(commandArgs.iDset.contains(_))
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
366
}