VcfFilter.scala 13.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18
19

import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
20
21
import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader }
import htsjdk.variant.variantcontext.VariantContext
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import java.util.ArrayList
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._
29
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
30
31
32
33
34
35

class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf", shortName = "I", required = true)
  var inputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
36

Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
  @Output(doc = "Output vcf", shortName = "o", required = false)
  var outputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
39

Peter van 't Hof's avatar
Peter van 't Hof committed
40
41
42
43
  var minSampleDepth: Option[Int] = config("min_sample_depth")
  var minTotalDepth: Option[Int] = config("min_total_depth")
  var minAlternateDepth: Option[Int] = config("min_alternate_depth")
  var minSamplesPass: Option[Int] = config("min_samples_pass")
Peter van 't Hof's avatar
Peter van 't Hof committed
44
  var filterRefCalls: Boolean = config("filter_ref_calls", default = false)
Peter van 't Hof's avatar
Peter van 't Hof committed
45

Peter van 't Hof's avatar
Peter van 't Hof committed
46
  override val defaultCoreMemory = 1.0
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48
49

  override def commandLine = super.commandLine +
    required("-I", inputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
50
    required("-o", outputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
51
52
    optional("--minSampleDepth", minSampleDepth) +
    optional("--minTotalDepth", minTotalDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    optional("--minAlternateDepth", minAlternateDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
54
55
    optional("--minSamplesPass", minSamplesPass) +
    conditional(filterRefCalls, "--filterRefCalls")
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
}

Peter van 't Hof's avatar
Peter van 't Hof committed
58
object VcfFilter extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
63
64
  case class Trio(child: String, father: String, mother: String) {
    def this(arg: String) = {
      this(arg.split(":")(0), arg.split(":")(1), arg.split(":")(2))
    }
  }

65
66
  case class Args(inputVcf: File = null,
                  outputVcf: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
67
                  invertedOutputVcf: Option[File] = None,
68
                  minQualScore: Option[Double] = None,
69
70
71
72
73
74
                  minSampleDepth: Int = -1,
                  minTotalDepth: Int = -1,
                  minAlternateDepth: Int = -1,
                  minSamplesPass: Int = 0,
                  minBamAlternateDepth: Int = 0,
                  mustHaveVariant: List[String] = Nil,
75
                  deNovoInSample: String = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
76
77
                  deNovoTrio: List[Trio] = Nil,
                  trioLossOfHet: List[Trio] = Nil,
Peter van 't Hof's avatar
Peter van 't Hof committed
78
                  diffGenotype: List[(String, String)] = Nil,
79
                  filterHetVarToHomVar: List[(String, String)] = Nil,
80
                  filterRefCalls: Boolean = false,
81
82
                  filterNoCalls: Boolean = false,
                  iDset: Set[String] = Set()) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
85
86
    opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    } text ("Input vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
88
89
    opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
90
    } text ("Output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
91
92
93
    opt[File]("invertedOutputVcf") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(invertedOutputVcf = Some(x))
    } text ("inverted output vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
94
    opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
95
      c.copy(minSampleDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
96
97
    } text ("Min value for DP in genotype fields")
    opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
98
      c.copy(minTotalDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
    } text ("Min value of DP field in INFO fields")
    opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
101
      c.copy(minAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
102
103
    } text ("Min value of AD field in genotype fields")
    opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
104
      c.copy(minSamplesPass = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
105
    } text ("Min number off samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth")
Peter van 't Hof's avatar
Peter van 't Hof committed
106
    opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
107
      c.copy(minBamAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
108
    } // TODO: Convert this to more generic filter
109
110
    opt[String]("deNovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) =>
      c.copy(deNovoInSample = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
111
    } text ("Only show variants that contain unique alleles in complete set for given sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
112
113
114
115
116
117
    opt[String]("deNovoTrio") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(deNovoTrio = new Trio(x) :: c.deNovoTrio)
    } text ("Only show variants that are denovo in the trio")
    opt[String]("trioLossOfHet") unbounded () valueName ("<child:father:mother>") action { (x, c) =>
      c.copy(trioLossOfHet = new Trio(x) :: c.trioLossOfHet)
    } text ("Only show variants where a loss of hetrozygosity is detected")
Peter van 't Hof's avatar
Peter van 't Hof committed
118
    opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) =>
119
      c.copy(mustHaveVariant = x :: c.mustHaveVariant)
Peter van 't Hof's avatar
Peter van 't Hof committed
120
121
122
123
124
125
    } text ("Given sample must have 1 alternative allele")
    opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) =>
      c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype)
    } validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample")
    } text ("Given samples must have a different genotype")
    opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) =>
126
      c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    } validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample")
Peter van 't Hof's avatar
Peter van 't Hof committed
128
    } text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered")
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
    opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
      c.copy(filterRefCalls = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
131
    } text ("Filter when there are only ref calls")
132
133
134
    opt[Unit]("filterNoCalls") unbounded () action { (x, c) =>
      c.copy(filterNoCalls = true)
    } text ("Filter when there are only no calls")
135
136
    opt[Double]("minQualScore") unbounded () action { (x, c) =>
      c.copy(minQualScore = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
137
    } text ("Min qual score")
138
139
140
    opt[String]("id") unbounded () action { (x, c) =>
      c.copy(iDset = c.iDset + x)
    } text ("Id that may pass the filter")
141
    opt[File]("idFile") unbounded () action { (x, c) =>
142
143
      c.copy(iDset = c.iDset ++ Source.fromFile(x).getLines())
    } text ("File that contain list of IDs to get from vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
144
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
145

146
147
  var commandArgs: Args = _

Peter van 't Hof's avatar
Peter van 't Hof committed
148
149
150
151
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
152
    logger.info("Start")
Peter van 't Hof's avatar
Peter van 't Hof committed
153
    val argsParser = new OptParser
154
    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
155

Peter van 't Hof's avatar
Peter van 't Hof committed
156
    val reader = new VCFFileReader(commandArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
157
    val header = reader.getFileHeader
158
159
160
161
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputVcf).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Peter van 't Hof's avatar
Peter van 't Hof committed
162
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
163

Sander Bollen's avatar
Sander Bollen committed
164
165
166
167
168
169
    val invertedWriter = commandArgs.invertedOutputVcf.collect {
      case x => new VariantContextWriterBuilder().
        setOutputFile(x).
        setReferenceDictionary(header.getSequenceDictionary).
        build
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
170
    invertedWriter.foreach(_.writeHeader(header))
Peter van 't Hof's avatar
Peter van 't Hof committed
171

172
173
    var counterTotal = 0
    var counterLeft = 0
Peter van 't Hof's avatar
Peter van 't Hof committed
174
    for (record <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
175
176
      if (minQualscore(record) &&
        filterRefCalls(record) &&
177
178
179
180
        filterNoCalls(record) &&
        minTotalDepth(record) &&
        minSampleDepth(record) &&
        minAlternateDepth(record) &&
181
182
183
184
        minBamAlternateDepth(record, header) &&
        mustHaveVariant(record) &&
        notSameGenotype(record) &&
        filterHetVarToHomVar(record) &&
185
        denovoInSample(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
186
187
        denovoTrio(record, commandArgs.deNovoTrio) &&
        denovoTrio(record, commandArgs.trioLossOfHet, true) &&
188
        inIdSet(record)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
189
        writer.add(record)
190
        counterLeft += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
191
192
      } else
        invertedWriter.foreach(_.add(record))
193
194
      counterTotal += 1
      if (counterTotal % 100000 == 0) logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
195
    }
196
    logger.info(counterTotal + " variants processed, " + counterLeft + " left")
Peter van 't Hof's avatar
Peter van 't Hof committed
197
198
    reader.close
    writer.close
Peter van 't Hof's avatar
Peter van 't Hof committed
199
    invertedWriter.foreach(_.close())
200
    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
201
  }
202

Peter van 't Hof's avatar
Peter van 't Hof committed
203
  def minQualscore(record: VariantContext): Boolean = {
204
205
    if (commandArgs.minQualScore.isEmpty) return true
    record.getPhredScaledQual >= commandArgs.minQualScore.get
Peter van 't Hof's avatar
Peter van 't Hof committed
206
207
  }

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
  def filterRefCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef)
    else true
  }

  def filterNoCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall)
    else true
  }

  def minTotalDepth(record: VariantContext): Boolean = {
    record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth
  }

  def minSampleDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val DP = if (genotype.hasDP) genotype.getDP else -1
      DP >= commandArgs.minSampleDepth
    }) >= commandArgs.minSamplesPass
  }

  def minAlternateDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
      if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true
    }) >= commandArgs.minSamplesPass
  }

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
  def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = {
    val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList

    val bamADvalues = (for (field <- bamADFields) yield {
      record.getAttribute(field, new ArrayList) match {
        case t: ArrayList[_] if t.length > 1 => {
          for (i <- 1 until t.size) yield {
            t(i) match {
              case a: Int    => a > commandArgs.minBamAlternateDepth
              case a: String => a.toInt > commandArgs.minBamAlternateDepth
              case _         => false
            }
          }
        }
        case _ => List(false)
      }
    }).flatten

    return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass
  }

  def mustHaveVariant(record: VariantContext): Boolean = {
    return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall)
  }

  def notSameGenotype(record: VariantContext): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
262
    for ((sample1, sample2) <- commandArgs.diffGenotype) {
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.sameGenotype(genotype2)) return false
    }
    return true
  }

  def filterHetVarToHomVar(record: VariantContext): Boolean = {
    for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) {
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
        for (allele <- genotype1.getAlleles if allele.isNonReference) {
          if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
        }
      }
    }
    return true
  }

  def denovoInSample(record: VariantContext): Boolean = {
284
285
    if (commandArgs.deNovoInSample == null) return true
    val genotype = record.getGenotype(commandArgs.deNovoInSample)
286
    for (allele <- genotype.getAlleles if allele.isNonReference) {
287
      for (g <- record.getGenotypes if g.getSampleName != commandArgs.deNovoInSample) {
288
289
290
291
292
        if (g.getAlleles.exists(_.basesMatch(allele))) return false
      }
    }
    return true
  }
293

Peter van 't Hof's avatar
Peter van 't Hof committed
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
  def denovoTrio(record: VariantContext, trios: List[Trio], onlyLossHet: Boolean = false): Boolean = {
    for (trio <- trios) {
      val child = record.getGenotype(trio.child)
      val father = record.getGenotype(trio.father)
      val mother = record.getGenotype(trio.mother)

      for (allele <- child.getAlleles) {
        val childCount = child.countAllele(allele)
        val fatherCount = father.countAllele(allele)
        val motherCount = mother.countAllele(allele)

        if (!onlyLossHet && childCount == 1 && fatherCount == 0 && motherCount == 0) return true
        else if (childCount == 2 && (fatherCount == 0 || motherCount == 0)) return true
      }
    }
    return trios.isEmpty
  }

312
313
314
315
  def inIdSet(record: VariantContext): Boolean = {
    if (commandArgs.iDset.isEmpty) true
    else record.getID.split(",").exists(commandArgs.iDset.contains(_))
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
316
}