VcfFilter.scala 10.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17
18
19

import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
20
21
import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader }
import htsjdk.variant.variantcontext.VariantContext
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import java.io.File
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import java.util.ArrayList
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
29
30
31
32
33
34
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._

class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf", shortName = "I", required = true)
  var inputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
35

Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
  @Output(doc = "Output vcf", shortName = "o", required = false)
  var outputVcf: File = _
Peter van 't Hof's avatar
Peter van 't Hof committed
38

Peter van 't Hof's avatar
Peter van 't Hof committed
39
40
41
42
  var minSampleDepth: Option[Int] = _
  var minTotalDepth: Option[Int] = _
  var minAlternateDepth: Option[Int] = _
  var minSamplesPass: Option[Int] = _
Peter van 't Hof's avatar
Peter van 't Hof committed
43
  var filterRefCalls: Boolean = _
Peter van 't Hof's avatar
Peter van 't Hof committed
44

Peter van 't Hof's avatar
Peter van 't Hof committed
45
46
  override val defaultVmem = "8G"
  memoryLimit = Option(4.0)
Peter van 't Hof's avatar
Peter van 't Hof committed
47

Peter van 't Hof's avatar
Peter van 't Hof committed
48
49
50
51
52
  override def afterGraph {
    minSampleDepth = config("min_sample_depth")
    minTotalDepth = config("min_total_depth")
    minAlternateDepth = config("min_alternate_depth")
    minSamplesPass = config("min_samples_pass")
Peter van 't Hof's avatar
Peter van 't Hof committed
53
    filterRefCalls = config("filter_ref_calls")
Peter van 't Hof's avatar
Peter van 't Hof committed
54
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
57

  override def commandLine = super.commandLine +
    required("-I", inputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
58
    required("-o", outputVcf) +
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
    optional("--minSampleDepth", minSampleDepth) +
    optional("--minTotalDepth", minTotalDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
61
    optional("--minAlternateDepth", minAlternateDepth) +
Peter van 't Hof's avatar
Peter van 't Hof committed
62
63
    optional("--minSamplesPass", minSamplesPass) +
    conditional(filterRefCalls, "--filterRefCalls")
Peter van 't Hof's avatar
Peter van 't Hof committed
64
65
}

Peter van 't Hof's avatar
Peter van 't Hof committed
66
object VcfFilter extends ToolCommand {
67
68
  case class Args(inputVcf: File = null,
                  outputVcf: File = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
69
                  minQualscore: Option[Double] = None,
70
71
72
73
74
75
76
                  minSampleDepth: Int = -1,
                  minTotalDepth: Int = -1,
                  minAlternateDepth: Int = -1,
                  minSamplesPass: Int = 0,
                  minBamAlternateDepth: Int = 0,
                  mustHaveVariant: List[String] = Nil,
                  denovoInSample: String = null,
Peter van 't Hof's avatar
Peter van 't Hof committed
77
                  diffGenotype: List[(String, String)] = Nil,
78
                  filterHetVarToHomVar: List[(String, String)] = Nil,
79
80
                  filterRefCalls: Boolean = false,
                  filterNoCalls: Boolean = false) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
81
82

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
    opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
85
    } text ("Input vcf file")
Peter van 't Hof's avatar
Peter van 't Hof committed
86
87
    opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputVcf = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
88
89
    } text ("Output vcf file")
    opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
90
      c.copy(minSampleDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
91
92
    } text ("Min value for DP in genotype fields")
    opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
93
      c.copy(minTotalDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
94
95
    } text ("Min value of DP field in INFO fields")
    opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
96
      c.copy(minAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
97
98
    } text ("Min value of AD field in genotype fields")
    opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
99
      c.copy(minSamplesPass = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
100
101
    } text ("Min number opf samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth")
    opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
102
      c.copy(minBamAlternateDepth = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
    } // TODO: Convert this to more generic filter
    opt[String]("denovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) =>
105
      c.copy(denovoInSample = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
106
107
    } text ("Only show variants that contain unique alleles in compete set for given sample")
    opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) =>
108
      c.copy(mustHaveVariant = x :: c.mustHaveVariant)
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
111
112
113
114
    } text ("Given sample must have 1 alternative allele")
    opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) =>
      c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype)
    } validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample")
    } text ("Given samples must have a different genotype")
    opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) =>
115
      c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar)
Peter van 't Hof's avatar
Peter van 't Hof committed
116
117
    } validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample")
    } text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filterd")
Peter van 't Hof's avatar
Peter van 't Hof committed
118
119
    opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
      c.copy(filterRefCalls = true)
Peter van 't Hof's avatar
Peter van 't Hof committed
120
    } text ("Filter when there are only ref calls")
121
122
123
    opt[Unit]("filterNoCalls") unbounded () action { (x, c) =>
      c.copy(filterNoCalls = true)
    } text ("Filter when there are only no calls")
Peter van 't Hof's avatar
Peter van 't Hof committed
124
125
126
    opt[Double]("minQualscore") unbounded () action { (x, c) =>
      c.copy(minQualscore = Some(x))
    } text ("Min qual score")
Peter van 't Hof's avatar
Peter van 't Hof committed
127
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
128

129
130
  var commandArgs: Args = _

Peter van 't Hof's avatar
Peter van 't Hof committed
131
132
133
134
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
135
    val argsParser = new OptParser
136
    commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
137

Peter van 't Hof's avatar
Peter van 't Hof committed
138
    val reader = new VCFFileReader(commandArgs.inputVcf, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
139
    val header = reader.getFileHeader
Peter van 't Hof's avatar
Peter van 't Hof committed
140
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputVcf).build)
Peter van 't Hof's avatar
Peter van 't Hof committed
141
    writer.writeHeader(header)
Peter van 't Hof's avatar
Peter van 't Hof committed
142

Peter van 't Hof's avatar
Peter van 't Hof committed
143
    for (record <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
      if (minQualscore(record) &&
        filterRefCalls(record) &&
146
147
148
149
        filterNoCalls(record) &&
        minTotalDepth(record) &&
        minSampleDepth(record) &&
        minAlternateDepth(record) &&
150
151
152
153
        minBamAlternateDepth(record, header) &&
        mustHaveVariant(record) &&
        notSameGenotype(record) &&
        filterHetVarToHomVar(record) &&
Peter van 't Hof's avatar
Peter van 't Hof committed
154
        denovoInSample(record)) {
Peter van 't Hof's avatar
Peter van 't Hof committed
155
        writer.add(record)
Peter van 't Hof's avatar
Peter van 't Hof committed
156
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
157
158
159
160
    }
    reader.close
    writer.close
  }
161

Peter van 't Hof's avatar
Peter van 't Hof committed
162
163
164
165
166
  def minQualscore(record: VariantContext): Boolean = {
    if (commandArgs.minQualscore.isEmpty) return true
    record.getPhredScaledQual >= commandArgs.minQualscore.get
  }

167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
  def filterRefCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef)
    else true
  }

  def filterNoCalls(record: VariantContext): Boolean = {
    if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall)
    else true
  }

  def minTotalDepth(record: VariantContext): Boolean = {
    record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth
  }

  def minSampleDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val DP = if (genotype.hasDP) genotype.getDP else -1
      DP >= commandArgs.minSampleDepth
    }) >= commandArgs.minSamplesPass
  }

  def minAlternateDepth(record: VariantContext): Boolean = {
    record.getGenotypes.count(genotype => {
      val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
      if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true
    }) >= commandArgs.minSamplesPass
  }

195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
  def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = {
    val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList

    val bamADvalues = (for (field <- bamADFields) yield {
      record.getAttribute(field, new ArrayList) match {
        case t: ArrayList[_] if t.length > 1 => {
          for (i <- 1 until t.size) yield {
            t(i) match {
              case a: Int    => a > commandArgs.minBamAlternateDepth
              case a: String => a.toInt > commandArgs.minBamAlternateDepth
              case _         => false
            }
          }
        }
        case _ => List(false)
      }
    }).flatten

    return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass
  }

  def mustHaveVariant(record: VariantContext): Boolean = {
    return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall)
  }

  def notSameGenotype(record: VariantContext): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
221
    for ((sample1, sample2) <- commandArgs.diffGenotype) {
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.sameGenotype(genotype2)) return false
    }
    return true
  }

  def filterHetVarToHomVar(record: VariantContext): Boolean = {
    for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) {
      val genotype1 = record.getGenotype(sample1)
      val genotype2 = record.getGenotype(sample2)
      if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
        for (allele <- genotype1.getAlleles if allele.isNonReference) {
          if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
        }
      }
    }
    return true
  }

  def denovoInSample(record: VariantContext): Boolean = {
    if (commandArgs.denovoInSample == null) return true
    val genotype = record.getGenotype(commandArgs.denovoInSample)
    for (allele <- genotype.getAlleles if allele.isNonReference) {
      for (g <- record.getGenotypes if g.getSampleName != commandArgs.denovoInSample) {
        if (g.getAlleles.exists(_.basesMatch(allele))) return false
      }
    }
    return true
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
252
}