Commit 74c523c5 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'feature-vcffilter_trio_filters' into 'develop'

Feature vcffilter trio filters

Added some more filter options

See merge request !162
parents 2b989a5a 209ef798
......@@ -17,10 +17,9 @@ package nl.lumc.sasc.biopet.tools
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader }
import htsjdk.variant.vcf.VCFFileReader
import htsjdk.variant.variantcontext.VariantContext
import java.io.File
import java.util.ArrayList
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
......@@ -56,6 +55,13 @@ class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
}
object VcfFilter extends ToolCommand {
/** Container class for a trio */
protected case class Trio(child: String, father: String, mother: String) {
def this(arg: String) = {
this(arg.split(":")(0), arg.split(":")(1), arg.split(":")(2))
}
}
case class Args(inputVcf: File = null,
outputVcf: File = null,
invertedOutputVcf: Option[File] = None,
......@@ -63,10 +69,14 @@ object VcfFilter extends ToolCommand {
minSampleDepth: Int = -1,
minTotalDepth: Int = -1,
minAlternateDepth: Int = -1,
minSamplesPass: Int = 0,
minBamAlternateDepth: Int = 0,
minSamplesPass: Int = 1,
mustHaveVariant: List[String] = Nil,
calledIn: List[String] = Nil,
deNovoInSample: String = null,
resToDom: List[Trio] = Nil,
trioCompound: List[Trio] = Nil,
deNovoTrio: List[Trio] = Nil,
trioLossOfHet: List[Trio] = Nil,
diffGenotype: List[(String, String)] = Nil,
filterHetVarToHomVar: List[(String, String)] = Nil,
filterRefCalls: Boolean = false,
......@@ -74,80 +84,88 @@ object VcfFilter extends ToolCommand {
iDset: Set[String] = Set()) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
opt[File]('I', "inputVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputVcf = x)
} text ("Input vcf file")
opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
} text "Input vcf file"
opt[File]('o', "outputVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputVcf = x)
} text ("Output vcf file")
opt[File]("invertedOutputVcf") maxOccurs (1) valueName ("<file>") action { (x, c) =>
} text "Output vcf file"
opt[File]("invertedOutputVcf") maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(invertedOutputVcf = Some(x))
} text ("inverted output vcf file")
opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) =>
} text "inverted output vcf file"
opt[Int]("minSampleDepth") unbounded () valueName "<int>" action { (x, c) =>
c.copy(minSampleDepth = x)
} text ("Min value for DP in genotype fields")
opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) =>
} text "Min value for DP in genotype fields"
opt[Int]("minTotalDepth") unbounded () valueName "<int>" action { (x, c) =>
c.copy(minTotalDepth = x)
} text ("Min value of DP field in INFO fields")
opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
} text "Min value of DP field in INFO fields"
opt[Int]("minAlternateDepth") unbounded () valueName "<int>" action { (x, c) =>
c.copy(minAlternateDepth = x)
} text ("Min value of AD field in genotype fields")
opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) =>
} text "Min value of AD field in genotype fields"
opt[Int]("minSamplesPass") unbounded () valueName "<int>" action { (x, c) =>
c.copy(minSamplesPass = x)
} text ("Min number off samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth")
opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) =>
c.copy(minBamAlternateDepth = x)
} // TODO: Convert this to more generic filter
opt[String]("deNovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) =>
} text "Min number off samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth"
opt[String]("resToDom") unbounded () valueName "<child:father:mother>" action { (x, c) =>
c.copy(resToDom = new Trio(x) :: c.resToDom)
} text "Only shows variants where child is homozygous and both parants hetrozygous"
opt[String]("trioCompound") unbounded () valueName "<child:father:mother>" action { (x, c) =>
c.copy(trioCompound = new Trio(x) :: c.trioCompound)
} text "Only shows variants where child is a compound variant combined from both parants"
opt[String]("deNovoInSample") maxOccurs 1 unbounded () valueName "<sample>" action { (x, c) =>
c.copy(deNovoInSample = x)
} text ("Only show variants that contain unique alleles in complete set for given sample")
opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) =>
} text "Only show variants that contain unique alleles in complete set for given sample"
opt[String]("deNovoTrio") unbounded () valueName "<child:father:mother>" action { (x, c) =>
c.copy(deNovoTrio = new Trio(x) :: c.deNovoTrio)
} text "Only show variants that are denovo in the trio"
opt[String]("trioLossOfHet") unbounded () valueName "<child:father:mother>" action { (x, c) =>
c.copy(trioLossOfHet = new Trio(x) :: c.trioLossOfHet)
} text "Only show variants where a loss of hetrozygosity is detected"
opt[String]("mustHaveVariant") unbounded () valueName "<sample>" action { (x, c) =>
c.copy(mustHaveVariant = x :: c.mustHaveVariant)
} text ("Given sample must have 1 alternative allele")
opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) =>
} text "Given sample must have 1 alternative allele"
opt[String]("calledIn") unbounded () valueName "<sample>" action { (x, c) =>
c.copy(calledIn = x :: c.calledIn)
} text "Must be called in this sample"
opt[String]("diffGenotype") unbounded () valueName "<sample:sample>" action { (x, c) =>
c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype)
} validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample")
} text ("Given samples must have a different genotype")
opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) =>
} text "Given samples must have a different genotype"
opt[String]("filterHetVarToHomVar") unbounded () valueName "<sample:sample>" action { (x, c) =>
c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar)
} validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample")
} text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered")
} text "If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered"
opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
c.copy(filterRefCalls = true)
} text ("Filter when there are only ref calls")
} text "Filter when there are only ref calls"
opt[Unit]("filterNoCalls") unbounded () action { (x, c) =>
c.copy(filterNoCalls = true)
} text ("Filter when there are only no calls")
} text "Filter when there are only no calls"
opt[Double]("minQualScore") unbounded () action { (x, c) =>
c.copy(minQualScore = Some(x))
} text ("Min qual score")
} text "Min qual score"
opt[String]("id") unbounded () action { (x, c) =>
c.copy(iDset = c.iDset + x)
} text ("Id that may pass the filter")
} text "Id that may pass the filter"
opt[File]("idFile") unbounded () action { (x, c) =>
c.copy(iDset = c.iDset ++ Source.fromFile(x).getLines())
} text ("File that contain list of IDs to get from vcf file")
} text "File that contain list of IDs to get from vcf file"
}
var commandArgs: Args = _
/**
* @param args the command line arguments
*/
/** @param args the command line arguments */
def main(args: Array[String]): Unit = {
logger.info("Start")
val argsParser = new OptParser
commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val reader = new VCFFileReader(commandArgs.inputVcf, false)
val reader = new VCFFileReader(cmdArgs.inputVcf, false)
val header = reader.getFileHeader
val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
setOutputFile(commandArgs.outputVcf).
setOutputFile(cmdArgs.outputVcf).
setReferenceDictionary(header.getSequenceDictionary).
build)
writer.writeHeader(header)
val invertedWriter = commandArgs.invertedOutputVcf.collect {
val invertedWriter = cmdArgs.invertedOutputVcf.collect {
case x => new VariantContextWriterBuilder().
setOutputFile(x).
setReferenceDictionary(header.getSequenceDictionary).
......@@ -158,18 +176,25 @@ object VcfFilter extends ToolCommand {
var counterTotal = 0
var counterLeft = 0
for (record <- reader) {
if (minQualscore(record) &&
filterRefCalls(record) &&
filterNoCalls(record) &&
minTotalDepth(record) &&
minSampleDepth(record) &&
minAlternateDepth(record) &&
minBamAlternateDepth(record, header) &&
mustHaveVariant(record) &&
notSameGenotype(record) &&
filterHetVarToHomVar(record) &&
denovoInSample(record) &&
inIdSet(record)) {
if (cmdArgs.minQualScore.map(minQualscore(record, _)).getOrElse(true) &&
(!cmdArgs.filterRefCalls || hasNonRefCalls(record)) &&
(!cmdArgs.filterNoCalls || hasCalls(record)) &&
hasMinTotalDepth(record, cmdArgs.minTotalDepth) &&
hasMinSampleDepth(record, cmdArgs.minSampleDepth, cmdArgs.minSamplesPass) &&
minAlternateDepth(record, cmdArgs.minAlternateDepth, cmdArgs.minSamplesPass) &&
(cmdArgs.mustHaveVariant.isEmpty || mustHaveVariant(record, cmdArgs.mustHaveVariant)) &&
calledIn(record, cmdArgs.calledIn) &&
(cmdArgs.diffGenotype.isEmpty || cmdArgs.diffGenotype.forall(x => notSameGenotype(record, x._1, x._2))) &&
(
cmdArgs.filterHetVarToHomVar.isEmpty ||
cmdArgs.filterHetVarToHomVar.forall(x => filterHetVarToHomVar(record, x._1, x._2))
) &&
denovoInSample(record, cmdArgs.deNovoInSample) &&
denovoTrio(record, cmdArgs.deNovoTrio) &&
denovoTrio(record, cmdArgs.trioLossOfHet, onlyLossHet = true) &&
resToDom(record, cmdArgs.resToDom) &&
trioCompound(record, cmdArgs.trioCompound) &&
(cmdArgs.iDset.isEmpty || inIdSet(record, cmdArgs.iDset))) {
writer.add(record)
counterLeft += 1
} else
......@@ -178,105 +203,172 @@ object VcfFilter extends ToolCommand {
if (counterTotal % 100000 == 0) logger.info(counterTotal + " variants processed, " + counterLeft + " left")
}
logger.info(counterTotal + " variants processed, " + counterLeft + " left")
reader.close
writer.close
reader.close()
writer.close()
invertedWriter.foreach(_.close())
logger.info("Done")
}
def minQualscore(record: VariantContext): Boolean = {
if (commandArgs.minQualScore.isEmpty) return true
record.getPhredScaledQual >= commandArgs.minQualScore.get
/**
* Checks if given samples are called
* @param record VCF record
* @param samples Samples that need this sample to be called
* @return false when filters fail
*/
def calledIn(record: VariantContext, samples: List[String]): Boolean = {
if (!samples.forall(record.getGenotype(_).isCalled)) false
else true
}
def filterRefCalls(record: VariantContext): Boolean = {
if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef)
else true
/**
* Checks if record has atleast minQualScore
* @param record VCF record
* @param minQualScore Minimal quality score
* @return false when filters fail
*/
def minQualscore(record: VariantContext, minQualScore: Double): Boolean = {
record.getPhredScaledQual >= minQualScore
}
def filterNoCalls(record: VariantContext): Boolean = {
if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall)
else true
/** returns true record contains Non reference genotypes */
def hasNonRefCalls(record: VariantContext): Boolean = {
record.getGenotypes.exists(g => !g.isHomRef)
}
/** returns true when record has calls */
def hasCalls(record: VariantContext): Boolean = {
record.getGenotypes.exists(g => !g.isNoCall)
}
def minTotalDepth(record: VariantContext): Boolean = {
record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth
/** returns true when DP INFO field is atleast the given value */
def hasMinTotalDepth(record: VariantContext, minTotalDepth: Int): Boolean = {
record.getAttributeAsInt("DP", -1) >= minTotalDepth
}
def minSampleDepth(record: VariantContext): Boolean = {
/**
* Checks if DP genotype field have a minimal value
* @param record VCF record
* @param minSampleDepth minimal depth
* @param minSamplesPass Minimal number of samples to pass filter
* @return true if filter passed
*/
def hasMinSampleDepth(record: VariantContext, minSampleDepth: Int, minSamplesPass: Int = 1): Boolean = {
record.getGenotypes.count(genotype => {
val DP = if (genotype.hasDP) genotype.getDP else -1
DP >= commandArgs.minSampleDepth
}) >= commandArgs.minSamplesPass
DP >= minSampleDepth
}) >= minSamplesPass
}
def minAlternateDepth(record: VariantContext): Boolean = {
/**
* Checks if AD genotype field have a minimal value
* @param record VCF record
* @param minAlternateDepth minimal depth
* @param minSamplesPass Minimal number of samples to pass filter
* @return true if filter passed
*/
def minAlternateDepth(record: VariantContext, minAlternateDepth: Int, minSamplesPass: Int = 1): Boolean = {
record.getGenotypes.count(genotype => {
val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true
}) >= commandArgs.minSamplesPass
if (!AD.isEmpty) AD.tail.count(_ >= minAlternateDepth) > 0 else true
}) >= minSamplesPass
}
def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = {
val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList
val bamADvalues = (for (field <- bamADFields) yield {
record.getAttribute(field, new ArrayList) match {
case t: ArrayList[_] if t.length > 1 => {
for (i <- 1 until t.size) yield {
t(i) match {
case a: Int => a > commandArgs.minBamAlternateDepth
case a: String => a.toInt > commandArgs.minBamAlternateDepth
case _ => false
}
}
}
case _ => List(false)
}
}).flatten
return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass
/**
* Checks if given samples does have a variant hin this record
* @param record VCF record
* @param mustHaveVariant List of samples that should have this variant
* @return true if filter passed
*/
def mustHaveVariant(record: VariantContext, mustHaveVariant: List[String]): Boolean = {
!mustHaveVariant.map(record.getGenotype).exists(a => a.isHomRef || a.isNoCall)
}
def mustHaveVariant(record: VariantContext): Boolean = {
return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall)
/** Checks if given samples have the same genotype */
def notSameGenotype(record: VariantContext, sample1: String, sample2: String): Boolean = {
val genotype1 = record.getGenotype(sample1)
val genotype2 = record.getGenotype(sample2)
if (genotype1.sameGenotype(genotype2)) false
else true
}
def notSameGenotype(record: VariantContext): Boolean = {
for ((sample1, sample2) <- commandArgs.diffGenotype) {
val genotype1 = record.getGenotype(sample1)
val genotype2 = record.getGenotype(sample2)
if (genotype1.sameGenotype(genotype2)) return false
/** Checks if sample1 is hetrozygous and if sample2 is homozygous for a alternative allele in sample1 */
def filterHetVarToHomVar(record: VariantContext, sample1: String, sample2: String): Boolean = {
val genotype1 = record.getGenotype(sample1)
val genotype2 = record.getGenotype(sample2)
if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
for (allele <- genotype1.getAlleles if allele.isNonReference) {
if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
}
}
return true
true
}
def filterHetVarToHomVar(record: VariantContext): Boolean = {
for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) {
val genotype1 = record.getGenotype(sample1)
val genotype2 = record.getGenotype(sample2)
if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) {
for (allele <- genotype1.getAlleles if allele.isNonReference) {
if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false
}
/** Checks if given sample have alternative alleles that are unique in the VCF record */
def denovoInSample(record: VariantContext, sample: String): Boolean = {
if (sample == null) return true
val genotype = record.getGenotype(sample)
if (genotype.isNoCall) return false
for (allele <- genotype.getAlleles) {
for (g <- record.getGenotypes if g.getSampleName != sample) {
if (g.getAlleles.exists(_.basesMatch(allele))) return false
}
}
return true
true
}
def denovoInSample(record: VariantContext): Boolean = {
if (commandArgs.deNovoInSample == null) return true
val genotype = record.getGenotype(commandArgs.deNovoInSample)
for (allele <- genotype.getAlleles if allele.isNonReference) {
for (g <- record.getGenotypes if g.getSampleName != commandArgs.deNovoInSample) {
if (g.getAlleles.exists(_.basesMatch(allele))) return false
/** Return true when variant is homozygous in the child and hetrozygous in parants */
def resToDom(record: VariantContext, trios: List[Trio]): Boolean = {
for (trio <- trios) {
val child = record.getGenotype(trio.child)
if (child.isHomVar && child.getAlleles.forall(allele => {
record.getGenotype(trio.father).countAllele(allele) == 1 &&
record.getGenotype(trio.mother).countAllele(allele) == 1
})) return true
}
trios.isEmpty
}
/** Returns true when variant a compound variant in the child and hetrozygous in parants */
def trioCompound(record: VariantContext, trios: List[Trio]): Boolean = {
for (trio <- trios) {
val child = record.getGenotype(trio.child)
if (child.isHetNonRef && child.getAlleles.forall(allele => {
record.getGenotype(trio.father).countAllele(allele) >= 1 &&
record.getGenotype(trio.mother).countAllele(allele) >= 1
})) return true
}
trios.isEmpty
}
/** Returns true when child got a deNovo variant */
def denovoTrio(record: VariantContext, trios: List[Trio], onlyLossHet: Boolean = false): Boolean = {
for (trio <- trios) {
val child = record.getGenotype(trio.child)
val father = record.getGenotype(trio.father)
val mother = record.getGenotype(trio.mother)
for (allele <- child.getAlleles) {
val childCount = child.countAllele(allele)
val fatherCount = father.countAllele(allele)
val motherCount = mother.countAllele(allele)
if (onlyLossHet) {
if (childCount == 2 && (
(fatherCount == 2 && motherCount == 0) ||
(fatherCount == 0 && motherCount == 2))) return true
} else {
if (childCount == 1 && fatherCount == 0 && motherCount == 0) return true
else if (childCount == 2 && (fatherCount == 0 || motherCount == 0)) return true
}
}
}
return true
trios.isEmpty
}
def inIdSet(record: VariantContext): Boolean = {
if (commandArgs.iDset.isEmpty) true
else record.getID.split(",").exists(commandArgs.iDset.contains(_))
/** Returns true when VCF record contains a ID from the given list */
def inIdSet(record: VariantContext, idSet: Set[String]): Boolean = {
record.getID.split(",").exists(idSet.contains)
}
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment