diff --git a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala index 39d7465b04ad72a7b3a4bcf2a6a961a79e1f9bf4..117c9083d8bd0e74d00b4fa3ba8b9ca3e7a619e9 100644 --- a/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala +++ b/protected/biopet-gatk-pipelines/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -49,9 +49,6 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri } def init() { - if (config.contains("target_bed")) { - defaults ++= Map("gatk" -> Map(("intervals" -> config("target_bed").asStringList))) - } if (config.contains("gvcfFiles")) for (file <- config("gvcfFiles").asList) gvcfFiles :+= file.toString diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala index 08fbbb701632337090174957438950b2905c5c30..d6f72b816788afa99e45cad2b154f4e38c1bd985 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala @@ -35,8 +35,7 @@ class MpileupToVcf(val root: Configurable) extends BiopetJavaCommandLineFunction override val defaultVmem = "6G" memoryLimit = Option(2.0) - if (config.contains("target_bed")) defaults ++= Map("samtoolsmpileup" -> Map("interval_bed" -> config("target_bed").asStringList.head, - "disable_baq" -> true, "min_map_quality" -> 1)) + defaults ++= Map("samtoolsmpileup" -> Map("disable_baq" -> true, "min_map_quality" -> 1)) override def afterGraph { super.afterGraph diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala index d9f9c8c49aad126d42d3538e3276274b66e83d2c..e948c05e6bcc37796a856c42d8f1c4d0991c0306 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala @@ -2,7 +2,8 @@ package nl.lumc.sasc.biopet.tools import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder -import htsjdk.variant.vcf.VCFFileReader +import htsjdk.variant.vcf.{ VCFHeader, VCFFileReader } +import htsjdk.variant.variantcontext.VariantContext import java.io.File import java.util.ArrayList import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction @@ -48,81 +49,189 @@ class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction { } object VcfFilter extends ToolCommand { - case class Args(inputVcf: File = null, outputVcf: File = null, minSampleDepth: Int = -1, minTotalDepth: Int = -1, - minAlternateDepth: Int = -1, minSamplesPass: Int = 0, minBamAlternateDepth: Int = 0, filterRefCalls: Boolean = false) extends AbstractArgs + case class Args(inputVcf: File = null, + outputVcf: File = null, + minQualscore: Option[Double] = None, + minSampleDepth: Int = -1, + minTotalDepth: Int = -1, + minAlternateDepth: Int = -1, + minSamplesPass: Int = 0, + minBamAlternateDepth: Int = 0, + mustHaveVariant: List[String] = Nil, + denovoInSample: String = null, + diffGenotype: List[(String, String)] = Nil, + filterHetVarToHomVar: List[(String, String)] = Nil, + filterRefCalls: Boolean = false, + filterNoCalls: Boolean = false) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) => c.copy(inputVcf = x) - } + } text ("Input vcf file") opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) => c.copy(outputVcf = x) - } text ("output file, default to stdout") - opt[Int]("minSampleDepth") unbounded () action { (x, c) => + } text ("Output vcf file") + opt[Int]("minSampleDepth") unbounded () valueName ("<int>") action { (x, c) => c.copy(minSampleDepth = x) - } - opt[Int]("minTotalDepth") unbounded () action { (x, c) => + } text ("Min value for DP in genotype fields") + opt[Int]("minTotalDepth") unbounded () valueName ("<int>") action { (x, c) => c.copy(minTotalDepth = x) - } - opt[Int]("minAlternateDepth") unbounded () action { (x, c) => + } text ("Min value of DP field in INFO fields") + opt[Int]("minAlternateDepth") unbounded () valueName ("<int>") action { (x, c) => c.copy(minAlternateDepth = x) - } - opt[Int]("minSamplesPass") unbounded () action { (x, c) => + } text ("Min value of AD field in genotype fields") + opt[Int]("minSamplesPass") unbounded () valueName ("<int>") action { (x, c) => c.copy(minSamplesPass = x) - } - opt[Int]("minBamAlternateDepth") unbounded () action { (x, c) => + } text ("Min number opf samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth") + opt[Int]("minBamAlternateDepth") unbounded () valueName ("<int>") action { (x, c) => c.copy(minBamAlternateDepth = x) - } + } // TODO: Convert this to more generic filter + opt[String]("denovoInSample") maxOccurs (1) unbounded () valueName ("<sample>") action { (x, c) => + c.copy(denovoInSample = x) + } text ("Only show variants that contain unique alleles in compete set for given sample") + opt[String]("mustHaveVariant") unbounded () valueName ("<sample>") action { (x, c) => + c.copy(mustHaveVariant = x :: c.mustHaveVariant) + } text ("Given sample must have 1 alternative allele") + opt[String]("diffGenotype") unbounded () valueName ("<sample:sample>") action { (x, c) => + c.copy(diffGenotype = (x.split(":")(0), x.split(":")(1)) :: c.diffGenotype) + } validate { x => if (x.split(":").length == 2) success else failure("--notSameGenotype should be in this format: sample:sample") + } text ("Given samples must have a different genotype") + opt[String]("filterHetVarToHomVar") unbounded () valueName ("<sample:sample>") action { (x, c) => + c.copy(filterHetVarToHomVar = (x.split(":")(0), x.split(":")(1)) :: c.filterHetVarToHomVar) + } validate { x => if (x.split(":").length == 2) success else failure("--filterHetVarToHomVar should be in this format: sample:sample") + } text ("If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filterd") opt[Unit]("filterRefCalls") unbounded () action { (x, c) => c.copy(filterRefCalls = true) - } + } text ("Filter when there are only ref calls") + opt[Unit]("filterNoCalls") unbounded () action { (x, c) => + c.copy(filterNoCalls = true) + } text ("Filter when there are only no calls") + opt[Double]("minQualscore") unbounded () action { (x, c) => + c.copy(minQualscore = Some(x)) + } text ("Min qual score") } + var commandArgs: Args = _ + /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { val argsParser = new OptParser - val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) val reader = new VCFFileReader(commandArgs.inputVcf, false) val header = reader.getFileHeader val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputVcf).build) writer.writeHeader(header) - val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList - val bamDPFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-DP-")) yield line.getID).toList - for (record <- reader) { - val genotypes = for (genotype <- record.getGenotypes) yield { - val DP = if (genotype.hasDP) genotype.getDP else -1 - val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil - DP >= commandArgs.minSampleDepth && - (if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true) && - !(commandArgs.filterRefCalls && genotype.isHomRef) + if (minQualscore(record) && + filterRefCalls(record) && + filterNoCalls(record) && + minTotalDepth(record) && + minSampleDepth(record) && + minAlternateDepth(record) && + minBamAlternateDepth(record, header) && + mustHaveVariant(record) && + notSameGenotype(record) && + filterHetVarToHomVar(record) && + denovoInSample(record)) { + writer.add(record) } + } + reader.close + writer.close + } + + def minQualscore(record: VariantContext): Boolean = { + if (commandArgs.minQualscore.isEmpty) return true + record.getPhredScaledQual >= commandArgs.minQualscore.get + } + + def filterRefCalls(record: VariantContext): Boolean = { + if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isHomRef) + else true + } + + def filterNoCalls(record: VariantContext): Boolean = { + if (commandArgs.filterNoCalls) record.getGenotypes.exists(g => !g.isNoCall) + else true + } + + def minTotalDepth(record: VariantContext): Boolean = { + record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth + } - val bamADvalues = (for (field <- bamADFields) yield { - record.getAttribute(field, new ArrayList) match { - case t: ArrayList[_] if t.length > 1 => { - for (i <- 1 until t.size) yield { - t(i) match { - case a: Int => a > commandArgs.minBamAlternateDepth - case a: String => a.toInt > commandArgs.minBamAlternateDepth - case _ => false - } + def minSampleDepth(record: VariantContext): Boolean = { + record.getGenotypes.count(genotype => { + val DP = if (genotype.hasDP) genotype.getDP else -1 + DP >= commandArgs.minSampleDepth + }) >= commandArgs.minSamplesPass + } + + def minAlternateDepth(record: VariantContext): Boolean = { + record.getGenotypes.count(genotype => { + val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil + if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true + }) >= commandArgs.minSamplesPass + } + + def minBamAlternateDepth(record: VariantContext, header: VCFHeader): Boolean = { + val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList + + val bamADvalues = (for (field <- bamADFields) yield { + record.getAttribute(field, new ArrayList) match { + case t: ArrayList[_] if t.length > 1 => { + for (i <- 1 until t.size) yield { + t(i) match { + case a: Int => a > commandArgs.minBamAlternateDepth + case a: String => a.toInt > commandArgs.minBamAlternateDepth + case _ => false } } - case _ => List(false) } - }).flatten + case _ => List(false) + } + }).flatten - if (record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth && - genotypes.count(_ == true) >= commandArgs.minSamplesPass && - (commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass)) - writer.add(record) + return commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass + } + + def mustHaveVariant(record: VariantContext): Boolean = { + return !commandArgs.mustHaveVariant.map(record.getGenotype(_)).exists(a => a.isHomRef || a.isNoCall) + } + + def notSameGenotype(record: VariantContext): Boolean = { + for ((sample1, sample2) <- commandArgs.diffGenotype) { + val genotype1 = record.getGenotype(sample1) + val genotype2 = record.getGenotype(sample2) + if (genotype1.sameGenotype(genotype2)) return false } - reader.close - writer.close + return true + } + + def filterHetVarToHomVar(record: VariantContext): Boolean = { + for ((sample1, sample2) <- commandArgs.filterHetVarToHomVar) { + val genotype1 = record.getGenotype(sample1) + val genotype2 = record.getGenotype(sample2) + if (genotype1.isHet && !genotype1.getAlleles.forall(_.isNonReference)) { + for (allele <- genotype1.getAlleles if allele.isNonReference) { + if (genotype2.getAlleles.forall(_.basesMatch(allele))) return false + } + } + } + return true + } + + def denovoInSample(record: VariantContext): Boolean = { + if (commandArgs.denovoInSample == null) return true + val genotype = record.getGenotype(commandArgs.denovoInSample) + for (allele <- genotype.getAlleles if allele.isNonReference) { + for (g <- record.getGenotypes if g.getSampleName != commandArgs.denovoInSample) { + if (g.getAlleles.exists(_.basesMatch(allele))) return false + } + } + return true } } \ No newline at end of file