From 7bedb39b8d675953d859adb0b768290a50a97450 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Mon, 11 Jan 2016 12:08:31 +0100 Subject: [PATCH] Merge bed records --- .../nl/lumc/sasc/biopet/tools/GvcfToBed.scala | 82 ++++++++++--------- .../sasc/biopet/tools/GvcfToBedTest.scala | 38 +-------- .../nl/lumc/sasc/biopet/utils/VcfUtils.scala | 26 +++++- 3 files changed, 74 insertions(+), 72 deletions(-) diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala index 83beea074..f96835488 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala @@ -4,7 +4,7 @@ import java.io.{ File, PrintWriter } import htsjdk.variant.variantcontext.VariantContext import htsjdk.variant.vcf.VCFFileReader -import nl.lumc.sasc.biopet.utils.ToolCommand +import nl.lumc.sasc.biopet.utils.{ VcfUtils, ToolCommand } import nl.lumc.sasc.biopet.utils.intervals.BedRecord import scala.collection.JavaConversions._ @@ -17,6 +17,7 @@ object GvcfToBed extends ToolCommand { case class Args(inputVcf: File = null, outputBed: File = null, + invertedOutputBed: Option[File] = None, sample: Option[String] = None, minGenomeQuality: Int = 0, inverse: Boolean = false) extends AbstractArgs @@ -28,15 +29,15 @@ object GvcfToBed extends ToolCommand { opt[File]('O', "outputBed") required () maxOccurs 1 valueName "<file>" action { (x, c) => c.copy(outputBed = x) } text "Output bed file" + opt[File]("invertedOutputBed") maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(invertedOutputBed = Some(x)) + } text "Output bed file" opt[String]('S', "sample") unbounded () maxOccurs 1 valueName "<sample>" action { (x, c) => c.copy(sample = Some(x)) } text "Sample to consider. Will take first sample on alphabetical order by default" opt[Int]("minGenomeQuality") unbounded () maxOccurs 1 valueName "<int>" action { (x, c) => c.copy(minGenomeQuality = x) } text "Minimum genome quality to consider" - opt[Boolean]("invert") unbounded () maxOccurs 1 valueName "<boolean>" action { (x, c) => - c.copy(inverse = x) - } text "Invert filter (i.e. emit only records NOT passing filter)" } def main(args: Array[String]): Unit = { @@ -47,54 +48,61 @@ object GvcfToBed extends ToolCommand { val reader = new VCFFileReader(cmdArgs.inputVcf, false) logger.debug("Opening writer") val writer = new PrintWriter(cmdArgs.outputBed) + val invertedWriter = cmdArgs.invertedOutputBed.collect { + case file => + logger.debug("Opening inverted writer") + new PrintWriter(file) + } - var counter = 0 + val sample = cmdArgs.sample.getOrElse(reader.getFileHeader.getSampleNamesInOrder.head) + + val it = reader.iterator() + val firstRecord = it.next() + var contig = firstRecord.getContig + var start = firstRecord.getStart + var end = firstRecord.getEnd + var pass = VcfUtils.hasMinGenomeQuality(firstRecord, sample, cmdArgs.minGenomeQuality) + + def writeResetCachedRecord(newRecord: VariantContext): Unit = { + writeCachedRecord() + contig = newRecord.getContig + start = newRecord.getStart + end = newRecord.getEnd + pass = VcfUtils.hasMinGenomeQuality(newRecord, sample, cmdArgs.minGenomeQuality) + } + def writeCachedRecord(): Unit = { + if (pass) writer.println(new BedRecord(contig, start - 1, end)) + else invertedWriter.foreach(_.println(new BedRecord(contig, start - 1, end))) + } + + var counter = 1 logger.info("Start") - for (r <- reader) { - if (!hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && cmdArgs.inverse) { - writer.println(createBedRecord(r).toString) - } else if (hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && !cmdArgs.inverse) { - writer.println(createBedRecord(r).toString) - } + for (r <- it) { + if (contig == r.getContig) { + val p = VcfUtils.hasMinGenomeQuality(r, sample, cmdArgs.minGenomeQuality) + if (p != pass || r.getStart > (end + 1)) writeResetCachedRecord(r) + else end = r.getEnd + } else writeResetCachedRecord(r) counter += 1 if (counter % 100000 == 0) { logger.info(s"Processed $counter records") } } + writeCachedRecord() + + logger.info(s"Processed $counter records") logger.debug("Closing writer") writer.close() + invertedWriter.foreach { w => + logger.debug("Closing inverted writer") + w.close() + } logger.debug("Closing reader") reader.close() logger.info("Finished!") } - - /** - * Check whether record has minimum genome qality - * @param record variant context - * @param sample Option[String] with sample name - * @param minGQ minimum genome quality value - * @return - */ - def hasMinGenomeQuality(record: VariantContext, sample: Option[String], minGQ: Int): Boolean = { - sample foreach { x => - if (!record.getSampleNamesOrderedByName.contains(x)) - throw new IllegalArgumentException("Sample does not exist") - } - val gt = record.getGenotype(sample.getOrElse(record.getSampleNamesOrderedByName.head)) - gt.hasGQ && gt.getGQ >= minGQ - } - - /** - * Create bed record from variantcontext - * @param record variant context - * @return BedRecord - */ - def createBedRecord(record: VariantContext): BedRecord = { - new BedRecord(record.getContig, record.getStart, record.getEnd) - } - } diff --git a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala index 56a9ceb91..5664c9c6f 100644 --- a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala +++ b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala @@ -4,6 +4,7 @@ import java.io.File import java.nio.file.Paths import htsjdk.variant.vcf.VCFFileReader +import nl.lumc.sasc.biopet.utils.VcfUtils import org.scalatest.Matchers import org.scalatest.mock.MockitoSugar import org.scalatest.testng.TestNGSuite @@ -30,43 +31,12 @@ class GvcfToBedTest extends TestNGSuite with Matchers with MockitoSugar { val reader = new VCFFileReader(vepped, false) val record = reader.iterator().next() - hasMinGenomeQuality(record, None, 99) shouldBe true - hasMinGenomeQuality(record, Some("Sample_101"), 99) shouldBe true + VcfUtils.hasMinGenomeQuality(record, "Sample_101", 99) shouldBe true val reader2 = new VCFFileReader(unvepped, false) val record2 = reader2.iterator.next() - hasMinGenomeQuality(record2, None, 99) shouldBe false - hasMinGenomeQuality(record2, None, 0) shouldBe false - hasMinGenomeQuality(record2, Some("Sample_102"), 3) shouldBe true - hasMinGenomeQuality(record2, Some("Sample_102"), 99) shouldBe false + VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 3) shouldBe true + VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 99) shouldBe false } - - @Test def wrongSample = { - val reader = new VCFFileReader(vepped, false) - val record = reader.iterator().next() - - an[IllegalArgumentException] should be thrownBy hasMinGenomeQuality(record, Some("dummy"), 99) - } - - @Test def testCreateBedRecord = { - val reader = new VCFFileReader(vepped, false) - val record = reader.iterator().next() - - val bed = createBedRecord(record) - bed.chr shouldBe "chr1" - bed.start shouldBe 871042 - bed.end shouldBe 871042 - - val reader2 = new VCFFileReader(unvepped, false) - val record2 = reader2.iterator.next() - - val bed2 = createBedRecord(record2) - bed2.chr shouldBe "chr1" - bed2.start shouldBe 14599 - bed2.end shouldBe 14599 - - //TODO: add GVCF-block vcf file to test - } - } diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala index 2c3064b84..e724575ac 100644 --- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala +++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala @@ -18,7 +18,7 @@ package nl.lumc.sasc.biopet.utils import java.io.File import java.util -import htsjdk.variant.variantcontext.VariantContext +import htsjdk.variant.variantcontext.{ Genotype, VariantContext } import htsjdk.variant.vcf.{ VCFFileReader, VCFHeader, VCFFilterHeaderLine } import scala.collection.JavaConversions._ @@ -103,4 +103,28 @@ object VcfUtils { reader.close() samples } + + /** + * Check whether record has minimum genome Quality + * @param record variant context + * @param sample sample name + * @param minGQ minimum genome quality value + * @return + */ + def hasMinGenomeQuality(record: VariantContext, sample: String, minGQ: Int): Boolean = { + if (!record.getSampleNamesOrderedByName.contains(sample)) + throw new IllegalArgumentException("Sample does not exist") + val gt = record.getGenotype(sample) + hasMinGenomeQuality(gt, minGQ) + } + + /** + * Check whether genotype has minimum genome Quality + * @param gt Genotype + * @param minGQ minimum genome quality value + * @return + */ + def hasMinGenomeQuality(gt: Genotype, minGQ: Int): Boolean = { + gt.hasGQ && gt.getGQ >= minGQ + } } -- GitLab