From 7bedb39b8d675953d859adb0b768290a50a97450 Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Mon, 11 Jan 2016 12:08:31 +0100
Subject: [PATCH] Merge bed records

---
 .../nl/lumc/sasc/biopet/tools/GvcfToBed.scala | 82 ++++++++++---------
 .../sasc/biopet/tools/GvcfToBedTest.scala     | 38 +--------
 .../nl/lumc/sasc/biopet/utils/VcfUtils.scala  | 26 +++++-
 3 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala
index 83beea074..f96835488 100644
--- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala
+++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/GvcfToBed.scala
@@ -4,7 +4,7 @@ import java.io.{ File, PrintWriter }
 
 import htsjdk.variant.variantcontext.VariantContext
 import htsjdk.variant.vcf.VCFFileReader
-import nl.lumc.sasc.biopet.utils.ToolCommand
+import nl.lumc.sasc.biopet.utils.{ VcfUtils, ToolCommand }
 import nl.lumc.sasc.biopet.utils.intervals.BedRecord
 
 import scala.collection.JavaConversions._
@@ -17,6 +17,7 @@ object GvcfToBed extends ToolCommand {
 
   case class Args(inputVcf: File = null,
                   outputBed: File = null,
+                  invertedOutputBed: Option[File] = None,
                   sample: Option[String] = None,
                   minGenomeQuality: Int = 0,
                   inverse: Boolean = false) extends AbstractArgs
@@ -28,15 +29,15 @@ object GvcfToBed extends ToolCommand {
     opt[File]('O', "outputBed") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
       c.copy(outputBed = x)
     } text "Output bed file"
+    opt[File]("invertedOutputBed") maxOccurs 1 valueName "<file>" action { (x, c) =>
+      c.copy(invertedOutputBed = Some(x))
+    } text "Output bed file"
     opt[String]('S', "sample") unbounded () maxOccurs 1 valueName "<sample>" action { (x, c) =>
       c.copy(sample = Some(x))
     } text "Sample to consider. Will take first sample on alphabetical order by default"
     opt[Int]("minGenomeQuality") unbounded () maxOccurs 1 valueName "<int>" action { (x, c) =>
       c.copy(minGenomeQuality = x)
     } text "Minimum genome quality to consider"
-    opt[Boolean]("invert") unbounded () maxOccurs 1 valueName "<boolean>" action { (x, c) =>
-      c.copy(inverse = x)
-    } text "Invert filter (i.e. emit only records NOT passing filter)"
   }
 
   def main(args: Array[String]): Unit = {
@@ -47,54 +48,61 @@ object GvcfToBed extends ToolCommand {
     val reader = new VCFFileReader(cmdArgs.inputVcf, false)
     logger.debug("Opening writer")
     val writer = new PrintWriter(cmdArgs.outputBed)
+    val invertedWriter = cmdArgs.invertedOutputBed.collect {
+      case file =>
+        logger.debug("Opening inverted writer")
+        new PrintWriter(file)
+    }
 
-    var counter = 0
+    val sample = cmdArgs.sample.getOrElse(reader.getFileHeader.getSampleNamesInOrder.head)
+
+    val it = reader.iterator()
+    val firstRecord = it.next()
+    var contig = firstRecord.getContig
+    var start = firstRecord.getStart
+    var end = firstRecord.getEnd
+    var pass = VcfUtils.hasMinGenomeQuality(firstRecord, sample, cmdArgs.minGenomeQuality)
+
+    def writeResetCachedRecord(newRecord: VariantContext): Unit = {
+      writeCachedRecord()
+      contig = newRecord.getContig
+      start = newRecord.getStart
+      end = newRecord.getEnd
+      pass = VcfUtils.hasMinGenomeQuality(newRecord, sample, cmdArgs.minGenomeQuality)
+    }
 
+    def writeCachedRecord(): Unit = {
+      if (pass) writer.println(new BedRecord(contig, start - 1, end))
+      else invertedWriter.foreach(_.println(new BedRecord(contig, start - 1, end)))
+    }
+
+    var counter = 1
     logger.info("Start")
-    for (r <- reader) {
-      if (!hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && cmdArgs.inverse) {
-        writer.println(createBedRecord(r).toString)
-      } else if (hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && !cmdArgs.inverse) {
-        writer.println(createBedRecord(r).toString)
-      }
+    for (r <- it) {
+      if (contig == r.getContig) {
+        val p = VcfUtils.hasMinGenomeQuality(r, sample, cmdArgs.minGenomeQuality)
+        if (p != pass || r.getStart > (end + 1)) writeResetCachedRecord(r)
+        else end = r.getEnd
+      } else writeResetCachedRecord(r)
 
       counter += 1
       if (counter % 100000 == 0) {
         logger.info(s"Processed $counter records")
       }
     }
+    writeCachedRecord()
+
+    logger.info(s"Processed $counter records")
 
     logger.debug("Closing writer")
     writer.close()
+    invertedWriter.foreach { w =>
+      logger.debug("Closing inverted writer")
+      w.close()
+    }
     logger.debug("Closing reader")
     reader.close()
 
     logger.info("Finished!")
   }
-
-  /**
-   * Check whether record has minimum genome qality
-   * @param record variant context
-   * @param sample Option[String] with sample name
-   * @param minGQ minimum genome quality value
-   * @return
-   */
-  def hasMinGenomeQuality(record: VariantContext, sample: Option[String], minGQ: Int): Boolean = {
-    sample foreach { x =>
-      if (!record.getSampleNamesOrderedByName.contains(x))
-        throw new IllegalArgumentException("Sample does not exist")
-    }
-    val gt = record.getGenotype(sample.getOrElse(record.getSampleNamesOrderedByName.head))
-    gt.hasGQ && gt.getGQ >= minGQ
-  }
-
-  /**
-   * Create bed record from variantcontext
-   * @param record variant context
-   * @return BedRecord
-   */
-  def createBedRecord(record: VariantContext): BedRecord = {
-    new BedRecord(record.getContig, record.getStart, record.getEnd)
-  }
-
 }
diff --git a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala
index 56a9ceb91..5664c9c6f 100644
--- a/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala
+++ b/public/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/GvcfToBedTest.scala
@@ -4,6 +4,7 @@ import java.io.File
 import java.nio.file.Paths
 
 import htsjdk.variant.vcf.VCFFileReader
+import nl.lumc.sasc.biopet.utils.VcfUtils
 import org.scalatest.Matchers
 import org.scalatest.mock.MockitoSugar
 import org.scalatest.testng.TestNGSuite
@@ -30,43 +31,12 @@ class GvcfToBedTest extends TestNGSuite with Matchers with MockitoSugar {
     val reader = new VCFFileReader(vepped, false)
     val record = reader.iterator().next()
 
-    hasMinGenomeQuality(record, None, 99) shouldBe true
-    hasMinGenomeQuality(record, Some("Sample_101"), 99) shouldBe true
+    VcfUtils.hasMinGenomeQuality(record, "Sample_101", 99) shouldBe true
 
     val reader2 = new VCFFileReader(unvepped, false)
     val record2 = reader2.iterator.next()
 
-    hasMinGenomeQuality(record2, None, 99) shouldBe false
-    hasMinGenomeQuality(record2, None, 0) shouldBe false
-    hasMinGenomeQuality(record2, Some("Sample_102"), 3) shouldBe true
-    hasMinGenomeQuality(record2, Some("Sample_102"), 99) shouldBe false
+    VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 3) shouldBe true
+    VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 99) shouldBe false
   }
-
-  @Test def wrongSample = {
-    val reader = new VCFFileReader(vepped, false)
-    val record = reader.iterator().next()
-
-    an[IllegalArgumentException] should be thrownBy hasMinGenomeQuality(record, Some("dummy"), 99)
-  }
-
-  @Test def testCreateBedRecord = {
-    val reader = new VCFFileReader(vepped, false)
-    val record = reader.iterator().next()
-
-    val bed = createBedRecord(record)
-    bed.chr shouldBe "chr1"
-    bed.start shouldBe 871042
-    bed.end shouldBe 871042
-
-    val reader2 = new VCFFileReader(unvepped, false)
-    val record2 = reader2.iterator.next()
-
-    val bed2 = createBedRecord(record2)
-    bed2.chr shouldBe "chr1"
-    bed2.start shouldBe 14599
-    bed2.end shouldBe 14599
-
-    //TODO: add GVCF-block vcf file to test
-  }
-
 }
diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
index 2c3064b84..e724575ac 100644
--- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
+++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/VcfUtils.scala
@@ -18,7 +18,7 @@ package nl.lumc.sasc.biopet.utils
 import java.io.File
 import java.util
 
-import htsjdk.variant.variantcontext.VariantContext
+import htsjdk.variant.variantcontext.{ Genotype, VariantContext }
 import htsjdk.variant.vcf.{ VCFFileReader, VCFHeader, VCFFilterHeaderLine }
 
 import scala.collection.JavaConversions._
@@ -103,4 +103,28 @@ object VcfUtils {
     reader.close()
     samples
   }
+
+  /**
+   * Check whether record has minimum genome Quality
+   * @param record variant context
+   * @param sample sample name
+   * @param minGQ minimum genome quality value
+   * @return
+   */
+  def hasMinGenomeQuality(record: VariantContext, sample: String, minGQ: Int): Boolean = {
+    if (!record.getSampleNamesOrderedByName.contains(sample))
+      throw new IllegalArgumentException("Sample does not exist")
+    val gt = record.getGenotype(sample)
+    hasMinGenomeQuality(gt, minGQ)
+  }
+
+  /**
+   * Check whether genotype has minimum genome Quality
+   * @param gt Genotype
+   * @param minGQ minimum genome quality value
+   * @return
+   */
+  def hasMinGenomeQuality(gt: Genotype, minGQ: Int): Boolean = {
+    gt.hasGQ && gt.getGQ >= minGQ
+  }
 }
-- 
GitLab