From a5b9de0f1a4f65c4e4d1b511f51e13a80a899a22 Mon Sep 17 00:00:00 2001
From: Sander Bollen <a.h.b.bollen@lumc.nl>
Date: Thu, 19 Feb 2015 16:44:23 +0100
Subject: [PATCH] Explode and standard in one function, no longer custom
 exceptions, CSQ tag by default removed.

---
 .../sasc/biopet/tools/VEPNormalizer.scala     | 130 ++++++------------
 .../sasc/biopet/tools/VEPNormalizerTest.scala |   8 +-
 2 files changed, 47 insertions(+), 91 deletions(-)

diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala
index 3fa3653cb..cab3a8968 100644
--- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala
+++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala
@@ -6,10 +6,9 @@ import htsjdk.tribble.TribbleException
 import scala.collection.JavaConversions._
 import nl.lumc.sasc.biopet.core.{ BiopetJavaCommandLineFunction, ToolCommand }
 import collection.mutable.{ Map => MMap }
-import collection.JavaConverters._
 import htsjdk.variant.vcf._
 import htsjdk.variant.variantcontext.{ VariantContextBuilder, VariantContext }
-import htsjdk.variant.variantcontext.writer.{ VariantContextWriter, VariantContextWriterBuilder }
+import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriter, VariantContextWriterBuilder }
 import nl.lumc.sasc.biopet.core.config.Configurable
 import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
 
@@ -53,14 +52,11 @@ object VEPNormalizer extends ToolCommand {
     logger.info(s"""Input VCF is $input""")
     logger.info(s"""Output VCF is $output""")
 
-    var reader: VCFFileReader = null
-    // this can give a codec error if malformed VCF
-    //
-    try {
-      reader = new VCFFileReader(input, false)
+    val reader = try {
+      new VCFFileReader(input, false)
     } catch {
       case e: TribbleException.MalformedFeatureFile =>
-        logger.error("Malformed VCF file! VCFv3 not supported!")
+        logger.error("Malformed VCF file! Are you sure this isn't a VCFv3 file?")
         throw e
     }
 
@@ -82,80 +78,41 @@ object VEPNormalizer extends ToolCommand {
     logger.debug("Header parsing done")
 
     logger.debug("Writing header to file")
-    val writerBuilder = new VariantContextWriterBuilder()
-    writerBuilder.
+    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
       setOutputFile(output).
-      setOutputFileType(VariantContextWriterBuilder.OutputType.BLOCK_COMPRESSED_VCF).
-      setReferenceDictionary(seqDict)
-    val writer = writerBuilder.build()
+      build())
     writer.writeHeader(header)
     logger.debug("Wrote header to file")
 
-    if (commandArgs.mode == "explode") {
-      logger.info("You have selected explode mode")
-      explode(reader, writer, new_infos)
-    } else if (commandArgs.mode == "standard") {
-      logger.info("You have selected standard mode")
-      standard(reader, writer, new_infos)
-    } else {
-      // this should be impossible, but should nevertheless be checked
-      logger.error("impossibru!", new IllegalArgumentException)
-    }
-  }
-
-  /**
-   * Wrapper for mode explode
-   * @param reader input VCF VCFFileReader
-   * @param writer output VCF VariantContextWriter
-   * @param new_infos array of string containing names of new info fields
-   */
-  def explode(reader: VCFFileReader, writer: VariantContextWriter, new_infos: Array[String]) = {
-    logger.info("Start processing records")
-    var nprocessed_records: Int = 0
-    var nwritten_records: Int = 0
-    for ((record, i) <- reader.iterator().zipWithIndex) {
-      nprocessed_records += 1
-      if (i % 1000 == 0) {
-        logger.info(s"""Read $i records \t Wrote $nwritten_records records""")
-      }
-      val new_records = explodeTranscripts(record, new_infos)
-      for (vc <- new_records) {
-        writer.add(vc)
-      }
-      nwritten_records += new_records.length
-    }
-    logger.info(s"""Processed $nprocessed_records records""")
-    logger.info("Done. Goodbye")
+    normalize(reader, writer, new_infos, commandArgs.mode, commandArgs.remove_CSQ)
     writer.close()
     logger.debug("Closed writer")
     reader.close()
     logger.debug("Closed reader")
+    logger.info("Done. Goodbye")
   }
 
   /**
-   * Wrapper for mode standard
+   * Normalizer
    * @param reader input VCF VCFFileReader
-   * @param writer output VCF VariantContextWriter
+   * @param writer output VCF AsyncVariantContextWriter
    * @param new_infos array of string containing names of new info fields
+   * @param mode normalizer mode (explode or standard)
+   * @param remove_csq remove csq tag (Boolean)
+   * @return
    */
-  def standard(reader: VCFFileReader, writer: VariantContextWriter, new_infos: Array[String]) = {
+  def normalize(reader: VCFFileReader, writer: AsyncVariantContextWriter,
+                new_infos: Array[String], mode: String, remove_csq: Boolean) = {
+    logger.info(s"""You have selected mode $mode""")
     logger.info("Start processing records")
-    var nprocessed_records: Int = 0
-    var nwritten_records: Int = 0
-    for ((record, i) <- reader.iterator().zipWithIndex) {
-      nprocessed_records += 1
-      if (i % 1000 == 0) {
-        logger.info(s"""Read $i records \t Wrote $nwritten_records records""")
+
+    for (record <- reader) {
+      mode match {
+        case "explode"  => explodeTranscripts(record, new_infos, remove_csq).foreach(vc => writer.add(vc))
+        case "standard" => writer.add(standardTranscripts(record, new_infos, remove_csq))
+        case _          => throw new IllegalArgumentException("Something odd happened!")
       }
-      writer.add(standardTranscripts(record, new_infos))
-      nwritten_records += 1
     }
-    logger.info(s"""Processed $nprocessed_records records""")
-    logger.info("Done. Goodbye")
-    writer.close()
-    logger.debug("Closed writer")
-    reader.close()
-    logger.debug("Closed reader")
   }
 
   /**
@@ -165,7 +122,7 @@ object VEPNormalizer extends ToolCommand {
   def csqCheck(header: VCFHeader) = {
     if (!header.hasInfoLine("CSQ")) {
       //logger.error("No CSQ info tag found! Is this file VEP-annotated?")
-      throw new VEPException("No CSQ info tag found! Is this file VEP-annotated?")
+      throw new IllegalArgumentException("No CSQ info tag found! Is this file VEP-annotated?")
     }
   }
 
@@ -186,7 +143,7 @@ object VEPNormalizer extends ToolCommand {
     val version = VCFHeaderVersion.toHeaderVersion(format)
     if (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) {
       //logger.error(s"""version $version is not supported""")
-      throw new VEPException(s"""version $version is not supported""")
+      throw new IllegalArgumentException(s"""version $version is not supported""")
     }
   }
 
@@ -197,7 +154,7 @@ object VEPNormalizer extends ToolCommand {
    */
   def parseCsq(header: VCFHeader): Array[String] = {
     header.getInfoHeaderLine("CSQ").getDescription.
-      split(':')(1).trim.split('|')
+      split(':')(1).trim.split('|').map("VEP_"+_)
   }
 
   /**
@@ -207,21 +164,26 @@ object VEPNormalizer extends ToolCommand {
    * @param csq_infos An array with names of new info tags
    * @return An array with the new records
    */
-  def explodeTranscripts(record: VariantContext, csq_infos: Array[String]): Array[VariantContext] = {
+  def explodeTranscripts(record: VariantContext, csq_infos: Array[String], remove_CSQ: Boolean): Array[VariantContext] = {
     val csq = record.getAttributeAsString("CSQ", "unknown")
-    val attributes = record.getAttributes.toMap
+    val attributes = if(remove_CSQ) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap
 
     csq.
       stripPrefix("[").
       stripSuffix("]").
       split(",").
       map(x => attributes ++ csq_infos.zip(x.split("""\|""", -1))).
-      map(x => new VariantContextBuilder(record).attributes(x).make())
+      map(x => {
+      if (remove_CSQ) new VariantContextBuilder(record)
+        .attributes(x)
+        .make()
+      else new VariantContextBuilder(record).attributes(x).make()
+    })
   }
 
-  def standardTranscripts(record: VariantContext, csqInfos: Array[String]): VariantContext = {
+  def standardTranscripts(record: VariantContext, csqInfos: Array[String], remove_CSQ: Boolean): VariantContext = {
     val csq = record.getAttributeAsString("CSQ", "unknown")
-    val attributes = record.getAttributes.toMap
+    val attributes = if(remove_CSQ) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap
 
     val new_attrs = attributes ++ csqInfos.zip(csq.
       stripPrefix("[").
@@ -236,23 +198,13 @@ object VEPNormalizer extends ToolCommand {
       ).
         map(x => x.mkString(",")))
 
+
     new VariantContextBuilder(record).attributes(new_attrs).make()
   }
-  /**
-   * This one-line class defines a new VEP-specific exception
-   * @param msg The error message
-   */
-  class VEPException(msg: String) extends RuntimeException(msg)
-
-  /**
-   * This one-line class defines a version exception
-   * @param msg The error message
-   */
-  class VCFVersionException(msg: String) extends RuntimeException(msg)
-
   case class Args(inputVCF: File = null,
                   outputVCF: File = null,
-                  mode: String = null) extends AbstractArgs
+                  mode: String = null,
+                  remove_CSQ: Boolean = true ) extends AbstractArgs
 
   class OptParser extends AbstractOptParser {
 
@@ -267,7 +219,8 @@ object VEPNormalizer extends ToolCommand {
     opt[File]('O', "OutputFile") required () valueName "<vcf>" action { (x, c) =>
       c.copy(outputVCF = x)
     } validate {
-      x => if (x.exists) success else success
+      x => if (!x.getName.endsWith(".vcf") && (!x.getName.endsWith(".vcf.gz")) &&(!x.getName.endsWith(".bcf")))
+        failure("Unsupported output file type") else success
     } text "Output VCF file"
 
     opt[String]('m', "mode") required () valueName "<mode>" action { (x, c) =>
@@ -275,6 +228,9 @@ object VEPNormalizer extends ToolCommand {
     } validate {
       x => if (x == "explode") success else if (x == "standard") success else failure("Unsupported mode")
     } text "Mode"
+
+    opt[Unit]("do-not-remove") action { (x, c) =>
+    c.copy(remove_CSQ = false)} text "Do not remove CSQ tag"
   }
 }
 
diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala
index c87ad68c9..9d216575e 100644
--- a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala
+++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala
@@ -36,21 +36,21 @@ class VEPNormalizerTest extends TestNGSuite with MockitoSugar with Matchers {
     val reader = new VCFFileReader(vepped, false)
     val header = reader.getFileHeader
     val new_infos = parseCsq(header)
-    explodeTranscripts(reader.iterator().next(), new_infos).length should be(11)
+    explodeTranscripts(reader.iterator().next(), new_infos, true).length should be(11)
   }
 
   @Test def testStandardVEPLength() = {
     val reader = new VCFFileReader(vepped, false)
     val header = reader.getFileHeader
     val new_infos = parseCsq(header)
-    Array(standardTranscripts(reader.iterator().next(), new_infos)).length should be(1)
+    Array(standardTranscripts(reader.iterator().next(), new_infos, true)).length should be(1)
   }
 
   @Test def testStandardVEPAttributeLength() = {
     val reader = new VCFFileReader(vepped, false)
     val header = reader.getFileHeader
     val new_infos = parseCsq(header)
-    val record = standardTranscripts(reader.iterator().next(), new_infos)
+    val record = standardTranscripts(reader.iterator().next(), new_infos, true)
     def checkItems(items: Array[String]) = {
       items.foreach { check }
     }
@@ -73,7 +73,7 @@ class VEPNormalizerTest extends TestNGSuite with MockitoSugar with Matchers {
     val reader = new VCFFileReader(vcf3, false)
   }
 
-  @Test(expectedExceptions = Array(classOf[VEPException]))
+  @Test(expectedExceptions = Array(classOf[IllegalArgumentException]))
   def testNoCSQTagException {
     csqCheck(new VCFFileReader(unvepped, false).getFileHeader)
   }
-- 
GitLab