From 9a00253ef61856ed0c58460617f8f0610e05372f Mon Sep 17 00:00:00 2001
From: bow <bow@bow.web.id>
Date: Fri, 31 Oct 2014 14:26:19 +0100
Subject: [PATCH] Add refFlat support

---
 .../nl/lumc/sasc/biopet/tools/WipeReads.scala | 37 +++++++++++++++----
 .../src/test/resources/rrna01.refFlat         |  3 ++
 .../sasc/biopet/tools/WipeReadsUnitTest.scala | 29 +++++++++++----
 3 files changed, 55 insertions(+), 14 deletions(-)
 create mode 100644 biopet-framework/src/test/resources/rrna01.refFlat

diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala
index d5b243f83..f28f61256 100644
--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala
@@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools
 
 import java.io.File
 import scala.collection.JavaConverters._
+import scala.io.Source
 import scala.math.{ max, min }
 
 import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink }
@@ -83,20 +84,42 @@ object WipeReads extends ToolCommand {
         .asScala
         .map(x => new Interval(x.getChr, x.getStart, x.getEnd))
 
-    /** Function to create iterator from refFlat file */
-    def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ???
-      // convert coordinate to 1-based fully closed
-      // parse chrom, start blocks, end blocks, strands
+    /**
+     * Parses a refFlat file to yield Interval objects
+     *
+     * Format description:
+     * http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat
+     *
+     * @param inFile input refFlat file
+     */
+    def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] =
+      Source.fromFile(inFile)
+        // read each line
+        .getLines()
+        // skip all empty lines
+        .filterNot(x => x.trim.isEmpty)
+        // split per column
+        .map(line => line.trim.split("\t"))
+        // take chromosome and exonEnds and exonStars
+        .map(x => (x(2), x.reverse.take(2)))
+        // split starts and ends based on comma
+        .map(x => (x._1, x._2.map(y => y.split(","))))
+        // zip exonStarts and exonEnds, note the index was reversed because we did .reverse above
+        .map(x => (x._1, x._2(1).zip(x._2(0))))
+        // make Intervals
+        .map(x => x._2.map(y => new Interval(x._1, y._1.toInt, y._2.toInt)))
+        // flatten sublist
+        .flatten
 
     /** Function to create iterator from GTF file */
     def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ???
-        // convert coordinate to 1-based fully closed
-        // parse chrom, start blocks, end blocks, strands
 
     // detect interval file format from extension
     val iterFunc: (File => Iterator[Interval]) =
       if (getExtension(inFile.toString.toLowerCase) == "bed")
         makeIntervalFromBed
+      else if (getExtension(inFile.toString.toLowerCase) == "refflat")
+        makeIntervalFromRefFlat
       else
         throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
 
@@ -319,7 +342,7 @@ object WipeReads extends ToolCommand {
       x => if (x.exists) success else failure("Input BAM file not found")
     } text "Input BAM file"
 
-    opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) =>
+    opt[File]('r', "interval_file") required () valueName "<bed/refflat>" action { (x, c) =>
       c.copy(targetRegions = x)
     } validate {
       x => if (x.exists) success else failure("Target regions file not found")
diff --git a/biopet-framework/src/test/resources/rrna01.refFlat b/biopet-framework/src/test/resources/rrna01.refFlat
new file mode 100644
index 000000000..30e68e91c
--- /dev/null
+++ b/biopet-framework/src/test/resources/rrna01.refFlat
@@ -0,0 +1,3 @@
+gene1	trans1	chrS	+	100	500	100	500	1	100,	500,
+gene1	trans1	chrR	+	100	500	200	400	1	100,	500,
+gene1	trans1	chrQ	+	100	1000	200	900	3	100,300,800	200,400,1000
diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala
index f05b34219..9523ba2bb 100644
--- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala
+++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala
@@ -126,18 +126,33 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
   )
 
   val pBamFile3 = new File(resourcePath("/paired03.bam"))
+
   val BedFile1 = new File(resourcePath("/rrna01.bed"))
-  val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam")
+  val RefFlatFile1 = new File(resourcePath("/rrna01.refFlat"))
 
-  @Test def testMakeFeatureFromBed() = {
+  @Test def testMakeIntervalFromBed() = {
     val intervals: List[Interval] = makeIntervalFromFile(BedFile1)
-    intervals.length should be(3)
+    intervals.length shouldBe 3
     intervals.head.getSequence should ===("chrQ")
-    intervals.head.getStart should be(991)
-    intervals.head.getEnd should be(1000)
+    intervals.head.getStart shouldBe 991
+    intervals.head.getEnd shouldBe 1000
+    intervals.last.getSequence should ===("chrQ")
+    intervals.last.getStart shouldBe 291
+    intervals.last.getEnd shouldBe 320
+  }
+
+  @Test def testMakeIntervalFromRefFlat() = {
+    val intervals: List[Interval] = makeIntervalFromFile(RefFlatFile1)
+    intervals.length shouldBe 5
     intervals.last.getSequence should ===("chrQ")
-    intervals.last.getStart should be(291)
-    intervals.last.getEnd should be(320)
+    intervals.last.getStart shouldBe 100
+    intervals.last.getEnd shouldBe 200
+    intervals(2).getSequence should ===("chrQ")
+    intervals(2).getStart shouldBe 800
+    intervals(2).getEnd shouldBe 1000
+    intervals.head.getSequence should ===("chrS")
+    intervals.head.getStart shouldBe 100
+    intervals.head.getEnd shouldBe 500
   }
 
   @Test def testSingleBamDefault() = {
-- 
GitLab