From 9a00253ef61856ed0c58460617f8f0610e05372f Mon Sep 17 00:00:00 2001 From: bow <bow@bow.web.id> Date: Fri, 31 Oct 2014 14:26:19 +0100 Subject: [PATCH] Add refFlat support --- .../nl/lumc/sasc/biopet/tools/WipeReads.scala | 37 +++++++++++++++---- .../src/test/resources/rrna01.refFlat | 3 ++ .../sasc/biopet/tools/WipeReadsUnitTest.scala | 29 +++++++++++---- 3 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 biopet-framework/src/test/resources/rrna01.refFlat diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index d5b243f83..f28f61256 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools import java.io.File import scala.collection.JavaConverters._ +import scala.io.Source import scala.math.{ max, min } import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink } @@ -83,20 +84,42 @@ object WipeReads extends ToolCommand { .asScala .map(x => new Interval(x.getChr, x.getStart, x.getEnd)) - /** Function to create iterator from refFlat file */ - def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + /** + * Parses a refFlat file to yield Interval objects + * + * Format description: + * http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat + * + * @param inFile input refFlat file + */ + def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = + Source.fromFile(inFile) + // read each line + .getLines() + // skip all empty lines + .filterNot(x => x.trim.isEmpty) + // split per column + .map(line => line.trim.split("\t")) + // take chromosome and exonEnds and exonStars + .map(x => (x(2), x.reverse.take(2))) + // split starts and ends based on comma + .map(x => (x._1, x._2.map(y => y.split(",")))) + // zip exonStarts and exonEnds, note the index was reversed because we did .reverse above + .map(x => (x._1, x._2(1).zip(x._2(0)))) + // make Intervals + .map(x => x._2.map(y => new Interval(x._1, y._1.toInt, y._2.toInt))) + // flatten sublist + .flatten /** Function to create iterator from GTF file */ def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands // detect interval file format from extension val iterFunc: (File => Iterator[Interval]) = if (getExtension(inFile.toString.toLowerCase) == "bed") makeIntervalFromBed + else if (getExtension(inFile.toString.toLowerCase) == "refflat") + makeIntervalFromRefFlat else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) @@ -319,7 +342,7 @@ object WipeReads extends ToolCommand { x => if (x.exists) success else failure("Input BAM file not found") } text "Input BAM file" - opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) => + opt[File]('r', "interval_file") required () valueName "<bed/refflat>" action { (x, c) => c.copy(targetRegions = x) } validate { x => if (x.exists) success else failure("Target regions file not found") diff --git a/biopet-framework/src/test/resources/rrna01.refFlat b/biopet-framework/src/test/resources/rrna01.refFlat new file mode 100644 index 000000000..30e68e91c --- /dev/null +++ b/biopet-framework/src/test/resources/rrna01.refFlat @@ -0,0 +1,3 @@ +gene1 trans1 chrS + 100 500 100 500 1 100, 500, +gene1 trans1 chrR + 100 500 200 400 1 100, 500, +gene1 trans1 chrQ + 100 1000 200 900 3 100,300,800 200,400,1000 diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala index f05b34219..9523ba2bb 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala @@ -126,18 +126,33 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { ) val pBamFile3 = new File(resourcePath("/paired03.bam")) + val BedFile1 = new File(resourcePath("/rrna01.bed")) - val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam") + val RefFlatFile1 = new File(resourcePath("/rrna01.refFlat")) - @Test def testMakeFeatureFromBed() = { + @Test def testMakeIntervalFromBed() = { val intervals: List[Interval] = makeIntervalFromFile(BedFile1) - intervals.length should be(3) + intervals.length shouldBe 3 intervals.head.getSequence should ===("chrQ") - intervals.head.getStart should be(991) - intervals.head.getEnd should be(1000) + intervals.head.getStart shouldBe 991 + intervals.head.getEnd shouldBe 1000 + intervals.last.getSequence should ===("chrQ") + intervals.last.getStart shouldBe 291 + intervals.last.getEnd shouldBe 320 + } + + @Test def testMakeIntervalFromRefFlat() = { + val intervals: List[Interval] = makeIntervalFromFile(RefFlatFile1) + intervals.length shouldBe 5 intervals.last.getSequence should ===("chrQ") - intervals.last.getStart should be(291) - intervals.last.getEnd should be(320) + intervals.last.getStart shouldBe 100 + intervals.last.getEnd shouldBe 200 + intervals(2).getSequence should ===("chrQ") + intervals(2).getStart shouldBe 800 + intervals(2).getEnd shouldBe 1000 + intervals.head.getSequence should ===("chrS") + intervals.head.getStart shouldBe 100 + intervals.head.getEnd shouldBe 500 } @Test def testSingleBamDefault() = { -- GitLab