diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index 704b203ec5953b1b9143d8fc14a6d80b845e352c..599d12f5bea8d5732cbebe2eacf2a7452603819d 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -74,7 +74,7 @@ object WipeReads extends ToolCommand { * * @param inFile input interval file */ - def makeIntervalFromFile(inFile: File): List[Interval] = { + def makeIntervalFromFile(inFile: File, gtfFeatureType: String = "exon"): List[Interval] = { logger.info("Parsing interval file ...") @@ -111,8 +111,26 @@ object WipeReads extends ToolCommand { // flatten sublist .flatten - /** Function to create iterator from GTF file */ - def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ??? + /** + * Parses a GTF file to yield Interval objects + * + * @param inFile input GTF file + * @return + */ + def makeIntervalFromGtf(inFile: File): Iterator[Interval] = + Source.fromFile(inFile) + // read each line + .getLines() + // skip all empty lines + .filterNot(x => x.trim.isEmpty) + // skip all UCSC track lines and/or ensembl comment lines + .dropWhile(x => x.matches("^track | ^browser | ^#")) + // split to columns + .map(x => x.split("\t")) + // exclude intervals whose type is different from the supplied one + .filter(x => x(2) == gtfFeatureType) + // and finally create the interval objects + .map(x => new Interval(x(0), x(3).toInt, x(4).toInt)) // detect interval file format from extension val iterFunc: (File => Iterator[Interval]) = @@ -120,6 +138,8 @@ object WipeReads extends ToolCommand { makeIntervalFromBed else if (getExtension(inFile.toString.toLowerCase) == "refflat") makeIntervalFromRefFlat + else if (getExtension(inFile.toString.toLowerCase) == "gtf") + makeIntervalFromGtf else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) @@ -130,7 +150,7 @@ object WipeReads extends ToolCommand { acc match { case head :: tail if x.intersects(head) => new Interval(x.getSequence, min(x.getStart, head.getStart), max(x.getEnd, head.getEnd)) :: tail - case _ => x :: acc + case _ => x :: acc } } ) @@ -173,12 +193,10 @@ object WipeReads extends ToolCommand { else if (iv.getSequence.startsWith("chr") && getIndex(iv.getSequence.substring(3)) > -1) { logger.warn("Removing 'chr' prefix from interval " + iv.toString) Some(new QueryInterval(getIndex(iv.getSequence.substring(3)), iv.getStart, iv.getEnd)) - } - else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) { + } else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) { logger.warn("Adding 'chr' prefix to interval " + iv.toString) Some(new QueryInterval(getIndex("chr" + iv.getSequence), iv.getStart, iv.getEnd)) - } - else { + } else { logger.warn("Sequence " + iv.getSequence + " does not exist in alignment") None } @@ -326,6 +344,7 @@ object WipeReads extends ToolCommand { minMapQ: Int = 0, limitToRegion: Boolean = false, noMakeIndex: Boolean = false, + featureType: String = "exon", bloomSize: Long = 70000000, bloomFp: Double = 4e-7) extends AbstractArgs @@ -342,7 +361,7 @@ object WipeReads extends ToolCommand { x => if (x.exists) success else failure("Input BAM file not found") } text "Input BAM file" - opt[File]('r', "interval_file") required () valueName "<bed/refflat>" action { (x, c) => + opt[File]('r', "interval_file") required () valueName "<bed/gtf/refflat>" action { (x, c) => c.copy(targetRegions = x) } validate { x => if (x.exists) success else failure("Target regions file not found") @@ -374,7 +393,13 @@ object WipeReads extends ToolCommand { } text "Whether to index output BAM file or not (default: yes)" - note("\nAdvanced options") + note("\nGTF-only options:") + + opt[String]('t', "feature_type") optional () valueName "<gtf_feature_type>" action { (x, c) => + c.copy(featureType = x) + } text "GTF feature containing intervals (default: exon)" + + note("\nAdvanced options:") opt[Long]("bloom_size") optional () action { (x, c) => c.copy(bloomSize = x) @@ -409,7 +434,7 @@ object WipeReads extends ToolCommand { // cannot use SamReader as inBam directly since it only allows one active iterator at any given time val filterFunc = makeFilterNotFunction( - ivl = makeIntervalFromFile(commandArgs.targetRegions), + ivl = makeIntervalFromFile(commandArgs.targetRegions, gtfFeatureType = commandArgs.featureType), inBam = commandArgs.inputBam, filterOutMulti = !commandArgs.limitToRegion, minMapQ = commandArgs.minMapQ, diff --git a/biopet-framework/src/test/resources/rrna01.gtf b/biopet-framework/src/test/resources/rrna01.gtf new file mode 100644 index 0000000000000000000000000000000000000000..9065a12a2716a621082ddab947deded5287585f0 --- /dev/null +++ b/biopet-framework/src/test/resources/rrna01.gtf @@ -0,0 +1,10 @@ +chrQ ensembl gene 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl transcript 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl exon 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl gene 184 284 . - . gene_id "ENSG00000222952"; +chrQ ensembl transcript 184 284 . - . gene_id "ENSG00000222952"; +chrQ ensembl exon 184 284 . - . gene_id "ENSG00000222952"; +chrP ensembl gene 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl transcript 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl exon 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl gene 677 786 . - . gene_id "ENSG00000252368"; diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala index 48f8a1c8daaea149f690a614ed05f8355fb73346..8b7cb2ac8ea68c1d167899658d918867b932bb3b 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala @@ -123,6 +123,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { val BedFile1 = new File(resourcePath("/rrna01.bed")) val RefFlatFile1 = new File(resourcePath("/rrna01.refFlat")) + val GtfFile1 = new File(resourcePath("/rrna01.gtf")) @Test def testMakeIntervalFromBed() = { val intervals: List[Interval] = makeIntervalFromFile(BedFile1) @@ -149,6 +150,17 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { intervals.last.getEnd shouldBe 200 } + @Test def testMakeIntervalFromGtf() = { + val intervals: List[Interval] = makeIntervalFromFile(GtfFile1, "exon") + intervals.length shouldBe 3 + intervals.head.getSequence should ===("chrQ") + intervals.head.getStart shouldBe 669 + intervals.head.getEnd shouldBe 778 + intervals.last.getSequence should ===("chrP") + intervals.last.getStart shouldBe 2949 + intervals.last.getEnd shouldBe 3063 + } + @Test def testSingleBamDefault() = { val intervals: List[Interval] = List( new Interval("chrQ", 291, 320), // overlaps r01, second hit, @@ -492,6 +504,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { "-G", "002", "--limit_removal", "--no_make_index", + "--feature_type", "gene", "--bloom_size", "10000", "--false_positive", "1e-8" )) @@ -504,6 +517,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { parsed.readGroupIds should contain("002") parsed.limitToRegion shouldBe true parsed.noMakeIndex shouldBe true + parsed.featureType should ===("gene") parsed.bloomSize shouldBe 10000 parsed.bloomFp shouldBe 1e-8 }