diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index d5b243f8358bb5caf87be11435d98e17b46e1ead..9bfb15686a0fc82694c94aca0fc91ae7d5df4f95 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools import java.io.File import scala.collection.JavaConverters._ +import scala.io.Source import scala.math.{ max, min } import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink } @@ -73,7 +74,7 @@ object WipeReads extends ToolCommand { * * @param inFile input interval file */ - def makeIntervalFromFile(inFile: File): List[Interval] = { + def makeIntervalFromFile(inFile: File, gtfFeatureType: String = "exon"): List[Interval] = { logger.info("Parsing interval file ...") @@ -83,20 +84,62 @@ object WipeReads extends ToolCommand { .asScala .map(x => new Interval(x.getChr, x.getStart, x.getEnd)) - /** Function to create iterator from refFlat file */ - def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + /** + * Parses a refFlat file to yield Interval objects + * + * Format description: + * http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat + * + * @param inFile input refFlat file + */ + def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = + Source.fromFile(inFile) + // read each line + .getLines() + // skip all empty lines + .filterNot(x => x.trim.isEmpty) + // split per column + .map(line => line.trim.split("\t")) + // take chromosome and exonEnds and exonStars + .map(x => (x(2), x.reverse.take(2))) + // split starts and ends based on comma + .map(x => (x._1, x._2.map(y => y.split(",")))) + // zip exonStarts and exonEnds, note the index was reversed because we did .reverse above + .map(x => (x._1, x._2(1).zip(x._2(0)))) + // make Intervals, accounting for the fact that refFlat coordinates are 0-based + .map(x => x._2.map(y => new Interval(x._1, y._1.toInt + 1, y._2.toInt))) + // flatten sublist + .flatten - /** Function to create iterator from GTF file */ - def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + /** + * Parses a GTF file to yield Interval objects + * + * @param inFile input GTF file + * @return + */ + def makeIntervalFromGtf(inFile: File): Iterator[Interval] = + Source.fromFile(inFile) + // read each line + .getLines() + // skip all empty lines + .filterNot(x => x.trim.isEmpty) + // skip all UCSC track lines and/or ensembl comment lines + .dropWhile(x => x.matches("^track | ^browser | ^#")) + // split to columns + .map(x => x.split("\t")) + // exclude intervals whose type is different from the supplied one + .filter(x => x(2) == gtfFeatureType) + // and finally create the interval objects + .map(x => new Interval(x(0), x(3).toInt, x(4).toInt)) // detect interval file format from extension val iterFunc: (File => Iterator[Interval]) = if (getExtension(inFile.toString.toLowerCase) == "bed") makeIntervalFromBed + else if (getExtension(inFile.toString.toLowerCase) == "refflat") + makeIntervalFromRefFlat + else if (getExtension(inFile.toString.toLowerCase) == "gtf") + makeIntervalFromGtf else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) @@ -107,7 +150,7 @@ object WipeReads extends ToolCommand { acc match { case head :: tail if x.intersects(head) => new Interval(x.getSequence, min(x.getStart, head.getStart), max(x.getEnd, head.getEnd)) :: tail - case _ => x :: acc + case _ => x :: acc } } ) @@ -150,12 +193,10 @@ object WipeReads extends ToolCommand { else if (iv.getSequence.startsWith("chr") && getIndex(iv.getSequence.substring(3)) > -1) { logger.warn("Removing 'chr' prefix from interval " + iv.toString) Some(new QueryInterval(getIndex(iv.getSequence.substring(3)), iv.getStart, iv.getEnd)) - } - else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) { + } else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) { logger.warn("Adding 'chr' prefix to interval " + iv.toString) Some(new QueryInterval(getIndex("chr" + iv.getSequence), iv.getStart, iv.getEnd)) - } - else { + } else { logger.warn("Sequence " + iv.getSequence + " does not exist in alignment") None } @@ -303,6 +344,7 @@ object WipeReads extends ToolCommand { minMapQ: Int = 0, limitToRegion: Boolean = false, noMakeIndex: Boolean = false, + featureType: String = "exon", bloomSize: Long = 70000000, bloomFp: Double = 4e-7) extends AbstractArgs @@ -319,7 +361,7 @@ object WipeReads extends ToolCommand { x => if (x.exists) success else failure("Input BAM file not found") } text "Input BAM file" - opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) => + opt[File]('r', "interval_file") required () valueName "<bed/gtf/refflat>" action { (x, c) => c.copy(targetRegions = x) } validate { x => if (x.exists) success else failure("Target regions file not found") @@ -351,15 +393,21 @@ object WipeReads extends ToolCommand { } text "Whether to index output BAM file or not (default: yes)" - note("\nAdvanced options") + note("\nGTF-only options:") + + opt[String]('t', "feature_type") optional () valueName "<gtf_feature_type>" action { (x, c) => + c.copy(featureType = x) + } text "GTF feature containing intervals (default: exon)" + + note("\nAdvanced options:") opt[Long]("bloom_size") optional () action { (x, c) => c.copy(bloomSize = x) - } text "expected maximum number of reads in target regions (default: 7e7)" + } text "Expected maximum number of reads in target regions (default: 7e7)" opt[Double]("false_positive") optional () action { (x, c) => c.copy(bloomFp = x) - } text "false positive rate (default: 4e-7)" + } text "False positive rate (default: 4e-7)" note( """ @@ -386,7 +434,7 @@ object WipeReads extends ToolCommand { // cannot use SamReader as inBam directly since it only allows one active iterator at any given time val filterFunc = makeFilterNotFunction( - ivl = makeIntervalFromFile(commandArgs.targetRegions), + ivl = makeIntervalFromFile(commandArgs.targetRegions, gtfFeatureType = commandArgs.featureType), inBam = commandArgs.inputBam, filterOutMulti = !commandArgs.limitToRegion, minMapQ = commandArgs.minMapQ, @@ -402,4 +450,4 @@ object WipeReads extends ToolCommand { commandArgs.filteredOutBam.map(x => prepOutBam(x, commandArgs.inputBam, writeIndex = !commandArgs.noMakeIndex)) ) } -} +} \ No newline at end of file diff --git a/biopet-framework/src/test/resources/rrna01.gtf b/biopet-framework/src/test/resources/rrna01.gtf new file mode 100644 index 0000000000000000000000000000000000000000..9065a12a2716a621082ddab947deded5287585f0 --- /dev/null +++ b/biopet-framework/src/test/resources/rrna01.gtf @@ -0,0 +1,10 @@ +chrQ ensembl gene 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl transcript 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl exon 669 778 . - . gene_id "ENSG00000252956"; +chrQ ensembl gene 184 284 . - . gene_id "ENSG00000222952"; +chrQ ensembl transcript 184 284 . - . gene_id "ENSG00000222952"; +chrQ ensembl exon 184 284 . - . gene_id "ENSG00000222952"; +chrP ensembl gene 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl transcript 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl exon 2949 3063 . + . gene_id "ENSG00000201148"; +chrP ensembl gene 677 786 . - . gene_id "ENSG00000252368"; diff --git a/biopet-framework/src/test/resources/rrna01.refFlat b/biopet-framework/src/test/resources/rrna01.refFlat new file mode 100644 index 0000000000000000000000000000000000000000..30e68e91c16fcdbe6ec252fb8631409df4d44d43 --- /dev/null +++ b/biopet-framework/src/test/resources/rrna01.refFlat @@ -0,0 +1,3 @@ +gene1 trans1 chrS + 100 500 100 500 1 100, 500, +gene1 trans1 chrR + 100 500 200 400 1 100, 500, +gene1 trans1 chrQ + 100 1000 200 900 3 100,300,800 200,400,1000 diff --git a/biopet-framework/src/test/resources/rrna02.bed b/biopet-framework/src/test/resources/rrna02.bed new file mode 100644 index 0000000000000000000000000000000000000000..191138d56b2f30f42f1dfe3a26769777e51e04d3 --- /dev/null +++ b/biopet-framework/src/test/resources/rrna02.bed @@ -0,0 +1,6 @@ +chrQ 300 350 rRNA03 0 + +chrQ 350 400 rRNA03 0 + +chrQ 450 480 rRNA02 0 - +chrQ 470 475 rRNA04 0 - +chrQ 1 200 rRNA01 0 . +chrQ 150 250 rRNA01 0 . diff --git a/biopet-framework/src/test/resources/single05.bam b/biopet-framework/src/test/resources/single05.bam new file mode 100644 index 0000000000000000000000000000000000000000..e9fab38e6a945bf942921353d4a443cee881de2e Binary files /dev/null and b/biopet-framework/src/test/resources/single05.bam differ diff --git a/biopet-framework/src/test/resources/single05.bam.bai b/biopet-framework/src/test/resources/single05.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..7a081f4700900bd7f3da6634d39b676b5547130a Binary files /dev/null and b/biopet-framework/src/test/resources/single05.bam.bai differ diff --git a/biopet-framework/src/test/resources/single05.sam b/biopet-framework/src/test/resources/single05.sam new file mode 100644 index 0000000000000000000000000000000000000000..b5fe7ceae436ea67e0a7eb44b5f2b7b051cf72bf --- /dev/null +++ b/biopet-framework/src/test/resources/single05.sam @@ -0,0 +1,9 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chrQ LN:10000 +@SQ SN:chrR LN:10000 +@RG ID:001 DS:single-end reads SM:WipeReadsTestCase +r02 16 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001 +r04 0 chrQ 500 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:001 +r01 0 chrR 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001 +r03 16 chrR 500 60 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:001 +r05 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001 diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala index 772ed62950b4d08074ce535db339a4865fb69db3..16cfabe816af4fa8deee1b295a88da68f659c3bf 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala @@ -35,6 +35,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { private val samP: SAMLineParser = { val samh = new SAMFileHeader samh.addSequence(new SAMSequenceRecord("chrQ", 10000)) + samh.addSequence(new SAMSequenceRecord("chrR", 10000)) samh.addReadGroup(new SAMReadGroupRecord("001")) samh.addReadGroup(new SAMReadGroupRecord("002")) new SAMLineParser(samh) @@ -43,12 +44,6 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { private def makeSams(raws: String*): Seq[SAMRecord] = raws.map(s => samP.parseLine(s)) - private def makeTempBam(): File = - File.createTempFile("WipeReads", java.util.UUID.randomUUID.toString + ".bam") - - private def makeTempBamIndex(bam: File): File = - new File(bam.getAbsolutePath.stripSuffix(".bam") + ".bai") - private def makeSamReader(f: File): SamReader = SamReaderFactory .make() .validationStringency(ValidationStringency.LENIENT) @@ -83,6 +78,15 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { val sBamFile3 = new File(resourcePath("/single03.bam")) val sBamFile4 = new File(resourcePath("/single04.bam")) + val sBamFile5 = new File(resourcePath("/single05.bam")) + val sBamRecs5 = makeSams( + "r02\t16\tchrR\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", + "r04\t0\tchrQ\t500\t60\t10M\t*\t0\t0\tCGTACGTACG\tEEFFGGHHII\tRG:Z:001", + "r01\t0\tchrR\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", + "r03\t16\tchrQ\t500\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001", + "r05\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001" + ) + val pBamFile1 = new File(resourcePath("/paired01.bam")) val pBamRecs1 = makeSams( "r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", @@ -116,18 +120,64 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { ) val pBamFile3 = new File(resourcePath("/paired03.bam")) + val BedFile1 = new File(resourcePath("/rrna01.bed")) - val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam") + val BedFile2 = new File(resourcePath("/rrna02.bed")) + val RefFlatFile1 = new File(resourcePath("/rrna01.refFlat")) + val GtfFile1 = new File(resourcePath("/rrna01.gtf")) - @Test def testMakeFeatureFromBed() = { + @Test def testMakeIntervalFromUnknown() = { + val thrown = intercept[IllegalArgumentException] { + makeIntervalFromFile(new File("false.bam")) + } + thrown.getMessage should ===("Unexpected interval file type: false.bam") + } + + @Test def testMakeIntervalFromBed() = { val intervals: List[Interval] = makeIntervalFromFile(BedFile1) - intervals.length should be(3) + intervals.length shouldBe 3 + intervals.head.getSequence should ===("chrQ") + intervals.head.getStart shouldBe 991 + intervals.head.getEnd shouldBe 1000 + intervals.last.getSequence should ===("chrQ") + intervals.last.getStart shouldBe 291 + intervals.last.getEnd shouldBe 320 + } + + @Test def testMakeIntervalFromRefFlat() = { + val intervals: List[Interval] = makeIntervalFromFile(RefFlatFile1) + intervals.length shouldBe 5 + intervals.head.getSequence should ===("chrS") + intervals.head.getStart shouldBe 101 + intervals.head.getEnd shouldBe 500 + intervals(2).getSequence should ===("chrQ") + intervals(2).getStart shouldBe 801 + intervals(2).getEnd shouldBe 1000 + intervals.last.getSequence should ===("chrQ") + intervals.last.getStart shouldBe 101 + intervals.last.getEnd shouldBe 200 + } + + @Test def testMakeIntervalFromGtf() = { + val intervals: List[Interval] = makeIntervalFromFile(GtfFile1, "exon") + intervals.length shouldBe 3 intervals.head.getSequence should ===("chrQ") - intervals.head.getStart should be(991) - intervals.head.getEnd should be(1000) + intervals.head.getStart shouldBe 669 + intervals.head.getEnd shouldBe 778 + intervals.last.getSequence should ===("chrP") + intervals.last.getStart shouldBe 2949 + intervals.last.getEnd shouldBe 3063 + } + + @Test def testMakeIntervalFromBedOverlap() = { + val intervals: List[Interval] = makeIntervalFromFile(BedFile2) + intervals.length shouldBe 4 + intervals.head.getSequence should ===("chrQ") + intervals.head.getStart shouldBe 451 + intervals.head.getEnd shouldBe 480 intervals.last.getSequence should ===("chrQ") - intervals.last.getStart should be(291) - intervals.last.getEnd should be(320) + intervals.last.getStart shouldBe 2 + intervals.last.getEnd shouldBe 250 } @Test def testSingleBamDefault() = { @@ -196,6 +246,19 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { filterNotFunc(sBamRecs1(6)) shouldBe false } + @Test def testSingleBamDifferentChromosomes() = { + val intervals: List[Interval] = List( + new Interval("chrQ", 50, 55), + new Interval("chrR", 500, 505) + ) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile5, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs5(0)) shouldBe true + filterNotFunc(sBamRecs5(1)) shouldBe false + filterNotFunc(sBamRecs5(2)) shouldBe false + filterNotFunc(sBamRecs5(3)) shouldBe true + filterNotFunc(sBamRecs5(4)) shouldBe false + } + @Test def testSingleBamFilterOutMultiNotSet() = { val intervals: List[Interval] = List( new Interval("chrQ", 291, 320), // overlaps r01, second hit, @@ -244,7 +307,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { filterNotFunc(sBamRecs2(2)) shouldBe false filterNotFunc(sBamRecs2(3)) shouldBe true filterNotFunc(sBamRecs2(4)) shouldBe true - // this r07 is not in since filterOuMulti is false + // this r07 is not in since filterOutMulti is false filterNotFunc(sBamRecs2(5)) shouldBe false filterNotFunc(sBamRecs2(6)) shouldBe false filterNotFunc(sBamRecs2(7)) shouldBe false @@ -460,6 +523,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { "-G", "002", "--limit_removal", "--no_make_index", + "--feature_type", "gene", "--bloom_size", "10000", "--false_positive", "1e-8" )) @@ -472,6 +536,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers { parsed.readGroupIds should contain("002") parsed.limitToRegion shouldBe true parsed.noMakeIndex shouldBe true + parsed.featureType should ===("gene") parsed.bloomSize shouldBe 10000 parsed.bloomFp shouldBe 1e-8 }