Commit 46cfaf0e authored by bow's avatar bow
Browse files

Create interval tree map lazily in WipeReads

parent d5e6eb8c
...@@ -170,6 +170,18 @@ object WipeReads { ...@@ -170,6 +170,18 @@ object WipeReads {
} }
} }
/** function to make IntervalTree from our RawInterval objects
*
* @param ri iterable over RawInterval objects
* @return IntervalTree objects, filled with intervals from RawInterval
*/
def makeIntervalTree(ri: Iterable[RawInterval]): IntervalTree = {
val ivt = new IntervalTree
for (iv: RawInterval <- ri)
ivt.insert(new Interval(iv.start, iv.end))
ivt
}
/** /**
* Function to ensure that a SAMRecord overlaps our target regions * Function to ensure that a SAMRecord overlaps our target regions
* *
...@@ -187,7 +199,7 @@ object WipeReads { ...@@ -187,7 +199,7 @@ object WipeReads {
if (rec.getSAMString.split("\t")(5).contains("N")) { if (rec.getSAMString.split("\t")(5).contains("N")) {
for (ab: AlignmentBlock <- rec.getAlignmentBlocks.asScala) { for (ab: AlignmentBlock <- rec.getAlignmentBlocks.asScala) {
if (!ivtm(rec.getReferenceName).findOverlapping( if (!ivtm(rec.getReferenceName).findOverlapping(
new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1, null)).isEmpty) new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1)).isEmpty)
return true return true
} }
false false
...@@ -212,25 +224,28 @@ object WipeReads { ...@@ -212,25 +224,28 @@ object WipeReads {
val secondBAM = prepIndexedInputBAM() val secondBAM = prepIndexedInputBAM()
val bfm = BloomFilter(bloomSize, bloomFp, 13) val bfm = BloomFilter(bloomSize, bloomFp, 13)
val intervals = iv.toList /* NOTE: the interval vector here should be bypass-able if we can make
val queryIntervals = intervals.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray the BAM query intervals with Interval objects. This is not possible
at the moment since we can not retrieve star and end coordinates
/** interval tree for ensuring that split reads do overlap our target regions */ of an Interval, so we resort to our own RawInterval vector
val ivtm = scala.collection.mutable.Map.empty[String, IntervalTree] */
for (iv: RawInterval <- intervals) { lazy val intervals = iv.toVector
ivtm.getOrElseUpdate(iv.chrom, new IntervalTree).insert(new Interval(iv.start, iv.end)) lazy val intervalTreeMap: Map[String, IntervalTree] = intervals
} .groupBy(x => x.chrom)
.map({ case (key, value) => (key, makeIntervalTree(value)) })
lazy val queryIntervals = intervals
.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray
val filteredOutSet: BF = firstBAM.queryOverlapping(queryIntervals).asScala val filteredOutSet: BF = firstBAM.queryOverlapping(queryIntervals).asScala
// ensure spliced reads have at least one block overlapping target region // ensure spliced reads have at least one block overlapping target region
.filter(x => alignmentBlockOverlaps(x, ivtm.toMap)) .filter(x => alignmentBlockOverlaps(x, intervalTreeMap))
// filter for MAPQ on target region reads // filter for MAPQ on target region reads
.filter(x => x.getMappingQuality >= minMapQ) .filter(x => x.getMappingQuality >= minMapQ)
// filter on specific read group IDs // filter on specific read group IDs
.filter(x => rgFilter(x)) .filter(x => rgFilter(x))
// TODO: how to directly get SAMRecord and its pairs without multiple flattens? // TODO: how to directly get SAMRecord and its pairs without multiple flattens?
.flatMap(x => Vector(Some(x), monadicMateQuery(secondBAM, x)).flatten) .flatMap(x => Vector(Some(x), monadicMateQuery(secondBAM, x)).flatten)
// transfrom SAMRecord to string // transform SAMRecord to string
.map(x => SAMToElem(x)) .map(x => SAMToElem(x))
// build bloom filter using fold to prevent loading all strings to memory // build bloom filter using fold to prevent loading all strings to memory
.foldLeft(bfm.create())(_.+(_)) .foldLeft(bfm.create())(_.+(_))
......
...@@ -35,7 +35,7 @@ class WipeReadsUnitTest extends Assertions { ...@@ -35,7 +35,7 @@ class WipeReadsUnitTest extends Assertions {
val bed01 = new File(resourcePath("/rrna01.bed")) val bed01 = new File(resourcePath("/rrna01.bed"))
val minArgList = List("-I", sbam01.toString, "-l", bed01.toString, "-o", "mock.bam") val minArgList = List("-I", sbam01.toString, "-l", bed01.toString, "-o", "mock.bam")
@Test def rawIntervalBED() = { @Test def testMakeRawIntervalFromBED() = {
val intervals: Vector[RawInterval] = makeRawIntervalFromFile(bed01).toVector val intervals: Vector[RawInterval] = makeRawIntervalFromFile(bed01).toVector
assert(intervals.length == 3) assert(intervals.length == 3)
assert(intervals.head.chrom == "chrQ") assert(intervals.head.chrom == "chrQ")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment