Commit d5e6eb8c authored by bow's avatar bow
Browse files

Update WipeReads to take splicing structure into account when finding overlaps

parent c8e494d2
......@@ -9,10 +9,12 @@ import scala.collection.JavaConverters._
import scala.io.Source
import com.twitter.algebird.{ BF, BloomFilter }
import htsjdk.samtools.AlignmentBlock
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMFileWriterFactory
import htsjdk.samtools.SAMRecord
import htsjdk.tribble.index.interval.{ Interval, IntervalTree }
import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
......@@ -168,6 +170,30 @@ object WipeReads {
}
}
/**
* Function to ensure that a SAMRecord overlaps our target regions
*
* This is required because htsjdk's queryOverlap method does not take into
* account the SAMRecord splicing structure
*
* @param rec SAMRecord to check
* @param ivtm mutable mapping of a chromosome and its interval tree
* @return
*/
def alignmentBlockOverlaps(rec: SAMRecord, ivtm: Map[String, IntervalTree]): Boolean =
// if SAMRecord is not spliced, assume queryOverlap has done its job
// otherwise check for alignment block overlaps in our interval list
// using raw SAMString to bypass cigar string decoding
if (rec.getSAMString.split("\t")(5).contains("N")) {
for (ab: AlignmentBlock <- rec.getAlignmentBlocks.asScala) {
if (!ivtm(rec.getReferenceName).findOverlapping(
new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1, null)).isEmpty)
return true
}
false
} else
true
/** filter function for read IDs */
val rgFilter =
if (readGroupIDs.size == 0)
......@@ -185,8 +211,19 @@ object WipeReads {
val firstBAM = prepIndexedInputBAM()
val secondBAM = prepIndexedInputBAM()
val bfm = BloomFilter(bloomSize, bloomFp, 13)
val intervals = iv.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray
val filteredOutSet: BF = firstBAM.queryOverlapping(intervals).asScala
val intervals = iv.toList
val queryIntervals = intervals.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray
/** interval tree for ensuring that split reads do overlap our target regions */
val ivtm = scala.collection.mutable.Map.empty[String, IntervalTree]
for (iv: RawInterval <- intervals) {
ivtm.getOrElseUpdate(iv.chrom, new IntervalTree).insert(new Interval(iv.start, iv.end))
}
val filteredOutSet: BF = firstBAM.queryOverlapping(queryIntervals).asScala
// ensure spliced reads have at least one block overlapping target region
.filter(x => alignmentBlockOverlaps(x, ivtm.toMap))
// filter for MAPQ on target region reads
.filter(x => x.getMappingQuality >= minMapQ)
// filter on specific read group IDs
......
......@@ -61,15 +61,38 @@ class WipeReadsUnitTest extends Assertions {
// by default, set elements are SAM record read names
assert(!isFilteredOut("r02"))
assert(!isFilteredOut("r03"))
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!isFilteredOut("r05"))
*/
assert(!isFilteredOut("r06"))
assert(isFilteredOut("r01"))
assert(isFilteredOut("r04"))
}
@Test def testSingleBAMDefaultPartialExonOverlap() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 881, 1000, "+") // overlaps first exon of r05
)
val isFilteredOut = makeFilterOutFunction(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10)
assert(!isFilteredOut("r01"))
assert(!isFilteredOut("r02"))
assert(!isFilteredOut("r03"))
assert(!isFilteredOut("r04"))
assert(!isFilteredOut("r06"))
assert(isFilteredOut("r05"))
}
@Test def testSingleBAMDefaultNoExonOverlap() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrP", 881, 1000, "+")
)
val isFilteredOut = makeFilterOutFunction(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10)
assert(!isFilteredOut("r01"))
assert(!isFilteredOut("r02"))
assert(!isFilteredOut("r03"))
assert(!isFilteredOut("r04"))
assert(!isFilteredOut("r06"))
assert(!isFilteredOut("r05"))
}
@Test def testSingleBAMFilterOutMultiNotSet() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"), // overlaps r01, second hit,
......@@ -85,10 +108,7 @@ class WipeReadsUnitTest extends Assertions {
assert(!isFilteredOut("r06\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001\n"))
assert(isFilteredOut("r01\t16\tchrQ\t290\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001\n"))
assert(isFilteredOut("r04\t0\tchrQ\t450\t60\t10M\t*\t0\t0\tCGTACGTACG\tEEFFGGHHII\tRG:Z:001\n"))
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!isFilteredOut("r05\t0\tchrQ\t890\t60\t5M200N5M\t*\t0\t0\tGATACGATAC\tFEFEFEFEFE\tRG:Z:001\n"))
*/
}
@Test def testSingleBAMFilterMinMapQ() = {
......@@ -148,18 +168,27 @@ class WipeReadsUnitTest extends Assertions {
RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
)
val isFilteredOut = makeFilterOutFunction(intervals, pbam01, bloomSize = 1000, bloomFp = 1e-10)
// by default, set elements are SAM record read names
assert(!isFilteredOut("r02"))
assert(!isFilteredOut("r03"))
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!isFilteredOut("r05"))
*/
assert(!isFilteredOut("r06"))
assert(isFilteredOut("r01"))
assert(isFilteredOut("r04"))
}
@Test def testPairBAMPartialExonOverlap() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 891, 1000, "+")
)
val isFilteredOut = makeFilterOutFunction(intervals, pbam01, bloomSize = 1000, bloomFp = 1e-10)
assert(!isFilteredOut("r01"))
assert(!isFilteredOut("r02"))
assert(!isFilteredOut("r03"))
assert(!isFilteredOut("r04"))
assert(!isFilteredOut("r06"))
assert(isFilteredOut("r05"))
}
@Test def testPairBAMFilterOutMultiNotSet() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"), // overlaps r01, second hit,
......@@ -180,10 +209,8 @@ class WipeReadsUnitTest extends Assertions {
assert(isFilteredOut("r01\t83\tchrQ\t290\t60\t10M\t=\t250\t-50\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001\n"))
assert(isFilteredOut("r04\t99\tchrQ\t450\t60\t10M\t=\t490\t50\tCGTACGTACG\tEEFFGGHHII\tRG:Z:001\n"))
assert(isFilteredOut("r04\t147\tchrQ\t490\t60\t10M\t=\t450\t-50\tGCATGCATGC\tEEFFGGHHII\tRG:Z:001\n"))
/* TODO: exclude r05 from set
assert(!isFilteredOut("r05\t99\tchrQ\t850\t60\t5M100N5M\t=\t1140\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))
assert(!isFilteredOut("r05\t147\tchrQ\t1140\t60\t10M\t=\t850\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001\n"))
*/
}
@Test def testPairBAMFilterMinMapQ() = {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment