Commit d502cf63 authored by bow's avatar bow
Browse files

Refactor query interval builder for fewer BAM objects

parent a748e6fa
...@@ -36,15 +36,15 @@ object WipeReads { ...@@ -36,15 +36,15 @@ object WipeReads {
type OptionMap = Map[String, Any] type OptionMap = Map[String, Any]
case class RawInterval(chrom: String, start: Int, end: Int, strand: String)
object Strand extends Enumeration { object Strand extends Enumeration {
type Strand = Value type Strand = Value
val Identical, Opposite, Both = Value val Identical, Opposite, Both = Value
} }
// TODO: check that interval chrom is in the BAM file (optionally, when prepended with 'chr' too) // TODO: check that interval chrom is in the BAM file (optionally, when prepended with 'chr' too)
def makeQueryIntervalFromFile(inFile: File, inBAM: SAMFileReader): Iterator[QueryInterval] = { def makeRawIntervalFromFile(inFile: File): Iterator[RawInterval] = {
case class RawInterval(chrom: String, start: Int, end: Int, strand: String)
def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] = def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] =
// BED file coordinates are 0-based, half open so we need to do some conversion // BED file coordinates are 0-based, half open so we need to do some conversion
...@@ -73,12 +73,10 @@ object WipeReads { ...@@ -73,12 +73,10 @@ object WipeReads {
throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
iterFunc(inFile) iterFunc(inFile)
.filter(x => inBAM.getFileHeader.getSequenceIndex(x.chrom) > -1)
.map(x => inBAM.makeQueryInterval(x.chrom, x.start, x.end))
} }
// TODO: set minimum fraction for overlap // TODO: set minimum fraction for overlap
def makeBloomFilter(iv: Iterator[QueryInterval], def makeBloomFilter(iv: Iterator[RawInterval],
inBAM: File, inBAMIndex: File = null, inBAM: File, inBAMIndex: File = null,
filterOutMulti: Boolean = true, filterOutMulti: Boolean = true,
minMapQ: Int = 0, readGroupIDs: Set[String] = Set(), minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
...@@ -96,6 +94,20 @@ object WipeReads { ...@@ -96,6 +94,20 @@ object WipeReads {
sfr sfr
} }
// create objects for querying intervals, allowing for
// chromosome names with or without a "chr" prefix
def monadicMakeQueryInterval(inBAM: SAMFileReader, ri: RawInterval): Option[QueryInterval] =
if (inBAM.getFileHeader.getSequenceIndex(ri.chrom) > -1)
Some(inBAM.makeQueryInterval(ri.chrom, ri.start, ri.end))
else if (ri.chrom.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex(ri.chrom.substring(3)) > -1)
Some(inBAM.makeQueryInterval(ri.chrom.substring(3), ri.start, ri.end))
else if (!ri.chrom.startsWith("chr")
&& inBAM.getFileHeader.getSequenceIndex("chr" + ri.chrom) > -1)
Some(inBAM.makeQueryInterval("chr" + ri.chrom, ri.start, ri.end))
else
None
// TODO: can we accumulate errors / exceptions instead of ignoring them? // TODO: can we accumulate errors / exceptions instead of ignoring them?
def monadicMateQuery(inBAM: SAMFileReader, rec: SAMRecord): Option[SAMRecord] = def monadicMateQuery(inBAM: SAMFileReader, rec: SAMRecord): Option[SAMRecord] =
try { try {
...@@ -121,8 +133,9 @@ object WipeReads { ...@@ -121,8 +133,9 @@ object WipeReads {
val firstBAM = prepIndexedInputBAM() val firstBAM = prepIndexedInputBAM()
val secondBAM = prepIndexedInputBAM() val secondBAM = prepIndexedInputBAM()
val bfm = BloomFilter(bloomSize, bloomFp, 13) val bfm = BloomFilter(bloomSize, bloomFp, 13)
val intervals = iv.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray
firstBAM.queryOverlapping(iv.toArray).asScala firstBAM.queryOverlapping(intervals).asScala
// filter for MAPQ on target region reads // filter for MAPQ on target region reads
.filter(x => x.getMappingQuality >= minMapQ) .filter(x => x.getMappingQuality >= minMapQ)
// filter on specific read group IDs // filter on specific read group IDs
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment