Commit 2594289d authored by bow
Browse files

Use orestes.bloomfilter instead of com.twitter.algebird for Bloom filter implementation

Conflicts:
	biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/apps/WipeReads.scala
parent e1444972
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
<name>BioJava repository</name> <name>BioJava repository</name>
<url>http://www.biojava.org/download/maven/</url> <url>http://www.biojava.org/download/maven/</url>
</repository> </repository>
<repository>
<id>orestes-bloom-filter</id>
<url>https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo</url>
</repository>
</repositories> </repositories>
<dependencies> <dependencies>
<dependency> <dependency>
...@@ -62,9 +66,9 @@ ...@@ -62,9 +66,9 @@
<version>3.1.0</version> <version>3.1.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.twitter</groupId> <groupId>com.baqend</groupId>
<artifactId>algebird-core_2.10</artifactId> <artifactId>bloom-filter</artifactId>
<version>0.8.1</version> <version>1.02</version>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>
......
...@@ -8,13 +8,14 @@ import java.io.{ File, IOException } ...@@ -8,13 +8,14 @@ import java.io.{ File, IOException }
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
import com.twitter.algebird.{ BF, BloomFilter, BloomFilterMonoid }
import htsjdk.samtools.AlignmentBlock import htsjdk.samtools.AlignmentBlock
import htsjdk.samtools.SAMFileReader import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMFileWriterFactory import htsjdk.samtools.SAMFileWriterFactory
import htsjdk.samtools.SAMRecord import htsjdk.samtools.SAMRecord
import htsjdk.tribble.index.interval.{ Interval, IntervalTree } import htsjdk.tribble.index.interval.{ Interval, IntervalTree }
import orestes.bloomfilter.HashProvider.HashMethod
import orestes.bloomfilter.{ BloomFilter, FilterBuilder }
import org.apache.commons.io.FilenameUtils.getExtension import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
...@@ -276,19 +277,6 @@ object WipeReads extends MainCommand { ...@@ -276,19 +277,6 @@ object WipeReads extends MainCommand {
val firstBAM = prepIndexedInputBAM() val firstBAM = prepIndexedInputBAM()
val secondBAM = prepIndexedInputBAM() val secondBAM = prepIndexedInputBAM()
val bfm = BloomFilter(bloomSize, bloomFp, 13)
/** Function to make a BloomFilter containing one element from the SAMRecord */
def makeBFFromSAM(rec: SAMRecord, bfm: BloomFilterMonoid): BF = {
if (filterOutMulti)
bfm.create(rec.getReadName)
else if (!rec.getReadPairedFlag)
bfm.create(SAMToElem(rec))
else
// to bypass querying for each mate, we store the records that the mate also has
// namely, the read name and the alignment start
bfm.create(SAMToElem(rec), rec.getReadName + "_" + rec.getMateAlignmentStart)
}
/* NOTE: the interval vector here should be bypass-able if we can make /* NOTE: the interval vector here should be bypass-able if we can make
the BAM query intervals with Interval objects. This is not possible the BAM query intervals with Interval objects. This is not possible
...@@ -305,27 +293,36 @@ object WipeReads extends MainCommand { ...@@ -305,27 +293,36 @@ object WipeReads extends MainCommand {
.sortBy(x => (x.referenceIndex, x.start, x.end)) .sortBy(x => (x.referenceIndex, x.start, x.end))
.toArray .toArray
val filteredOutSet: BF = firstBAM.queryOverlapping(queryIntervals).asScala val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala
// ensure spliced reads have at least one block overlapping target region // ensure spliced reads have at least one block overlapping target region
.filter(x => alignmentBlockOverlaps(x, intervalTreeMap)) .filter(x => alignmentBlockOverlaps(x, intervalTreeMap))
// filter for MAPQ on target region reads // filter for MAPQ on target region reads
.filter(x => x.getMappingQuality >= minMapQ) .filter(x => x.getMappingQuality >= minMapQ)
// filter on specific read group IDs // filter on specific read group IDs
.filter(x => rgFilter(x)) .filter(x => rgFilter(x))
// transform SAMRecord to string
.map(x => makeBFFromSAM(x, bfm)) val filteredOutSet: BloomFilter[String] = new FilterBuilder(bloomSize, bloomFp)
// build bloom filter using fold to prevent loading all strings to memory .hashFunction(HashMethod.Murmur3KirschMitzenmacher)
.foldLeft(bfm.create())(_.++(_)) .buildBloomFilter()
for (rec <- filteredRecords) {
if ((!filterOutMulti) && rec.getReadPairedFlag) {
filteredOutSet.add(SAMToElem(rec))
filteredOutSet.add(rec.getReadName + "_" + rec.getMateAlignmentStart.toString)
}
else
filteredOutSet.add(SAMToElem(rec))
}
if (filterOutMulti) if (filterOutMulti)
(rec: SAMRecord) => filteredOutSet.contains(rec.getReadName).isTrue (rec: SAMRecord) => filteredOutSet.contains(rec.getReadName)
else else
(rec: SAMRecord) => { (rec: SAMRecord) => {
if (rec.getReadPairedFlag) if (rec.getReadPairedFlag)
filteredOutSet.contains(rec.getReadName + "_" + rec.getAlignmentStart).isTrue && filteredOutSet.contains(rec.getReadName + "_" + rec.getAlignmentStart) &&
filteredOutSet.contains(rec.getReadName + "_" + rec.getMateAlignmentStart).isTrue filteredOutSet.contains(rec.getReadName + "_" + rec.getMateAlignmentStart)
else else
filteredOutSet.contains(rec.getReadName + "_" + rec.getAlignmentStart).isTrue filteredOutSet.contains(rec.getReadName + "_" + rec.getAlignmentStart)
} }
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment