diff --git a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml b/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml deleted file mode 100644 index a08d14f4eb8ef4a4d8eb3ab6ef242b63e7000b28..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.baqend:bloom-filter:1.02"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml deleted file mode 100644 index 4533c1ba79a2f6aad70eab0b302d97e0bfb41dda..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.google.code.gson:gson:2.2.4"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml similarity index 68% rename from .idea/libraries/Maven__com_google_guava_guava_17_0.xml rename to .idea/libraries/Maven__com_google_guava_guava_18_0.xml index 2a9069ca399f4895428f9c87f5dddd978dc31766..bbd71d77e995b85a163660856a9d45a449599fcc 100644 --- a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml +++ b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml @@ -1,13 +1,13 @@ <component name="libraryTable"> - <library name="Maven: com.google.guava:guava:17.0"> + <library name="Maven: com.google.guava:guava:18.0"> <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0.jar!/" /> </CLASSES> <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-javadoc.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-javadoc.jar!/" /> </JAVADOC> <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-sources.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-sources.jar!/" /> </SOURCES> </library> </component> \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml deleted file mode 100644 index aa6a889052a2db3f805e9ec976fdce0ad9faad1d..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: org.apache.commons:commons-pool2:2.2"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml b/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml deleted file mode 100644 index 1d28363366b82a563ff8823880190ac6f4170599..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: redis.clients:jedis:2.5.1"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index ccf88d1b84fcdeddbef86f014b7292d010c6e432..8a80acb0ffc07bb839ad475b6ca7b3c68aa4e03b 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -10,7 +10,7 @@ </list> </option> </component> - <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.8" project-jdk-type="JavaSDK"> + <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.7" project-jdk-type="JavaSDK"> <output url="file://$PROJECT_DIR$/out" /> </component> </project> diff --git a/biopet-framework/BiopetFramework.iml b/biopet-framework/BiopetFramework.iml index 7d46545ae978871a9ec99087cdcadcd599b616de..e8e94271583519aa54415881ac71c3b6f26fb845 100644 --- a/biopet-framework/BiopetFramework.iml +++ b/biopet-framework/BiopetFramework.iml @@ -21,6 +21,7 @@ <sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/src/test/scala" isTestSource="true" /> <sourceFolder url="file://$MODULE_DIR$/src/main/scripts" type="java-resource" /> + <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" /> <excludeFolder url="file://$MODULE_DIR$/target" /> </content> @@ -45,11 +46,7 @@ <orderEntry type="library" name="Maven: com.github.julien-truffaut:monocle-core_2.11:0.5.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-core:3.1.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-sequencing:3.1.0" level="project" /> - <orderEntry type="library" name="Maven: com.google.guava:guava:17.0" level="project" /> - <orderEntry type="library" name="Maven: com.baqend:bloom-filter:1.02" level="project" /> - <orderEntry type="library" name="Maven: redis.clients:jedis:2.5.1" level="project" /> - <orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.2" level="project" /> - <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.2.4" level="project" /> + <orderEntry type="library" name="Maven: com.google.guava:guava:18.0" level="project" /> <orderEntry type="library" name="Maven: com.github.scopt:scopt_2.10:3.2.0" level="project" /> </component> </module> diff --git a/biopet-framework/pom.xml b/biopet-framework/pom.xml index d735e4bbd703a6dad97a6768a515bd7a6044e270..de92b97ba5c7724228082872bf00eb3741e331e8 100644 --- a/biopet-framework/pom.xml +++ b/biopet-framework/pom.xml @@ -24,10 +24,6 @@ <name>BioJava repository</name> <url>http://www.biojava.org/download/maven/</url> </repository> - <repository> - <id>orestes-bloom-filter</id> - <url>https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo</url> - </repository> </repositories> <dependencies> <dependency> @@ -67,9 +63,9 @@ <version>3.1.0</version> </dependency> <dependency> - <groupId>com.baqend</groupId> - <artifactId>bloom-filter</artifactId> - <version>1.02</version> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>18.0</version> </dependency> <dependency> <groupId>com.github.scopt</groupId> diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index 39b1863166c05428da70d82b06588cb75767befb..bccec55a9f84c6f704d0cfbb821a40dc7be9532f 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -8,6 +8,7 @@ import java.io.File import scala.collection.JavaConverters._ +import com.google.common.hash.{Funnel, BloomFilter, PrimitiveSink} import htsjdk.samtools.AlignmentBlock import htsjdk.samtools.SAMFileReader import htsjdk.samtools.SAMFileReader.QueryInterval @@ -18,8 +19,6 @@ import htsjdk.tribble.Feature import htsjdk.tribble.BasicFeature import htsjdk.tribble.bed.BEDCodec import htsjdk.tribble.index.interval.{ Interval, IntervalTree } -import orestes.bloomfilter.HashProvider.HashMethod -import orestes.bloomfilter.{ BloomFilter, FilterBuilder } import org.apache.commons.io.FilenameUtils.getExtension import org.broadinstitute.gatk.utils.commandline.{ Input, Output } @@ -218,8 +217,32 @@ object WipeReads extends ToolCommand { return true } false - } else - true + } else true + + /** function to create a fake SAMRecord pair ~ hack to limit querying BAM file for real pair */ + def makeMockPair(rec: SAMRecord): SAMRecord = { + require(rec.getReadPairedFlag) + val fakePair = rec.clone.asInstanceOf[SAMRecord] + fakePair.setAlignmentStart(rec.getMateAlignmentStart) + fakePair + } + + /** function to create set element from SAMRecord */ + def elemFromSam(rec: SAMRecord): String = { + if (filterOutMulti) + rec.getReadName + else + rec.getReadName + "_" + rec.getAlignmentStart.toString + } + + /** object for use by BloomFilter */ + object SAMFunnel extends Funnel[SAMRecord] { + override def funnel(rec: SAMRecord, into: PrimitiveSink): Unit = { + val elem = elemFromSam(rec) + logger.debug("Adding " + elem + " to set ...") + into.putUnencodedChars(elem) + } + } /** filter function for read IDs */ val rgFilter = @@ -228,15 +251,6 @@ object WipeReads extends ToolCommand { else (r: SAMRecord) => readGroupIds.contains(r.getReadGroup.getReadGroupId) - /** function to get set element */ - val SamRecordElement = - if (filterOutMulti) - (r: SAMRecord) => r.getReadName - else - (r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString - - val SamRecordMateElement = - (r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString val readyBam = prepIndexedInputBam() @@ -257,7 +271,7 @@ object WipeReads extends ToolCommand { .groupBy(x => x.getChr) .map({ case (key, value) => (key, makeIntervalTree(value)) }) - lazy val filteredOutSet: BloomFilter[String] = readyBam + lazy val filteredOutSet: BloomFilter[SAMRecord] = readyBam // query BAM file with intervals .queryOverlapping(queryIntervals) // for compatibility @@ -269,28 +283,21 @@ object WipeReads extends ToolCommand { // filter on specific read group IDs .filter(x => rgFilter(x)) // fold starting from empty set - .foldLeft(new FilterBuilder(bloomSize.toInt, bloomFp) - .hashFunction(HashMethod.Murmur3KirschMitzenmacher) - .buildBloomFilter(): BloomFilter[String] + .foldLeft(BloomFilter.create(SAMFunnel, bloomSize.toInt, bloomFp) )((acc, rec) => { - logger.debug("Adding read " + rec.getReadName + " to set ...") - if ((!filterOutMulti) && rec.getReadPairedFlag) { - acc.add(SamRecordElement(rec)) - acc.add(SamRecordMateElement(rec)) - } else - acc.add(SamRecordElement(rec)) + acc.put(rec) + if (rec.getReadPairedFlag) acc.put(makeMockPair(rec)) acc }) if (filterOutMulti) - (rec: SAMRecord) => filteredOutSet.contains(rec.getReadName) + (rec: SAMRecord) => filteredOutSet.mightContain(rec) else (rec: SAMRecord) => { if (rec.getReadPairedFlag) - filteredOutSet.contains(SamRecordElement(rec)) && - filteredOutSet.contains(SamRecordMateElement(rec)) + filteredOutSet.mightContain(rec) && filteredOutSet.mightContain(makeMockPair(rec)) else - filteredOutSet.contains(SamRecordElement(rec)) + filteredOutSet.mightContain(rec) } } @@ -345,13 +352,6 @@ object WipeReads extends ToolCommand { } } - /** Function to check whether the bloom filter can fulfill size and false positive guarantees - As we are currently limited to maximum integer size if the optimal array size equals or - exceeds it, we assume that it's a result of a truncation and return false. - */ - def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = - FilterBuilder.optimalM(bloomSize, bloomFp) <= Int.MaxValue - case class Args(inputBam: File = null, targetRegions: File = null, outputBam: File = null, @@ -425,12 +425,6 @@ object WipeReads extends ToolCommand { |the given ones, they will also be removed. """.stripMargin) - checkConfig { c => - if (!bloomParamsOk(c.bloomSize, c.bloomFp)) - failure("Bloom parameters combination exceed Int limitation") - else - success - } } def main(args: Array[String]): Unit = {