diff --git a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml b/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml deleted file mode 100644 index a08d14f4eb8ef4a4d8eb3ab6ef242b63e7000b28..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.baqend:bloom-filter:1.02"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml deleted file mode 100644 index 4533c1ba79a2f6aad70eab0b302d97e0bfb41dda..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.google.code.gson:gson:2.2.4"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml similarity index 68% rename from .idea/libraries/Maven__com_google_guava_guava_17_0.xml rename to .idea/libraries/Maven__com_google_guava_guava_18_0.xml index 2a9069ca399f4895428f9c87f5dddd978dc31766..bbd71d77e995b85a163660856a9d45a449599fcc 100644 --- a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml +++ b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml @@ -1,13 +1,13 @@ <component name="libraryTable"> - <library name="Maven: com.google.guava:guava:17.0"> + <library name="Maven: com.google.guava:guava:18.0"> <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0.jar!/" /> </CLASSES> <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-javadoc.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-javadoc.jar!/" /> </JAVADOC> <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-sources.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-sources.jar!/" /> </SOURCES> </library> </component> \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml deleted file mode 100644 index aa6a889052a2db3f805e9ec976fdce0ad9faad1d..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: org.apache.commons:commons-pool2:2.2"> - <CLASSES> - <root 
url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__org_broadinstitute_gatk_gatk_queue_package_distribution_3_3.xml b/.idea/libraries/Maven__org_broadinstitute_gatk_gatk_queue_package_distribution_3_3.xml new file mode 100644 index 0000000000000000000000000000000000000000..02a939e03978c562e4eb360277a1e8a74b2899ed --- /dev/null +++ b/.idea/libraries/Maven__org_broadinstitute_gatk_gatk_queue_package_distribution_3_3.xml @@ -0,0 +1,13 @@ +<component name="libraryTable"> + <library name="Maven: org.broadinstitute.gatk:gatk-queue-package-distribution:3.3"> + <CLASSES> + <root url="jar://$MAVEN_REPOSITORY$/org/broadinstitute/gatk/gatk-queue-package-distribution/3.3/gatk-queue-package-distribution-3.3.jar!/" /> + </CLASSES> + <JAVADOC> + <root url="jar://$MAVEN_REPOSITORY$/org/broadinstitute/gatk/gatk-queue-package-distribution/3.3/gatk-queue-package-distribution-3.3-javadoc.jar!/" /> + </JAVADOC> + <SOURCES> + <root url="jar://$MAVEN_REPOSITORY$/org/broadinstitute/gatk/gatk-queue-package-distribution/3.3/gatk-queue-package-distribution-3.3-sources.jar!/" /> + </SOURCES> + </library> +</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml b/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml deleted file mode 100644 index 1d28363366b82a563ff8823880190ac6f4170599..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: redis.clients:jedis:2.5.1"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index ccf88d1b84fcdeddbef86f014b7292d010c6e432..8a80acb0ffc07bb839ad475b6ca7b3c68aa4e03b 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -10,7 +10,7 @@ </list> </option> </component> - <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.8" project-jdk-type="JavaSDK"> + <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.7" project-jdk-type="JavaSDK"> <output url="file://$PROJECT_DIR$/out" /> </component> </project> diff --git a/biopet-framework/BiopetFramework.iml b/biopet-framework/BiopetFramework.iml index 7d46545ae978871a9ec99087cdcadcd599b616de..cd52bbb064aa631f89cd2e626866d9f7ee166ef3 100644 --- a/biopet-framework/BiopetFramework.iml +++ b/biopet-framework/BiopetFramework.iml @@ -21,6 +21,7 @@ <sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/src/test/scala" isTestSource="true" /> <sourceFolder url="file://$MODULE_DIR$/src/main/scripts" type="java-resource" /> + <sourceFolder 
url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" /> <excludeFolder url="file://$MODULE_DIR$/target" /> </content> @@ -38,18 +39,14 @@ <orderEntry type="library" name="Maven: org.scala-lang:scala-library:2.11.2" level="project" /> <orderEntry type="library" name="Maven: org.scala-lang:scala-reflect:2.11.2" level="project" /> <orderEntry type="library" name="Maven: org.scala-lang.modules:scala-xml_2.11:1.0.2" level="project" /> - <orderEntry type="library" name="Maven: org.broadinstitute.gatk:gatk-queue-package-distribution:3.2" level="project" /> + <orderEntry type="library" name="Maven: org.broadinstitute.gatk:gatk-queue-package-distribution:3.3" level="project" /> <orderEntry type="library" name="Maven: io.argonaut:argonaut_2.11:6.1-M4" level="project" /> <orderEntry type="library" name="Maven: org.scalaz:scalaz-core_2.11:7.1.0" level="project" /> <orderEntry type="library" name="Maven: org.scala-lang.modules:scala-parser-combinators_2.11:1.0.2" level="project" /> <orderEntry type="library" name="Maven: com.github.julien-truffaut:monocle-core_2.11:0.5.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-core:3.1.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-sequencing:3.1.0" level="project" /> - <orderEntry type="library" name="Maven: com.google.guava:guava:17.0" level="project" /> - <orderEntry type="library" name="Maven: com.baqend:bloom-filter:1.02" level="project" /> - <orderEntry type="library" name="Maven: redis.clients:jedis:2.5.1" level="project" /> - <orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.2" level="project" /> - <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.2.4" level="project" /> + <orderEntry type="library" name="Maven: com.google.guava:guava:18.0" level="project" /> <orderEntry type="library" name="Maven: com.github.scopt:scopt_2.10:3.2.0" level="project" /> </component> </module> diff --git a/biopet-framework/pom.xml b/biopet-framework/pom.xml index 8385a13dd70e2982641e41a7c61bc26fc370943d..2b25af4b14787738d88d0f28eeca7925bceb978d 100644 --- a/biopet-framework/pom.xml +++ b/biopet-framework/pom.xml @@ -24,10 +24,6 @@ <name>BioJava repository</name> <url>http://www.biojava.org/download/maven/</url> </repository> - <repository> - <id>orestes-bloom-filter</id> - <url>https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo</url> - </repository> </repositories> <dependencies> <dependency> @@ -67,9 +63,9 @@ <version>3.1.0</version> </dependency> <dependency> - <groupId>com.baqend</groupId> - <artifactId>bloom-filter</artifactId> - <version>1.02</version> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>18.0</version> </dependency> <dependency> <groupId>com.github.scopt</groupId> diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index a990bab67fb7dc2ed67ad91304e74814fb9ba5cc..d131692da08156dcc7b4fd085e71cb791bfca2c0 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -4,10 +4,11 @@ */ package nl.lumc.sasc.biopet.tools -import java.io.{ File, IOException } +import java.io.File import scala.collection.JavaConverters._ +import com.google.common.hash.{Funnel, BloomFilter, 
PrimitiveSink} import htsjdk.samtools.AlignmentBlock import htsjdk.samtools.SAMFileReader import htsjdk.samtools.QueryInterval @@ -19,8 +20,6 @@ import htsjdk.tribble.Feature import htsjdk.tribble.BasicFeature import htsjdk.tribble.bed.BEDCodec import htsjdk.tribble.index.interval.{ Interval, IntervalTree } -import orestes.bloomfilter.HashProvider.HashMethod -import orestes.bloomfilter.{ BloomFilter, FilterBuilder } import org.apache.commons.io.FilenameUtils.getExtension import org.broadinstitute.gatk.utils.commandline.{ Input, Output } @@ -39,10 +38,10 @@ class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true) - var inputBAM: File = _ + var inputBam: File = _ @Output(doc = "Output BAM", shortName = "o", required = true) - var outputBAM: File = _ + var outputBam: File = _ } @@ -62,8 +61,10 @@ object WipeReads extends ToolCommand { else feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getStart - /** Function to merge two overlapping intervals - * NOTE: we assume subtypes of Feature has a constructor with (chr, start, end) signature */ + /** + * Function to merge two overlapping intervals + * NOTE: we assume subtypes of Feature has a constructor with (chr, start, end) signature + */ private def merge[T <: Feature](feat1: T, feat2: T): T = { if (feat1.getChr != feat2.getChr) throw new IllegalArgumentException("Can not merge features from different chromosomes") @@ -73,7 +74,7 @@ object WipeReads extends ToolCommand { else if (overlaps(feat1, feat2)) new BasicFeature(feat1.getChr, feat1.getStart, feat2.getEnd).asInstanceOf[T] else - throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects") + throw new IllegalArgumentException("Can not merge non-overlapping Feature objects") } /** Function to create an iterator yielding non-overlapped Feature objects */ @@ -81,11 +82,13 @@ object WipeReads extends ToolCommand { ri.toList .sortBy(x => (x.getChr, x.getStart, x.getEnd)) .foldLeft(List.empty[T]) { - (acc, i) => acc match { - case head :: tail if overlaps(head, i) => merge(head, i) :: tail - case _ => i :: acc + (acc, i) => + acc match { + case head :: tail if overlaps(head, i) => merge(head, i) :: tail + case _ => i :: acc - }} + } + } .toIterator /** @@ -98,70 +101,60 @@ object WipeReads extends ToolCommand { logger.info("Parsing interval file ...") /** Function to create iterator from BED file */ - def makeFeatureFromBED(inFile: File): Iterator[Feature] = + def makeFeatureFromBed(inFile: File): Iterator[Feature] = asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator).asScala - // TODO: implementation /** Function to create iterator from refFlat file */ def makeFeatureFromRefFlat(inFile: File): Iterator[Feature] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + // convert coordinate to 1-based fully closed + // parse chrom, start blocks, end blocks, strands - // TODO: implementation /** Function to create iterator from GTF file */ - def makeFeatureFromGTF(inFile: File): Iterator[Feature] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + def makeFeatureFromGtf(inFile: File): Iterator[Feature] = ??? 
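  // Editorial sketch, not part of this patch: the sort-then-foldLeft merging
  // strategy that mergeOverlappingIntervals uses above, shown on plain
  // (start, end) tuples so the accumulator logic is easy to follow. Because
  // the intervals are sorted first, a new interval can only overlap the most
  // recently merged one, i.e. the head of the accumulator.
  def mergeTupleIntervals(ivs: List[(Int, Int)]): List[(Int, Int)] =
    ivs.sorted
      .foldLeft(List.empty[(Int, Int)]) {
        case ((headStart, headEnd) :: tail, (start, end)) if start <= headEnd =>
          (headStart, math.max(headEnd, end)) :: tail // overlap: extend the head
        case (acc, iv) =>
          iv :: acc // no overlap: the new interval becomes the head
      }
      .reverse
  // mergeTupleIntervals(List((1, 5), (3, 8), (10, 12))) == List((1, 8), (10, 12))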
+ // convert coordinate to 1-based fully closed + // parse chrom, start blocks, end blocks, strands // detect interval file format from extension val iterFunc: (File => Iterator[Feature]) = if (getExtension(inFile.toString.toLowerCase) == "bed") - makeFeatureFromBED + makeFeatureFromBed else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) mergeOverlappingIntervals(iterFunc(inFile)) } - def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = { - val optimalArraySize = FilterBuilder.optimalM(bloomSize, bloomFp) - // we are currently limited to maximum integer size - // if the optimal array size equals or exceeds it, we assume - // that it's a result of a truncation and return false - optimalArraySize <= Int.MaxValue - } - // TODO: set minimum fraction for overlap /** * Function to create function to check SAMRecord for exclusion in filtered BAM file. * * The returned function evaluates all filtered-in SAMRecord to false. * - * @param iv iterator yielding RawInterval objects - * @param inBAM input BAM file - * @param inBAMIndex input BAM file index + * @param iv iterator yielding Feature objects + * @param inBam input BAM file + * @param inBamIndex input BAM file index * @param filterOutMulti whether to filter out reads with same name outside target region (default: true) * @param minMapQ minimum MapQ of reads in target region to filter out (default: 0) - * @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs) + * @param readGroupIds read group IDs of reads in target region to filter out (default: all IDs) * @param bloomSize expected size of elements to contain in the Bloom filter * @param bloomFp expected Bloom filter false positive rate * @return function that checks whether a SAMRecord or String is to be excluded */ - def makeFilterOutFunction(iv: Iterator[Feature], - inBAM: File, inBAMIndex: File = null, + def makeFilterNotFunction(iv: Iterator[Feature], + inBam: File, inBamIndex: File = null, filterOutMulti: Boolean = true, - minMapQ: Int = 0, readGroupIDs: Set[String] = Set(), + minMapQ: Int = 0, readGroupIds: Set[String] = Set(), bloomSize: Long, bloomFp: Double): (SAMRecord => Boolean) = { logger.info("Building set of reads to exclude ...") // TODO: implement optional index creation /** Function to check for BAM file index and return a SAMFileReader given a File */ - def prepIndexedInputBAM(): SAMFileReader = - if (inBAMIndex != null) - new SAMFileReader(inBAM, inBAMIndex) + def prepIndexedInputBam(): SAMFileReader = + if (inBamIndex != null) + new SAMFileReader(inBam, inBamIndex) else { - val sfr = new SAMFileReader(inBAM) + val sfr = new SAMFileReader(inBam) sfr.setValidationStringency(ValidationStringency.LENIENT) if (!sfr.hasIndex) throw new IllegalStateException("Input BAM file must be indexed") @@ -175,27 +168,28 @@ object WipeReads extends ToolCommand { * The function still works when only either one of the interval or BAM * file contig is prepended with "chr" * - * @param inBAM BAM file to query as SAMFileReader + * @param inBam BAM file to query as SAMFileReader * @param feat feature object containing query * @return QueryInterval wrapped in Option */ - def monadicMakeQueryInterval(inBAM: SAMFileReader, feat: Feature): Option[QueryInterval] = - if (inBAM.getFileHeader.getSequenceIndex(feat.getChr) > -1) - Some(inBAM.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd)) + def monadicMakeQueryInterval(inBam: SAMFileReader, feat: Feature): Option[QueryInterval] = + if 
(inBam.getFileHeader.getSequenceIndex(feat.getChr) > -1) + Some(inBam.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd)) else if (feat.getChr.startsWith("chr") - && inBAM.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1) - Some(inBAM.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd)) + && inBam.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1) + Some(inBam.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd)) else if (!feat.getChr.startsWith("chr") - && inBAM.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1) - Some(inBAM.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd)) + && inBam.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1) + Some(inBam.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd)) else None - /** function to make IntervalTree from our RawInterval objects - * - * @param ifeat iterable over feature objects - * @return IntervalTree objects, filled with intervals from RawInterval - */ + /** + * function to make IntervalTree from our Feature objects + * + * @param ifeat iterable over feature objects + * @return IntervalTree objects, filled with intervals from Feature + */ def makeIntervalTree[T <: Feature](ifeat: Iterable[T]): IntervalTree = { val ivt = new IntervalTree for (iv <- ifeat) @@ -224,104 +218,116 @@ object WipeReads extends ToolCommand { return true } false - } else - true + } else true + + /** function to create a fake SAMRecord pair ~ hack to limit querying BAM file for real pair */ + def makeMockPair(rec: SAMRecord): SAMRecord = { + require(rec.getReadPairedFlag) + val fakePair = rec.clone.asInstanceOf[SAMRecord] + fakePair.setAlignmentStart(rec.getMateAlignmentStart) + fakePair + } + + /** function to create set element from SAMRecord */ + def elemFromSam(rec: SAMRecord): String = { + if (filterOutMulti) + rec.getReadName + else + rec.getReadName + "_" + rec.getAlignmentStart.toString + } + + /** object for use by BloomFilter */ + object SAMFunnel extends Funnel[SAMRecord] { + override def funnel(rec: SAMRecord, into: PrimitiveSink): Unit = { + val elem = elemFromSam(rec) + logger.debug("Adding " + elem + " to set ...") + into.putUnencodedChars(elem) + } + } /** filter function for read IDs */ val rgFilter = - if (readGroupIDs.size == 0) + if (readGroupIds.size == 0) (r: SAMRecord) => true else - (r: SAMRecord) => readGroupIDs.contains(r.getReadGroup.getReadGroupId) + (r: SAMRecord) => readGroupIds.contains(r.getReadGroup.getReadGroupId) - /** function to get set element */ - val SAMRecordElement = - if (filterOutMulti) - (r: SAMRecord) => r.getReadName - else - (r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString - - val SAMRecordMateElement = - (r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString - val firstBAM = prepIndexedInputBAM() + val readyBam = prepIndexedInputBam() /* NOTE: the interval vector here should be bypass-able if we can make the BAM query intervals with Interval objects. 
This is not possible - at the moment since we can not retrieve star and end coordinates - of an Interval, so we resort to our own RawInterval vector + at the moment since we can not retrieve start and end coordinates + of an Interval, so we resort to our own Feature vector */ lazy val intervals = iv.toVector - lazy val intervalTreeMap: Map[String, IntervalTree] = intervals - .groupBy(x => x.getChr) - .map({ case (key, value) => (key, makeIntervalTree(value)) }) + lazy val queryIntervals = intervals - .flatMap(x => monadicMakeQueryInterval(firstBAM, x)) + .flatMap(x => monadicMakeQueryInterval(readyBam, x)) // makeQueryInterval only accepts a sorted QueryInterval list .sortBy(x => (x.referenceIndex, x.start, x.end)) .toArray - val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala + lazy val intervalTreeMap: Map[String, IntervalTree] = intervals + .groupBy(x => x.getChr) + .map({ case (key, value) => (key, makeIntervalTree(value)) }) + + lazy val filteredOutSet: BloomFilter[SAMRecord] = readyBam + // query BAM file with intervals + .queryOverlapping(queryIntervals) + // for compatibility + .asScala // ensure spliced reads have at least one block overlapping target region .filter(x => alignmentBlockOverlaps(x, intervalTreeMap)) // filter for MAPQ on target region reads .filter(x => x.getMappingQuality >= minMapQ) // filter on specific read group IDs .filter(x => rgFilter(x)) - - val filteredOutSet: BloomFilter[String] = new FilterBuilder(bloomSize.toInt, bloomFp) - .hashFunction(HashMethod.Murmur3KirschMitzenmacher) - .buildBloomFilter() - - for (rec <- filteredRecords) { - logger.debug("Adding read " + rec.getReadName + " to set ...") - if ((!filterOutMulti) && rec.getReadPairedFlag) { - filteredOutSet.add(SAMRecordElement(rec)) - filteredOutSet.add(SAMRecordMateElement(rec)) - } - else - filteredOutSet.add(SAMRecordElement(rec)) - } + // fold starting from empty set + .foldLeft(BloomFilter.create(SAMFunnel, bloomSize.toInt, bloomFp) + )((acc, rec) => { + acc.put(rec) + if (rec.getReadPairedFlag) acc.put(makeMockPair(rec)) + acc + }) if (filterOutMulti) - (rec: SAMRecord) => filteredOutSet.contains(rec.getReadName) + (rec: SAMRecord) => filteredOutSet.mightContain(rec) else (rec: SAMRecord) => { if (rec.getReadPairedFlag) - filteredOutSet.contains(SAMRecordElement(rec)) && - filteredOutSet.contains(SAMRecordMateElement(rec)) + filteredOutSet.mightContain(rec) && filteredOutSet.mightContain(makeMockPair(rec)) else - filteredOutSet.contains(SAMRecordElement(rec)) + filteredOutSet.mightContain(rec) } } - // TODO: implement stats file output /** * Function to filter input BAM and write its output to the filesystem * * @param filterFunc filter function that evaluates true for excluded SAMRecord - * @param inBAM input BAM file - * @param outBAM output BAM file + * @param inBam input BAM file + * @param outBam output BAM file * @param writeIndex whether to write index for output BAM file * @param async whether to write asynchronously - * @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file + * @param filteredOutBam whether to write excluded SAMRecords to their own BAM file */ - def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File, + def writeFilteredBam(filterFunc: (SAMRecord => Boolean), inBam: File, outBam: File, writeIndex: Boolean = true, async: Boolean = true, - filteredOutBAM: File = null) = { + filteredOutBam: File = null) = { val factory = new SAMFileWriterFactory() .setCreateIndex(writeIndex) 
.setUseAsyncIo(async) - val templateBAM = new SAMFileReader(inBAM) - templateBAM.setValidationStringency(ValidationStringency.LENIENT) + val templateBam = new SAMFileReader(inBam) + templateBam.setValidationStringency(ValidationStringency.LENIENT) - val targetBAM = factory.makeBAMWriter(templateBAM.getFileHeader, true, outBAM) + val targetBam = factory.makeBAMWriter(templateBam.getFileHeader, true, outBam) - val filteredBAM = - if (filteredOutBAM != null) - factory.makeBAMWriter(templateBAM.getFileHeader, true, filteredOutBAM) + val filteredBam = + if (filteredOutBam != null) + factory.makeBAMWriter(templateBam.getFileHeader, true, filteredOutBam) else null @@ -329,35 +335,34 @@ object WipeReads extends ToolCommand { try { var (incl, excl) = (0, 0) - for (rec <- templateBAM.asScala) { + for (rec <- templateBam.asScala) { if (!filterFunc(rec)) { - targetBAM.addAlignment(rec) + targetBam.addAlignment(rec) incl += 1 - } - else { + } else { excl += 1 - if (filteredBAM != null) filteredBAM.addAlignment(rec) + if (filteredBam != null) filteredBam.addAlignment(rec) } } println(List("input_bam", "output_bam", "count_included", "count_excluded").mkString("\t")) - println(List(inBAM.getName, outBAM.getName, incl, excl).mkString("\t")) + println(List(inBam.getName, outBam.getName, incl, excl).mkString("\t")) } finally { - templateBAM.close() - targetBAM.close() - if (filteredBAM != null) filteredBAM.close() + templateBam.close() + targetBam.close() + if (filteredBam != null) filteredBam.close() } } - case class Args (inputBAM: File = null, - targetRegions: File = null, - outputBAM: File = null, - filteredOutBAM: File = null, - readGroupIDs: Set[String] = Set.empty[String], - minMapQ: Int = 0, - limitToRegion: Boolean = false, - noMakeIndex: Boolean = false, - bloomSize: Long = 70000000, - bloomFp: Double = 4e-7) extends AbstractArgs + case class Args(inputBam: File = null, + targetRegions: File = null, + outputBam: File = null, + filteredOutBam: File = null, + readGroupIds: Set[String] = Set.empty[String], + minMapQ: Int = 0, + limitToRegion: Boolean = false, + noMakeIndex: Boolean = false, + bloomSize: Long = 70000000, + bloomFp: Double = 4e-7) extends AbstractArgs class OptParser extends AbstractOptParser { @@ -366,43 +371,53 @@ object WipeReads extends ToolCommand { |$commandName - Region-based reads removal from an indexed BAM file """.stripMargin) - opt[File]('I', "input_file") required() valueName "<bam>" action { (x, c) => - c.copy(inputBAM = x) } validate { - x => if (x.exists) success else failure("Input BAM file not found") - } text "Input BAM file" - - opt[File]('r', "interval_file") required() valueName "<bed>" action { (x, c) => - c.copy(targetRegions = x) } validate { - x => if (x.exists) success else failure("Target regions file not found") - } text "Interval BED file" - - opt[File]('o', "output_file") required() valueName "<bam>" action { (x, c) => - c.copy(outputBAM = x) } text "Output BAM file" - - opt[File]('f', "discarded_file") optional() valueName "<bam>" action { (x, c) => - c.copy(filteredOutBAM = x) } text "Discarded reads BAM file (default: none)" - - opt[Int]('Q', "min_mapq") optional() action { (x, c) => - c.copy(minMapQ = x) } text "Minimum MAPQ of reads in target region to remove (default: 0)" - - opt[String]('G', "read_group") unbounded() optional() valueName "<rgid>" action { (x, c) => - c.copy(readGroupIDs = c.readGroupIDs + x) } text "Read group IDs to be removed (default: remove reads from all read groups)" - - opt[Unit]("limit_removal") optional() action { (_, c) 
=> - c.copy(limitToRegion = true) } text + opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) => + c.copy(inputBam = x) + } validate { + x => if (x.exists) success else failure("Input BAM file not found") + } text "Input BAM file" + + opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) => + c.copy(targetRegions = x) + } validate { + x => if (x.exists) success else failure("Target regions file not found") + } text "Interval BED file" + + opt[File]('o', "output_file") required () valueName "<bam>" action { (x, c) => + c.copy(outputBam = x) + } text "Output BAM file" + + opt[File]('f', "discarded_file") optional () valueName "<bam>" action { (x, c) => + c.copy(filteredOutBam = x) + } text "Discarded reads BAM file (default: none)" + + opt[Int]('Q', "min_mapq") optional () action { (x, c) => + c.copy(minMapQ = x) + } text "Minimum MAPQ of reads in target region to remove (default: 0)" + + opt[String]('G', "read_group") unbounded () optional () valueName "<rgid>" action { (x, c) => + c.copy(readGroupIds = c.readGroupIds + x) + } text "Read group IDs to be removed (default: remove reads from all read groups)" + + opt[Unit]("limit_removal") optional () action { (_, c) => + c.copy(limitToRegion = true) + } text "Whether to remove multiple-mapped reads outside the target regions (default: yes)" - opt[Unit]("no_make_index") optional() action { (_, c) => - c.copy(noMakeIndex = true) } text + opt[Unit]("no_make_index") optional () action { (_, c) => + c.copy(noMakeIndex = true) + } text "Whether to index output BAM file or not (default: yes)" note("\nAdvanced options") - opt[Long]("bloom_size") optional() action { (x, c) => - c.copy(bloomSize = x) } text "expected maximum number of reads in target regions (default: 7e7)" + opt[Long]("bloom_size") optional () action { (x, c) => + c.copy(bloomSize = x) + } text "expected maximum number of reads in target regions (default: 7e7)" - opt[Double]("false_positive") optional() action { (x, c) => - c.copy(bloomFp = x) } text "false positive rate (default: 4e-7)" + opt[Double]("false_positive") optional () action { (x, c) => + c.copy(bloomFp = x) + } text "false positive rate (default: 4e-7)" note( """ @@ -411,12 +426,6 @@ object WipeReads extends ToolCommand { |the given ones, they will also be removed. 
""".stripMargin) - checkConfig { c=> - if (!bloomParamsOk(c.bloomSize, c.bloomFp)) - failure("Bloom parameters combination exceed Int limitation") - else - success - } } def main(args: Array[String]): Unit = { @@ -425,22 +434,22 @@ object WipeReads extends ToolCommand { .parse(args, Args()) .getOrElse(sys.exit(1)) - val filterFunc = makeFilterOutFunction( + val filterFunc = makeFilterNotFunction( iv = makeFeatureFromFile(commandArgs.targetRegions), - inBAM = commandArgs.inputBAM, + inBam = commandArgs.inputBam, filterOutMulti = !commandArgs.limitToRegion, minMapQ = commandArgs.minMapQ, - readGroupIDs = commandArgs.readGroupIDs, + readGroupIds = commandArgs.readGroupIds, bloomSize = commandArgs.bloomSize, bloomFp = commandArgs.bloomFp ) - writeFilteredBAM( + writeFilteredBam( filterFunc, - commandArgs.inputBAM, - commandArgs.outputBAM, + commandArgs.inputBam, + commandArgs.outputBam, writeIndex = !commandArgs.noMakeIndex, - filteredOutBAM = commandArgs.filteredOutBAM + filteredOutBam = commandArgs.filteredOutBam ) } } diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala index ee438ae74e82bf157a013360daf9b20d5a1647b6..0590dba978b19e531284112d2e9adecac88f90e2 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala @@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools import java.io.File import java.nio.file.Paths +import scala.collection.JavaConverters._ import htsjdk.samtools._ import htsjdk.tribble._ @@ -13,17 +14,14 @@ import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite import org.testng.annotations.Test -import scala.collection.JavaConverters._ - - class WipeReadsUnitTest extends TestNGSuite with Matchers { - import nl.lumc.sasc.biopet.tools.WipeReads._ + import WipeReads._ private def resourcePath(p: String): String = Paths.get(getClass.getResource(p).toURI).toString - private lazy val samP: SAMLineParser = { + private val samP: SAMLineParser = { val samh = new SAMFileHeader samh.addSequence(new SAMSequenceRecord("chrQ", 10000)) samh.addReadGroup(new SAMReadGroupRecord("001")) @@ -31,20 +29,20 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { new SAMLineParser(samh) } - private def makeSAMs(raws: String*): Seq[SAMRecord] = + private def makeSams(raws: String*): Seq[SAMRecord] = raws.map(s => samP.parseLine(s)) - private def makeTempBAM(): File = + private def makeTempBam(): File = File.createTempFile("WipeReads", java.util.UUID.randomUUID.toString + ".bam") - private def makeTempBAMIndex(bam: File): File = + private def makeTempBamIndex(bam: File): File = new File(bam.getAbsolutePath.stripSuffix(".bam") + ".bai") val bloomSize: Long = 1000 val bloomFp: Double = 1e-10 - val sBAMFile1 = new File(resourcePath("/single01.bam")) - val sBAMRecs1 = makeSAMs( + val sBamFile1 = new File(resourcePath("/single01.bam")) + val sBamRecs1 = makeSams( "r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t190\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t290\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001", @@ -54,8 +52,8 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r06\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001" ) - val sBAMFile2 = new File(resourcePath("/single02.bam")) - val sBAMRecs2 = makeSAMs( + val sBamFile2 = new 
File(resourcePath("/single02.bam")) + val sBamRecs2 = makeSams( "r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t190\t30\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:002", "r01\t16\tchrQ\t290\t30\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:002", @@ -66,11 +64,11 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r08\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:002" ) - val sBAMFile3 = new File(resourcePath("/single03.bam")) - val sBAMFile4 = new File(resourcePath("/single04.bam")) + val sBamFile3 = new File(resourcePath("/single03.bam")) + val sBamFile4 = new File(resourcePath("/single04.bam")) - val pBAMFile1 = new File(resourcePath("/paired01.bam")) - val pBAMRecs1 = makeSAMs( + val pBamFile1 = new File(resourcePath("/paired01.bam")) + val pBamRecs1 = makeSams( "r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r02\t147\tchrQ\t90\t60\t10M\t=\t50\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001", "r01\t163\tchrQ\t150\t60\t10M\t=\t190\t50\tAAAAAGGGGG\tGGGGGGGGGG\tRG:Z:001", @@ -87,8 +85,8 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r06\t4\t*\t0\t0\t*\t*\t0\t0\tGCGCGCGCGC\tHIHIHIHIHI\tRG:Z:001" ) - val pBAMFile2 = new File(resourcePath("/paired02.bam")) - val pBAMRecs2 = makeSAMs( + val pBamFile2 = new File(resourcePath("/paired02.bam")) + val pBamRecs2 = makeSams( "r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r02\t147\tchrQ\t90\t60\t10M\t=\t50\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001", "r01\t163\tchrQ\t150\t30\t10M\t=\t190\t50\tAAAAAGGGGG\tGGGGGGGGGG\tRG:Z:002", @@ -101,22 +99,22 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r08\t4\t*\t0\t0\t*\t*\t0\t0\tGCGCGCGCGC\tHIHIHIHIHI\tRG:Z:002" ) - val pBAMFile3 = new File(resourcePath("/paired03.bam")) - val BEDFile1 = new File(resourcePath("/rrna01.bed")) - val minArgList = List("-I", sBAMFile1.toString, "-l", BEDFile1.toString, "-o", "mock.bam") - - @Test def testMakeFeatureFromBED() = { - val intervals: Vector[Feature] = makeFeatureFromFile(BEDFile1).toVector - intervals.length should be (3) - intervals.head.getChr should === ("chrQ") - intervals.head.getStart should be (991) - intervals.head.getEnd should be (1000) - intervals.last.getChr should === ("chrQ") - intervals.last.getStart should be (291) - intervals.last.getEnd should be (320) + val pBamFile3 = new File(resourcePath("/paired03.bam")) + val BedFile1 = new File(resourcePath("/rrna01.bed")) + val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam") + + @Test def testMakeFeatureFromBed() = { + val intervals: Vector[Feature] = makeFeatureFromFile(BedFile1).toVector + intervals.length should be(3) + intervals.head.getChr should ===("chrQ") + intervals.head.getStart should be(991) + intervals.head.getEnd should be(1000) + intervals.last.getChr should ===("chrQ") + intervals.last.getStart should be(291) + intervals.last.getEnd should be(320) } - @Test def testSingleBAMDefault() = { + @Test def testSingleBamDefault() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 @@ -125,317 +123,317 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { // NOTE: while it's possible to have our filter produce false positives // it is highly unlikely in our test cases as we are setting a very low FP rate // and only filling the filter with a few items - val isFilteredOut = makeFilterOutFunction(intervals, 
sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) // by default, set elements are SAM record read names - assert(!isFilteredOut(sBAMRecs1(0))) - assert(isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe true + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMIntervalWithoutChr() = { + @Test def testSingleBamIntervalWithoutChr() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("Q", 291, 320), new BasicFeature("chrQ", 451, 480), new BasicFeature("P", 191, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe true + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMDefaultPartialExonOverlap() = { + @Test def testSingleBamDefaultPartialExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 881, 1000) // overlaps first exon of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(!isFilteredOut(sBAMRecs1(2))) - assert(!isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe false + filterNotFunc(sBamRecs1(3)) shouldBe false + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe true + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMDefaultNoExonOverlap() = { + @Test def testSingleBamDefaultNoExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrP", 881, 1000), new BasicFeature("chrQ", 900, 920) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(!isFilteredOut(sBAMRecs1(2))) - assert(!isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = 
makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe false + filterNotFunc(sBamRecs1(3)) shouldBe false + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMFilterOutMultiNotSet() = { + @Test def testSingleBamFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp, filterOutMulti = false) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMFilterMinMapQ() = { + @Test def testSingleBamFilterMinMapQ() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60) - assert(!isFilteredOut(sBAMRecs2(0))) + filterNotFunc(sBamRecs2(0)) shouldBe false // r01 is not in since it is below the MAPQ threshold - assert(!isFilteredOut(sBAMRecs2(1))) - assert(!isFilteredOut(sBAMRecs2(2))) - assert(isFilteredOut(sBAMRecs2(3))) - assert(isFilteredOut(sBAMRecs2(4))) - assert(isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(1)) shouldBe false + filterNotFunc(sBamRecs2(2)) shouldBe false + filterNotFunc(sBamRecs2(3)) shouldBe true + filterNotFunc(sBamRecs2(4)) shouldBe true + filterNotFunc(sBamRecs2(5)) shouldBe true + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testSingleBAMFilterMinMapQFilterOutMultiNotSet() = { + @Test def testSingleBamFilterMinMapQFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60, filterOutMulti = false) - assert(!isFilteredOut(sBAMRecs2(0))) - assert(!isFilteredOut(sBAMRecs2(1))) + filterNotFunc(sBamRecs2(0)) shouldBe false + filterNotFunc(sBamRecs2(1)) shouldBe false // this r01 is not in since it is below the MAPQ threshold - 
assert(!isFilteredOut(sBAMRecs2(2))) - assert(isFilteredOut(sBAMRecs2(3))) - assert(isFilteredOut(sBAMRecs2(4))) + filterNotFunc(sBamRecs2(2)) shouldBe false + filterNotFunc(sBamRecs2(3)) shouldBe true + filterNotFunc(sBamRecs2(4)) shouldBe true // this r07 is not in since filterOuMulti is false - assert(!isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(5)) shouldBe false + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testSingleBAMFilterReadGroupIDs() = { + @Test def testSingleBamFilterReadGroupIDs() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, - readGroupIDs = Set("002", "003")) - assert(!isFilteredOut(sBAMRecs2(0))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, + readGroupIds = Set("002", "003")) + filterNotFunc(sBamRecs2(0)) shouldBe false // only r01 is in the set since it is RG 002 - assert(isFilteredOut(sBAMRecs2(1))) - assert(isFilteredOut(sBAMRecs2(2))) - assert(!isFilteredOut(sBAMRecs2(3))) - assert(!isFilteredOut(sBAMRecs2(4))) - assert(!isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(1)) shouldBe true + filterNotFunc(sBamRecs2(2)) shouldBe true + filterNotFunc(sBamRecs2(3)) shouldBe false + filterNotFunc(sBamRecs2(4)) shouldBe false + filterNotFunc(sBamRecs2(5)) shouldBe false + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testPairBAMDefault() = { + @Test def testPairBamDefault() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(isFilteredOut(pBAMRecs1(2))) - assert(isFilteredOut(pBAMRecs1(3))) - assert(isFilteredOut(pBAMRecs1(4))) - assert(isFilteredOut(pBAMRecs1(5))) - assert(isFilteredOut(pBAMRecs1(6))) - assert(isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(!isFilteredOut(pBAMRecs1(10))) - assert(!isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe true + filterNotFunc(pBamRecs1(3)) shouldBe true + filterNotFunc(pBamRecs1(4)) shouldBe true + filterNotFunc(pBamRecs1(5)) shouldBe true + filterNotFunc(pBamRecs1(6)) shouldBe true + filterNotFunc(pBamRecs1(7)) shouldBe true + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe false + filterNotFunc(pBamRecs1(11)) shouldBe false + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMPartialExonOverlap() = { + @Test def 
testPairBamPartialExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 891, 1000) ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(!isFilteredOut(pBAMRecs1(2))) - assert(!isFilteredOut(pBAMRecs1(3))) - assert(!isFilteredOut(pBAMRecs1(4))) - assert(!isFilteredOut(pBAMRecs1(5))) - assert(!isFilteredOut(pBAMRecs1(6))) - assert(!isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(isFilteredOut(pBAMRecs1(10))) - assert(isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe false + filterNotFunc(pBamRecs1(3)) shouldBe false + filterNotFunc(pBamRecs1(4)) shouldBe false + filterNotFunc(pBamRecs1(5)) shouldBe false + filterNotFunc(pBamRecs1(6)) shouldBe false + filterNotFunc(pBamRecs1(7)) shouldBe false + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe true + filterNotFunc(pBamRecs1(11)) shouldBe true + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMFilterOutMultiNotSet() = { + @Test def testPairBamFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp, filterOutMulti = false) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(!isFilteredOut(pBAMRecs1(2))) - assert(!isFilteredOut(pBAMRecs1(3))) - assert(isFilteredOut(pBAMRecs1(4))) - assert(isFilteredOut(pBAMRecs1(5))) - assert(isFilteredOut(pBAMRecs1(6))) - assert(isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(!isFilteredOut(pBAMRecs1(10))) - assert(!isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe false + filterNotFunc(pBamRecs1(3)) shouldBe false + filterNotFunc(pBamRecs1(4)) shouldBe true + filterNotFunc(pBamRecs1(5)) shouldBe true + filterNotFunc(pBamRecs1(6)) shouldBe true + filterNotFunc(pBamRecs1(7)) shouldBe true + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe false + filterNotFunc(pBamRecs1(11)) shouldBe false + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMFilterMinMapQ() = { + @Test def testPairBamFilterMinMapQ() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, 
pBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60) // r01 is not in since it is below the MAPQ threshold - assert(!isFilteredOut(pBAMRecs2(0))) - assert(!isFilteredOut(pBAMRecs2(1))) - assert(!isFilteredOut(pBAMRecs2(2))) - assert(!isFilteredOut(pBAMRecs2(3))) - assert(!isFilteredOut(pBAMRecs2(4))) - assert(!isFilteredOut(pBAMRecs2(5))) - assert(isFilteredOut(pBAMRecs2(6))) - assert(isFilteredOut(pBAMRecs2(7))) - assert(!isFilteredOut(pBAMRecs2(8))) - assert(!isFilteredOut(pBAMRecs2(9))) + filterNotFunc(pBamRecs2(0)) shouldBe false + filterNotFunc(pBamRecs2(1)) shouldBe false + filterNotFunc(pBamRecs2(2)) shouldBe false + filterNotFunc(pBamRecs2(3)) shouldBe false + filterNotFunc(pBamRecs2(4)) shouldBe false + filterNotFunc(pBamRecs2(5)) shouldBe false + filterNotFunc(pBamRecs2(6)) shouldBe true + filterNotFunc(pBamRecs2(7)) shouldBe true + filterNotFunc(pBamRecs2(8)) shouldBe false + filterNotFunc(pBamRecs2(9)) shouldBe false } - @Test def testPairBAMFilterReadGroupIDs() = { + @Test def testPairBamFilterReadGroupIDs() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, - readGroupIDs = Set("002", "003")) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, + readGroupIds = Set("002", "003")) // only r01 is in the set since it is RG 002 - assert(!isFilteredOut(pBAMRecs2(0))) - assert(!isFilteredOut(pBAMRecs2(1))) - assert(isFilteredOut(pBAMRecs2(2))) - assert(isFilteredOut(pBAMRecs2(3))) - assert(isFilteredOut(pBAMRecs2(4))) - assert(isFilteredOut(pBAMRecs2(5))) - assert(!isFilteredOut(pBAMRecs2(6))) - assert(!isFilteredOut(pBAMRecs2(7))) - assert(!isFilteredOut(pBAMRecs2(8))) - assert(!isFilteredOut(pBAMRecs2(9))) + filterNotFunc(pBamRecs2(0)) shouldBe false + filterNotFunc(pBamRecs2(1)) shouldBe false + filterNotFunc(pBamRecs2(2)) shouldBe true + filterNotFunc(pBamRecs2(3)) shouldBe true + filterNotFunc(pBamRecs2(4)) shouldBe true + filterNotFunc(pBamRecs2(5)) shouldBe true + filterNotFunc(pBamRecs2(6)) shouldBe false + filterNotFunc(pBamRecs2(7)) shouldBe false + filterNotFunc(pBamRecs2(8)) shouldBe false + filterNotFunc(pBamRecs2(9)) shouldBe false } - @Test def testWriteSingleBAMDefault() = { + @Test def testWriteSingleBamDefault() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, sBAMFile1, outBAM) + writeFilteredBam(mockFilterOutFunc, sBamFile1, outBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(sBAMFile1.getName, outBAM.getName, 4, 3) + .format(sBamFile1.getName, outBam.getName, 4, 3) ) - val exp = new SAMFileReader(sBAMFile3).asScala - val obs = new SAMFileReader(outBAM).asScala + val exp = new SAMFileReader(sBamFile3).asScala + val obs = new SAMFileReader(outBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should 
=== (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) } - @Test def testWriteSingleBAMAndFilteredBAM() = { + @Test def testWriteSingleBamAndFilteredBAM() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() - val filteredOutBAM = makeTempBAM() - val filteredOutBAMIndex = makeTempBAMIndex(filteredOutBAM) - filteredOutBAM.deleteOnExit() - filteredOutBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() + val filteredOutBam = makeTempBam() + val filteredOutBamIndex = makeTempBamIndex(filteredOutBam) + filteredOutBam.deleteOnExit() + filteredOutBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, sBAMFile1, outBAM, filteredOutBAM = filteredOutBAM) + writeFilteredBam(mockFilterOutFunc, sBamFile1, outBam, filteredOutBam = filteredOutBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(sBAMFile1.getName, outBAM.getName, 4, 3) + .format(sBamFile1.getName, outBam.getName, 4, 3) ) - val exp = new SAMFileReader(sBAMFile4).asScala - val obs = new SAMFileReader(filteredOutBAM).asScala + val exp = new SAMFileReader(sBamFile4).asScala + val obs = new SAMFileReader(filteredOutBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should === (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) - filteredOutBAM should be ('exists) - filteredOutBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) + filteredOutBam should be('exists) + filteredOutBamIndex should be('exists) } - @Test def testWritePairBAMDefault() = { + @Test def testWritePairBamDefault() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, pBAMFile1, outBAM) + writeFilteredBam(mockFilterOutFunc, pBamFile1, outBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(pBAMFile1.getName, outBAM.getName, 8, 6) + .format(pBamFile1.getName, outBam.getName, 8, 6) ) - val exp = new SAMFileReader(pBAMFile3).asScala - val obs = new SAMFileReader(outBAM).asScala + val exp = new SAMFileReader(pBamFile3).asScala + val obs = new SAMFileReader(outBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should === (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) } } \ No newline at end of file
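Editor's note: the core change in this patch is the switch from the orestes bloom-filter to Guava 18.0's com.google.common.hash.BloomFilter, driven by a Funnel[SAMRecord]. The standalone sketch below illustrates that pattern on plain Strings; StringFunnel, the example element strings, and the size/false-positive values are illustrative assumptions only and do not appear in the patch.

import com.google.common.hash.{ BloomFilter, Funnel, PrimitiveSink }

object BloomFilterSketch {
  // Illustrative funnel: feeds a read-name-like String into the filter, the
  // same way SAMFunnel feeds elemFromSam(rec) via putUnencodedChars.
  object StringFunnel extends Funnel[String] {
    override def funnel(from: String, into: PrimitiveSink): Unit =
      into.putUnencodedChars(from)
  }

  def main(args: Array[String]): Unit = {
    val expectedInsertions = 1000 // plays the role of bloomSize.toInt
    val falsePositiveRate = 1e-10 // plays the role of bloomFp

    // Create the filter once and accumulate elements with put(), mirroring
    // the foldLeft over filtered reads in makeFilterNotFunction.
    val filter = Seq("r01_190", "r01_290", "r04_450")
      .foldLeft(BloomFilter.create[String](StringFunnel, expectedInsertions, falsePositiveRate)) {
        (acc, elem) => { acc.put(elem); acc }
      }

    // mightContain never returns a false negative; false positives occur at
    // roughly the configured rate, which is why the tool uses the filter only
    // to mark reads for exclusion.
    println(filter.mightContain("r01_190")) // true
    println(filter.mightContain("r99_123")) // false (with very high probability)
  }
}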