Commit 149f43d4 authored by bow's avatar bow
Browse files

Add tests for WipeReads overlap finders

parent d502cf63
......@@ -232,6 +232,4 @@ object WipeReads {
|By default, if the removed reads are also mapped to other regions outside
|the given ones, they will also be removed.
""".stripMargin
}
Test datasets for the Biopet Framework
======================================
* Please add an entry here when adding a new test dataset.
Filename Explanation
======== ===========
single01.sam single-end SAM file, used for testing WipeReads
single01.bam single01.sam compressed with samtools v0.1.18
single01.bam.bai index for single01.bam
single02.sam single-end SAM file, used for testing WipeReads
single02.bam single02.sam compressed with samtools v0.1.18
single02.bam.bai index for single02.bam
paired01.sam paired-end SAM file, used for testing WipeReads
paired01.bam paired01.sam compressed with samtools v0.1.18
paired01.bam.bai index for paired01.bam
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:paired-end reads SM:WipeReadsTestCase
r02 99 chrQ 50 60 10M = 90 50 TACGTACGTA EEFFGGHHII RG:Z:001
r02 147 chrQ 90 60 10M = 50 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r01 163 chrQ 250 60 10M = 290 50 AAAAAGGGGG GGGGGGGGGG RG:Z:001
r01 83 chrQ 290 60 10M = 250 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r04 99 chrQ 450 60 10M = 490 50 CGTACGTACG EEFFGGHHII RG:Z:001
r04 147 chrQ 490 60 10M = 450 -50 GCATGCATGC EEFFGGHHII RG:Z:001
r03 163 chrQ 650 60 10M = 690 50 TTTTTCCCCC HHHHHHHHHH RG:Z:001
r03 83 chrQ 690 60 10M = 650 -50 CCCCCTTTTT HHHHHHHHHH RG:Z:001
r02 99 chrQ 850 60 5M100N5M = 1290 50 TACGTACGTA EEFFGGHHII RG:Z:001
r02 147 chrQ 1290 60 5M100N5M = 1250 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
r06 4 * 0 0 * * 0 0 GCGCGCGCGC HIHIHIHIHI RG:Z:001
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:1000
@RG ID:002 DS:single-end reads SM:WipeReadsTestCase
r02 0 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:002
r01 16 chrQ 290 60 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:002
r04 0 chrQ 450 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:002
r03 16 chrQ 690 60 10M * 0 0 CCCCCTTTTT HHHHHHHHHH RG:Z:002
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:002
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:single-end reads SM:WipeReadsTestCase
r02 0 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r01 16 chrQ 190 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r01 16 chrQ 290 60 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r04 0 chrQ 450 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:001
r03 16 chrQ 690 60 10M * 0 0 CCCCCTTTTT HHHHHHHHHH RG:Z:001
r05 0 chrQ 890 60 5M200N5M * 0 0 GATACGATAC FEFEFEFEFE RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:single-end reads SM:WipeReadsTestCase
@RG ID:002 DS:single-end reads SM:WipeReadsTestCase
r02 0 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r01 16 chrQ 190 30 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:002
r01 16 chrQ 290 30 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:002
r04 0 chrQ 450 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
r08 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:002
......@@ -18,9 +18,113 @@ class WipeReadsUnitTest extends Assertions {
private def resourcePath(p: String): String =
Paths.get(getClass.getResource(p).toURI).toString
val bam01 = resourcePath("/single.bam")
val bed01 = resourcePath("/rrna01.bed")
val minArgList = List("-I", bam01.toString, "-l", bed01.toString, "-o", "mock.bam")
val sbam01 = new File(resourcePath("/single01.bam"))
val sbam02 = new File(resourcePath("/single02.bam"))
val pbam01 = new File(resourcePath("/paired01.bam"))
val bed01 = new File(resourcePath("/rrna01.bed"))
val minArgList = List("-I", sbam01.toString, "-l", bed01.toString, "-o", "mock.bam")
@Test def rawIntervalBED() = {
val intervals: Vector[RawInterval] = makeRawIntervalFromFile(bed01).toVector
assert(intervals.length == 3)
assert(intervals.head.chrom == "chrQ")
assert(intervals.head.start == 291)
assert(intervals.head.end == 320)
assert(intervals.head.strand == "+")
assert(intervals.last.chrom == "chrQ")
assert(intervals.last.start == 991)
assert(intervals.last.end == 1000)
assert(intervals.last.strand == "+")
}
@Test def testSingleBAMDefault() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"), // overlaps r01, second hit,
RawInterval("chrQ", 451, 480, "+"), // overlaps r04
RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
)
// NOTE: while it's possible to have our filter produce false positives
// it is highly unlikely in our test cases as we are setting a very low FP rate
// and only filling the filter with a few items
val bf = makeBloomFilter(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10)
// by default, set elements are SAM record read names
assert(!bf.contains("r02").isTrue)
assert(!bf.contains("r03").isTrue)
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!bf.contains("r05").isTrue)
*/
assert(!bf.contains("r06").isTrue)
assert(bf.contains("r01").isTrue)
assert(bf.contains("r04").isTrue)
}
@Test def testSingleBAMFilterOutMultiNotSet() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"), // overlaps r01, second hit,
RawInterval("chrQ", 451, 480, "+"), // overlaps r04
RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
)
val bf = makeBloomFilter(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10, filterOutMulti = false)
assert(!bf.contains("r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n").isTrue)
assert(!bf.contains("r01\t16\tchrQ\t190\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n").isTrue)
assert(!bf.contains("r03\t16\tchrQ\t690\t60\t10M\t*\t0\t0\tCCCCCTTTTT\tHHHHHHHHHH\tRG:Z:001\n").isTrue)
assert(!bf.contains("r06\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001\n").isTrue)
assert(!bf.contains("r06\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001\n").isTrue)
assert(bf.contains("r01\t16\tchrQ\t290\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001\n").isTrue)
assert(bf.contains("r04\t0\tchrQ\t450\t60\t10M\t*\t0\t0\tCGTACGTACG\tEEFFGGHHII\tRG:Z:001\n").isTrue)
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!bf.contains("r05\t0\tchrQ\t890\t60\t5M200N5M\t*\t0\t0\tGATACGATAC\tFEFEFEFEFE\tRG:Z:001\n").isTrue)
*/
}
@Test def testSingleBAMFilterMinMapQ() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"),
RawInterval("chrQ", 451, 480, "+")
)
val bf = makeBloomFilter(intervals, sbam02, bloomSize = 1000, bloomFp = 1e-10, minMapQ = 60)
assert(!bf.contains("r01").isTrue)
assert(!bf.contains("r02").isTrue)
assert(!bf.contains("r06").isTrue)
assert(!bf.contains("r08").isTrue)
// only r04 is in the set since r01 is below the MAPQ threshold
assert(bf.contains("r04").isTrue)
}
@Test def testSingleBAMFilterReadGroupIDs() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"),
RawInterval("chrQ", 451, 480, "+")
)
val bf = makeBloomFilter(intervals, sbam02, bloomSize = 1000, bloomFp = 1e-10, readGroupIDs = Set("002", "003"))
assert(!bf.contains("r02").isTrue)
assert(!bf.contains("r04").isTrue)
assert(!bf.contains("r06").isTrue)
assert(!bf.contains("r08").isTrue)
// only r01 is in the set since it is RG 002
assert(bf.contains("r01").isTrue)
}
@Test def testPairBAMDefault() = {
val intervals: Iterator[RawInterval] = Iterator(
RawInterval("chrQ", 291, 320, "+"), // overlaps r01, second hit,
RawInterval("chrQ", 451, 480, "+"), // overlaps r04
RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
)
val bf = makeBloomFilter(intervals, pbam01, bloomSize = 1000, bloomFp = 1e-10)
// by default, set elements are SAM record read names
assert(!bf.contains("r02").isTrue)
assert(!bf.contains("r03").isTrue)
/* TODO: exclude r05 from set since the mapped read does not overlap with our interval
this is a limitation of htsjdk, as it checks for overlaps with records instead of alignment blocks
assert(!bf.contains("r05").isTrue)
*/
assert(!bf.contains("r06").isTrue)
assert(bf.contains("r01").isTrue)
assert(bf.contains("r04").isTrue)
}
@Test def testOptMinimum() = {
val opts = parseOption(Map(), minArgList)
......@@ -34,7 +138,7 @@ class WipeReadsUnitTest extends Assertions {
assert(pathBAM.exists)
val argList = List(
"--inputBAM", pathBAM.toPath.toString,
"--targetRegions", bed01.toString,
"--targetRegions", bed01.getPath,
"--outputBAM", "mock.bam")
val thrown = intercept[IOException] {
parseOption(Map(), argList)
......@@ -46,7 +150,7 @@ class WipeReadsUnitTest extends Assertions {
@Test def testOptMissingRegions() = {
val pathRegion = "/i/dont/exist.bed"
val argList = List(
"--inputBAM", bam01,
"--inputBAM", sbam01.getPath,
"--targetRegions", pathRegion,
"--outputBAM", "mock.bam"
)
......@@ -118,7 +222,3 @@ class WipeReadsUnitTest extends Assertions {
assert(opts("readGroup") == Vector("g1", "g2"))
}
}
/*
TODO: missing whole argument
*/
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment