biopet.biopet · Commit 657702dc
authored Oct 06, 2014 by bow

Code reformat + unused import removal in WipeReads

parent ca6ddab9

Changes 2
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/apps/WipeReads.scala
...
@@ -11,7 +11,6 @@ import scala.io.Source
 import com.twitter.algebird.{ BF, BloomFilter }
 import htsjdk.samtools.SAMFileReader
 import htsjdk.samtools.SAMFileReader.QueryInterval
 import htsjdk.samtools.SAMFileWriter
 import htsjdk.samtools.SAMFileWriterFactory
 import htsjdk.samtools.SAMRecord
 import org.apache.commons.io.FilenameUtils.getExtension
...
@@ -21,10 +20,11 @@ import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
 import nl.lumc.sasc.biopet.core.config.Configurable

 // TODO: finish implementation for usage in pipelines
-/** WipeReads function class for usage in Biopet pipelines
- *
- * @param root Configuration object for the pipeline
- */
+/**
+ * WipeReads function class for usage in Biopet pipelines
+ *
+ * @param root Configuration object for the pipeline
+ */
 class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {

   javaMainClass = getClass.getName
...
@@ -37,7 +37,6 @@ class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {
 }

 object WipeReads {

   /** Container type for command line flags */
...
@@ -52,15 +51,16 @@ object WipeReads {
     val Identical, Opposite, Both = Value
   }

-  /** Function to create iterator over intervals from input interval file
-   *
-   * @param inFile input interval file
-   */
+  /**
+   * Function to create iterator over intervals from input interval file
+   *
+   * @param inFile input interval file
+   */
   def makeRawIntervalFromFile(inFile: File): Iterator[RawInterval] = {

     /** Function to create iterator from BED file */
     def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] =
-      // BED file coordinates are 0-based, half open so we need to do some conversion
+      // BED file coordinates are 0-based, half open so we need to do some conversion
       Source.fromFile(inFile)
         .getLines()
         .filterNot(_.trim.isEmpty)
...
@@ -68,7 +68,7 @@ object WipeReads {
         .map(line => line.trim.split("\t") match {
           case Array(chrom, start, end)                   => new RawInterval(chrom, start.toInt + 1, end.toInt, "")
           case Array(chrom, start, end, _, _, strand, _*) => new RawInterval(chrom, start.toInt + 1, end.toInt, strand)
-        })
+        })

     // TODO: implementation
     /** Function to create iterator from refFlat file */
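A note on the coordinate conversion in makeRawIntervalFromBED above: BED intervals are 0-based and half-open, while the intervals used for BAM queries here are 1-based and fully closed, hence the start.toInt + 1. A minimal standalone sketch of the same conversion (the Interval case class and bedLineToInterval helper are illustrative stand-ins, not part of WipeReads):

// Illustrative sketch of the BED -> 1-based interval conversion shown in the diff.
case class Interval(chrom: String, start: Int, end: Int, strand: String)

def bedLineToInterval(line: String): Interval =
  line.trim.split("\t") match {
    // 3-column BED: no strand information
    case Array(chrom, start, end)                   => Interval(chrom, start.toInt + 1, end.toInt, "")
    // 6-or-more-column BED: strand is the sixth field
    case Array(chrom, start, end, _, _, strand, _*) => Interval(chrom, start.toInt + 1, end.toInt, strand)
  }

// A BED record spanning bases 290..319 (0-based, half-open) becomes 291..320 (1-based, closed):
// bedLineToInterval("chrQ\t290\t320\tfeat1\t0\t+") == Interval("chrQ", 291, 320, "+")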
...
@@ -93,26 +93,26 @@ object WipeReads {
   }

   // TODO: set minimum fraction for overlap
-  /** Function to create function to check SAMRecord for exclusion in filtered BAM file.
-   *
-   * The returned function evaluates all filtered-in SAMRecord to false.
-   *
-   * @param iv iterator yielding RawInterval objects
-   * @param inBAM input BAM file
-   * @param inBAMIndex input BAM file index
-   * @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
-   * @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
-   * @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
-   * @param bloomSize expected size of elements to contain in the Bloom filter
-   * @param bloomFp expected Bloom filter false positive rate
-   * @return function that checks whether a SAMRecord or String is to be excluded
-   */
+  /**
+   * Function to create function to check SAMRecord for exclusion in filtered BAM file.
+   *
+   * The returned function evaluates all filtered-in SAMRecord to false.
+   *
+   * @param iv iterator yielding RawInterval objects
+   * @param inBAM input BAM file
+   * @param inBAMIndex input BAM file index
+   * @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
+   * @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
+   * @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
+   * @param bloomSize expected size of elements to contain in the Bloom filter
+   * @param bloomFp expected Bloom filter false positive rate
+   * @return function that checks whether a SAMRecord or String is to be excluded
+   */
   def makeFilterOutFunction(iv: Iterator[RawInterval],
                             inBAM: File, inBAMIndex: File = null,
                             filterOutMulti: Boolean = true,
                             minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
-                            bloomSize: Int = 100000000, bloomFp: Double = 1e-10): (Any => Boolean) = {
+                            bloomSize: Int = 100000000, bloomFp: Double = 1e-10): (Any => Boolean) = {

     // TODO: implement optional index creation
     /** Function to check for BAM file index and return a SAMFileReader given a File */
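For orientation, the returned Any => Boolean closure is exercised by WipeReadsUnitTest further down in this commit roughly as follows (intervals and sbam01 are test fixtures from that file; this is a condensed excerpt, not additional API):

// Condensed from the unit tests: build the predicate once, then feed it SAM strings (or SAMRecords).
val intervals = Iterator(RawInterval("chrQ", 991, 1000, "+"))
val isFilteredOut: Any => Boolean =
  makeFilterOutFunction(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10, filterOutMulti = false)

// Reads that do not overlap the target intervals evaluate to false, i.e. they are kept.
assert(!isFilteredOut("r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))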
...
@@ -127,15 +127,16 @@ object WipeReads {
       sfr
     }

-    /** Function to query intervals from a BAM file
-     *
-     * The function still works when only either one of the interval or BAM
-     * file contig is prepended with "chr"
-     *
-     * @param inBAM BAM file to query as SAMFileReader
-     * @param ri raw interval object containing query
-     * @return QueryInterval wrapped in Option
-     */
+    /**
+     * Function to query intervals from a BAM file
+     *
+     * The function still works when only either one of the interval or BAM
+     * file contig is prepended with "chr"
+     *
+     * @param inBAM BAM file to query as SAMFileReader
+     * @param ri raw interval object containing query
+     * @return QueryInterval wrapped in Option
+     */
     def monadicMakeQueryInterval(inBAM: SAMFileReader, ri: RawInterval): Option[QueryInterval] =
       if (inBAM.getFileHeader.getSequenceIndex(ri.chrom) > -1)
         Some(inBAM.makeQueryInterval(ri.chrom, ri.start, ri.end))
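The scaladoc note about the "chr" prefix means that a contig may be spelled "chrQ" in the interval file and "Q" in the BAM header (or vice versa) and the query should still resolve; the part of the function handling that is collapsed in this view. A standalone sketch of one way such a prefix-tolerant lookup can work (resolveContig and the set of contig names are hypothetical, not taken from WipeReads):

// Hypothetical illustration of "chr"-prefix-tolerant contig lookup.
def resolveContig(available: Set[String], chrom: String): Option[String] = {
  val alternate = if (chrom.startsWith("chr")) chrom.stripPrefix("chr") else "chr" + chrom
  if (available.contains(chrom)) Some(chrom)
  else if (available.contains(alternate)) Some(alternate)
  else None
}

// resolveContig(Set("chrQ"), "Q")    == Some("chrQ")
// resolveContig(Set("Q"), "chrQ")    == Some("Q")
// resolveContig(Set("chr1"), "chrX") == None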
...
@@ -149,12 +150,13 @@ object WipeReads {
         None

     // TODO: can we accumulate errors / exceptions instead of ignoring them?
-    /** Function to query mate from a BAM file
-     *
-     * @param inBAM BAM file to query as SAMFileReader
-     * @param rec SAMRecord whose mate will be queried
-     * @return SAMRecord wrapped in Option
-     */
+    /**
+     * Function to query mate from a BAM file
+     *
+     * @param inBAM BAM file to query as SAMFileReader
+     * @param rec SAMRecord whose mate will be queried
+     * @return SAMRecord wrapped in Option
+     */
     def monadicMateQuery(inBAM: SAMFileReader, rec: SAMRecord): Option[SAMRecord] = {
       // catching unpaired read here since queryMate will raise an exception if not
       if (!rec.getReadPairedFlag) {
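The getReadPairedFlag guard exists because the underlying mate query throws for unpaired reads, so monadicMateQuery wraps its result in Option instead; the rest of the body is collapsed here. A generic sketch of that guard-then-Option pattern (safeMate and lookupMate are illustrative stand-ins, not the htsjdk calls):

// Illustrative pattern: turn a throwing, possibly-null lookup into an Option.
def safeMate[A](isPaired: Boolean, lookupMate: => A): Option[A] =
  if (!isPaired) None                                       // unpaired read: nothing to query
  else scala.util.Try(Option(lookupMate)).getOrElse(None)   // swallow lookup failures, keep null-safety

// safeMate(isPaired = false, sys.error("never evaluated")) == None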
...
@@ -185,41 +187,42 @@ object WipeReads {
     val bfm = BloomFilter(bloomSize, bloomFp, 13)

     val intervals = iv.flatMap(x => monadicMakeQueryInterval(firstBAM, x)).toArray

     val filteredOutSet: BF = firstBAM.queryOverlapping(intervals).asScala
-      // filter for MAPQ on target region reads
-      .filter(x => x.getMappingQuality >= minMapQ)
-      // filter on specific read group IDs
-      .filter(x => rgFilter(x))
-      // TODO: how to directly get SAMRecord and its pairs without multiple flattens?
-      .flatMap(x => Vector(Some(x), monadicMateQuery(secondBAM, x)).flatten)
-      // transfrom SAMRecord to string
-      .map(x => SAMToElem(x))
-      // build bloom filter using fold to prevent loading all strings to memory
-      .foldLeft(bfm.create())(_.+(_))
+      // filter for MAPQ on target region reads
+      .filter(x => x.getMappingQuality >= minMapQ)
+      // filter on specific read group IDs
+      .filter(x => rgFilter(x))
+      // TODO: how to directly get SAMRecord and its pairs without multiple flattens?
+      .flatMap(x => Vector(Some(x), monadicMateQuery(secondBAM, x)).flatten)
+      // transfrom SAMRecord to string
+      .map(x => SAMToElem(x))
+      // build bloom filter using fold to prevent loading all strings to memory
+      .foldLeft(bfm.create())(_.+(_))

     if (filterOutMulti)
       (rec: Any) => rec match {
         case rec: SAMRecord => filteredOutSet.contains(rec.getReadName).isTrue
         case rec: String    => filteredOutSet.contains(rec).isTrue
-        case _              => false
+        case _              => false
       }
     else
       (rec: Any) => rec match {
         case rec: SAMRecord => filteredOutSet.contains(rec.getSAMString).isTrue
         case rec: String    => filteredOutSet.contains(rec).isTrue
-        case _              => false
+        case _              => false
       }
   }

   // TODO: implement stats file output
-  /** Function to filter input BAM and write its output to the filesystem
-   *
-   * @param filterFunc filter function that evaluates true for excluded SAMRecord
-   * @param inBAM input BAM file
-   * @param outBAM output BAM file
-   * @param writeIndex whether to write index for output BAM file
-   * @param async whether to write asynchronously
-   * @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file
-   */
+  /**
+   * Function to filter input BAM and write its output to the filesystem
+   *
+   * @param filterFunc filter function that evaluates true for excluded SAMRecord
+   * @param inBAM input BAM file
+   * @param outBAM output BAM file
+   * @param writeIndex whether to write index for output BAM file
+   * @param async whether to write asynchronously
+   * @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file
+   */
   def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File,
                        writeIndex: Boolean = true, async: Boolean = true,
                        filteredOutBAM: File = null) = {
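The pipeline above folds the selected read names (or full SAM strings) into an Algebird Bloom filter, so membership of filtered-out reads can later be tested without keeping every string in memory. A small self-contained sketch of that idiom using the same Algebird calls that appear in the diff (the sizes and names here are arbitrary):

// Minimal sketch of the Bloom-filter accumulation used by makeFilterOutFunction.
import com.twitter.algebird.{ BF, BloomFilter }

val bfm = BloomFilter(1000, 1e-10, 13)           // expected entries, false-positive rate, seed
val names = Iterator("r01", "r03", "r05")        // stand-in for the filtered read names

// Fold the names into one BF without materialising the whole collection.
val filteredOutSet: BF = names.foldLeft(bfm.create())(_.+(_))

filteredOutSet.contains("r01").isTrue            // very likely true
filteredOutSet.contains("r02").isTrue            // false, up to the configured false-positive rate

The trade-off is that contains can report false positives at the configured rate, which is why the result is an approximate boolean queried with isTrue rather than a plain Boolean.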
...
@@ -247,13 +250,15 @@ object WipeReads {
     }
   }

-  /** Recursive function to parse command line options
-   *
-   * @param opts OptionMap instance which may contain parsed options
-   * @param list remaining command line arguments
-   * @return OptionMap instance
-   */
+  /**
+   * Recursive function to parse command line options
+   *
+   * @param opts OptionMap instance which may contain parsed options
+   * @param list remaining command line arguments
+   * @return OptionMap instance
+   */
   def parseOption(opts: OptionMap, list: List[String]): OptionMap =
+    // format: OFF
     list match {
       case Nil => opts
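parseOption follows the usual accumulate-and-recurse pattern over the argument list: each recognised flag adds an entry to the map and recursion continues on the remaining tail. A self-contained sketch of that pattern (the flag names and the OptionMap alias below are illustrative, not the actual WipeReads option set):

// Illustrative recursive argument parser in the same accumulate-and-recurse style as parseOption.
type OptionMap = Map[Symbol, Any]

@annotation.tailrec
def parseArgs(opts: OptionMap, args: List[String]): OptionMap = args match {
  case Nil                        => opts                                        // nothing left: done
  case "--input" :: value :: tail => parseArgs(opts + ('input -> value), tail)   // flag that takes a value
  case "--verbose" :: tail        => parseArgs(opts + ('verbose -> true), tail)  // boolean flag
  case option :: _                => throw new IllegalArgumentException("Unexpected or duplicate option flag: " + option)
}

// parseArgs(Map.empty, List("--input", "input.bam", "--verbose")) == Map('input -> "input.bam", 'verbose -> true)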
...
@@ -281,6 +286,7 @@ object WipeReads {
       case option :: tail => throw new IllegalArgumentException("Unexpected or duplicate option flag: " + option)
     }
+    // format: ON

   /** Function to validate OptionMap instances */
   private def validateOption(opts: OptionMap): Unit = {
...
@@ -301,7 +307,7 @@ object WipeReads {
   /** Function that returns the given BAM file if it exists and is indexed */
   def checkInputBAM(inBAM: File): File = {
     // input BAM must have a .bam.bai index
-    if (new File(inBAM.getPath + ".bai").exists || new File(inBAM.getPath + ".bam.bai").exists)
+    if (new File(inBAM.getPath + ".bai").exists || new File(inBAM.getPath + ".bam.bai").exists)
       checkInputFile(inBAM)
     else
       throw new IOException("Index for input BAM file " + inBAM.getPath + " not found")
...
biopet-framework/src/test/scala/nl/lumc/sasc/biopet/core/apps/WipeReadsUnitTest.scala
...
@@ -12,7 +12,6 @@ import htsjdk.samtools.{ SAMFileReader, SAMRecord }
 import org.scalatest.Assertions
 import org.testng.annotations.Test

 class WipeReadsUnitTest extends Assertions {

   import WipeReads._
...
@@ -78,7 +77,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
     )
     val isFilteredOut = makeFilterOutFunction(intervals, sbam01, bloomSize = 1000, bloomFp = 1e-10,
-      filterOutMulti = false)
+      filterOutMulti = false)
     assert(!isFilteredOut("r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))
     assert(!isFilteredOut("r01\t16\tchrQ\t190\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))
     assert(!isFilteredOut("r03\t16\tchrQ\t690\t60\t10M\t*\t0\t0\tCCCCCTTTTT\tHHHHHHHHHH\tRG:Z:001\n"))
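For readers unfamiliar with the raw strings passed to isFilteredOut in these tests: each one is a single tab-separated SAM record. Breaking down the first one (standard SAM columns; nothing here is WipeReads-specific):

// QNAME  FLAG  RNAME  POS  MAPQ  CIGAR  RNEXT  PNEXT  TLEN  SEQ         QUAL        tags
// r02    0     chrQ   50   60    10M    *      0      0     TACGTACGTA  EEFFGGHHII  RG:Z:001
val samLine = "r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"
val Array(qname, flag, rname, pos, mapq, cigar, _*) = samLine.trim.split("\t")
// qname == "r02", rname == "chrQ", mapq == "60"; the RG:Z:001 tag carries the read group ID
// that the readGroupIDs parameter of makeFilterOutFunction matches against.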
...
@@ -98,7 +97,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 451, 480, "+")
     )
     val isFilteredOut = makeFilterOutFunction(intervals, sbam02, bloomSize = 1000, bloomFp = 1e-10,
-      minMapQ = 60)
+      minMapQ = 60)
     // r01 is not in since it is below the MAPQ threshold
     assert(!isFilteredOut("r01"))
     assert(!isFilteredOut("r02"))
...
@@ -114,7 +113,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 451, 480, "+")
     )
     val isFilteredOut = makeFilterOutFunction(intervals, sbam02, bloomSize = 1000, bloomFp = 1e-10,
-      minMapQ = 60, filterOutMulti = false)
+      minMapQ = 60, filterOutMulti = false)
     assert(!isFilteredOut("r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))
     assert(!isFilteredOut("r01\t16\tchrQ\t190\t30\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:002\n"))
     // this r01 is not in since it is below the MAPQ threshold
...
@@ -133,7 +132,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 451, 480, "+")
     )
     val isFilteredOut = makeFilterOutFunction(intervals, sbam02, bloomSize = 1000, bloomFp = 1e-10,
-      readGroupIDs = Set("002", "003"))
+      readGroupIDs = Set("002", "003"))
     assert(!isFilteredOut("r02"))
     assert(!isFilteredOut("r04"))
     assert(!isFilteredOut("r06"))
...
@@ -168,7 +167,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 991, 1000, "+") // overlaps nothing; lies in the spliced region of r05
     )
     val isFilteredOut = makeFilterOutFunction(intervals, pbam01, bloomSize = 1000, bloomFp = 1e-10,
-      filterOutMulti = false)
+      filterOutMulti = false)
     assert(!isFilteredOut("r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001\n"))
     assert(!isFilteredOut("r02\t147\tchrQ\t90\t60\t10M\t=\t50\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001\n"))
     assert(!isFilteredOut("r01\t163\tchrQ\t150\t60\t10M\t=\t190\t50\tAAAAAGGGGG\tGGGGGGGGGG\tRG:Z:001\n"))
...
@@ -193,7 +192,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 451, 480, "+")
     )
     val isFilteredOut = makeFilterOutFunction(intervals, pbam02, bloomSize = 1000, bloomFp = 1e-10,
-      minMapQ = 60)
+      minMapQ = 60)
     // r01 is not in since it is below the MAPQ threshold
     assert(!isFilteredOut("r01"))
     assert(!isFilteredOut("r02"))
...
@@ -208,7 +207,7 @@ class WipeReadsUnitTest extends Assertions {
       RawInterval("chrQ", 451, 480, "+")
     )
     val isFilteredOut = makeFilterOutFunction(intervals, pbam02, bloomSize = 1000, bloomFp = 1e-10,
-      readGroupIDs = Set("002", "003"))
+      readGroupIDs = Set("002", "003"))
     assert(!isFilteredOut("r02"))
     assert(!isFilteredOut("r04"))
     assert(!isFilteredOut("r06"))
...
@@ -226,7 +225,7 @@ class WipeReadsUnitTest extends Assertions {
     writeFilteredBAM(mockFilterOutFunc, sbam01, outBAM)
     val exp = new SAMFileReader(sbam03).asScala
     val obs = new SAMFileReader(outBAM).asScala
-    val res = for ( (e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
+    val res = for ((e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
     assert(res.reduceLeft(_ && _))
     assert(outBAM.exists)
     assert(outBAMIndex.exists)
...
@@ -245,7 +244,7 @@ class WipeReadsUnitTest extends Assertions {
     writeFilteredBAM(mockFilterOutFunc, sbam01, outBAM, filteredOutBAM = filteredOutBAM)
     val exp = new SAMFileReader(sbam04).asScala
     val obs = new SAMFileReader(filteredOutBAM).asScala
-    val res = for ( (e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
+    val res = for ((e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
     assert(res.reduceLeft(_ && _))
     assert(outBAM.exists)
     assert(outBAMIndex.exists)
...
@@ -262,7 +261,7 @@ class WipeReadsUnitTest extends Assertions {
     writeFilteredBAM(mockFilterOutFunc, pbam01, outBAM)
     val exp = new SAMFileReader(pbam03).asScala
     val obs = new SAMFileReader(outBAM).asScala
-    val res = for ( (e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
+    val res = for ((e, o) <- exp.zip(obs)) yield e.getSAMString === o.getSAMString
     assert(res.reduceLeft(_ && _))
     assert(outBAM.exists)
     assert(outBAMIndex.exists)
...