Project: biopet.biopet

Commit 19d495c5, authored Dec 09, 2014 by Peter van 't Hof

Merge branch 'release-0.2.0' into 'master'
Release 0.2.0. See merge request !64

Parents: 187bae8b 1572cb02
Changes: 364
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala
deleted 100644 → 0

package nl.lumc.sasc.biopet.core

import org.broadinstitute.gatk.queue.util.Logging

trait PipelineCommand extends MainCommand with Logging {

  def pipeline = "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"

  def main(args: Array[String]): Unit = {
    var argv: Array[String] = Array()
    argv ++= Array("-S", pipeline)
    argv ++= args
    BiopetQCommandLine.main(argv)
  }
}
\ No newline at end of file
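For context, a minimal standalone sketch (not part of this commit) of what PipelineCommand.pipeline computes and how main forwards it to BiopetQCommandLine. The object name PipelinePathDemo is illustrative only; the example uses the Basty object that appears later in this diff.

object PipelinePathDemo extends App {
  // Mirrors PipelineCommand.pipeline:
  //   "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"
  def pipelinePath(className: String): String =
    "/" + className.stripSuffix("$").replaceAll("\\.", "/") + ".class"

  // A Scala object's runtime class name ends in "$", so that suffix is stripped first.
  // For the Basty companion object defined later in this diff this yields:
  println(pipelinePath("nl.lumc.sasc.biopet.pipelines.basty.Basty$"))
  // /nl/lumc/sasc/biopet/pipelines/basty/Basty.class
  // PipelineCommand.main then invokes BiopetQCommandLine.main(Array("-S", <that path>) ++ args).
}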
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/ToolCommand.scala
deleted 100644 → 0

package nl.lumc.sasc.biopet.core

import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
import org.apache.log4j.spi.LoggingEvent
import java.io.File

trait ToolCommand extends MainCommand {
  abstract class AbstractArgs {
  }

  abstract class AbstractOptParser extends scopt.OptionParser[Args](commandName) {
    opt[Unit]("log_nostderr") foreach { _ =>
      logger.removeAppender(stderrAppender)
    } text ("No output to stderr")
    opt[File]("log_file") foreach { x =>
      logger.addAppender(new WriterAppender(logLayout, new java.io.PrintStream(x)))
    } text ("Log file") valueName ("<file>")
    opt[String]('l', "log_level") foreach { x =>
      x.toLowerCase match {
        case "debug" => logger.setLevel(org.apache.log4j.Level.DEBUG)
        case "info"  => logger.setLevel(org.apache.log4j.Level.INFO)
        case "warn"  => logger.setLevel(org.apache.log4j.Level.WARN)
        case "error" => logger.setLevel(org.apache.log4j.Level.ERROR)
        case _       =>
      }
    } text ("Log level") validate { x =>
      x match {
        case "debug" | "info" | "warn" | "error" => success
        case _                                   => failure("Log level must be <debug/info/warn/error>")
      }
    }
    opt[Unit]('h', "help") foreach { _ =>
      System.err.println(this.usage); sys.exit(1)
    } text ("Print usage")
  }

  type Args <: AbstractArgs
  type OptParser <: AbstractOptParser

  protected val logger = Logger.getLogger(commandName)

  private val logLayout = new DateLayout() {
    val ignoresThrowable = false
    def format(event: org.apache.log4j.spi.LoggingEvent): String = {
      val calendar: Calendar = Calendar.getInstance
      calendar.setTimeInMillis(event.getTimeStamp)
      val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val formattedDate: String = formatter.format(calendar.getTime)
      var logLevel = event.getLevel.toString
      while (logLevel.size < 6) logLevel += " "
      logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
    }
  }

  private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)

  logger.setLevel(org.apache.log4j.Level.INFO)
  logger.addAppender(stderrAppender)
}
\ No newline at end of file
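For reference, a minimal sketch (not part of this commit) of a tool wired onto the ToolCommand trait above, following the same Args/OptParser pattern that VcfFilter uses further down in this diff. HelloTool and its option are invented for illustration and assume scopt and the Biopet MainCommand trait are on the classpath.

object HelloTool extends ToolCommand {
  // Concrete Args type required by ToolCommand
  case class Args(name: String = "world") extends AbstractArgs

  // Concrete OptParser adding a tool-specific option on top of the shared logging options
  class OptParser extends AbstractOptParser {
    opt[String]('n', "name") action { (x, c) => c.copy(name = x) } text ("Name to greet")
  }

  def main(args: Array[String]): Unit = {
    val commandArgs: Args = new OptParser().parse(args, Args()) getOrElse sys.exit(1)
    logger.info("Hello, " + commandArgs.name)
  }
}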
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/basty/Basty.scala
deleted 100644 → 0

package nl.lumc.sasc.biopet.pipelines.basty

import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.pipelines.gatk.GatkPipeline
import org.broadinstitute.gatk.queue.QScript

class Basty(val root: Configurable) extends QScript with MultiSampleQScript {
  def this() = this(null)

  class LibraryOutput extends AbstractLibraryOutput {
  }

  class SampleOutput extends AbstractSampleOutput {
  }

  defaults ++= Map("ploidy" -> 1, "use_haplotypecaller" -> false, "use_unifiedgenotyper" -> true)

  var gatkPipeline: GatkPipeline = _

  def init() {
    gatkPipeline = new GatkPipeline(this)
    gatkPipeline.outputDir = outputDir
    gatkPipeline.init
  }

  def biopetScript() {
    gatkPipeline.biopetScript
    addAll(gatkPipeline.functions)

    runSamplesJobs()
  }

  // Called for each sample
  def runSingleSampleJobs(sampleConfig: Map[String, Any]): SampleOutput = {
    val sampleOutput = new SampleOutput
    val sampleID: String = sampleConfig("ID").toString
    val sampleDir = globalSampleDir + sampleID

    sampleOutput.libraries = runLibraryJobs(sampleConfig)

    return sampleOutput
  }

  // Called for each run from a sample
  def runSingleLibraryJobs(runConfig: Map[String, Any], sampleConfig: Map[String, Any]): LibraryOutput = {
    val libraryOutput = new LibraryOutput
    val runID: String = runConfig("ID").toString
    val sampleID: String = sampleConfig("ID").toString
    val runDir: String = globalSampleDir + sampleID + "/run_" + runID + "/"

    return libraryOutput
  }
}

object Basty extends PipelineCommand
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfFilter.scala
deleted 100644 → 0

package nl.lumc.sasc.biopet.tools

import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.util.ArrayList
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
import scala.collection.JavaConversions._

class VcfFilter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName

  @Input(doc = "Input vcf", shortName = "I", required = true)
  var inputVcf: File = _

  @Output(doc = "Output vcf", shortName = "o", required = false)
  var outputVcf: File = _

  var minSampleDepth: Option[Int] = _
  var minTotalDepth: Option[Int] = _
  var minAlternateDepth: Option[Int] = _
  var minSamplesPass: Option[Int] = _
  var filterRefCalls: Boolean = _

  override val defaultVmem = "8G"
  memoryLimit = Option(4.0)

  override def afterGraph {
    minSampleDepth = config("min_sample_depth")
    minTotalDepth = config("min_total_depth")
    minAlternateDepth = config("min_alternate_depth")
    minSamplesPass = config("min_samples_pass")
    filterRefCalls = config("filter_ref_calls")
  }

  override def commandLine = super.commandLine +
    required("-I", inputVcf) +
    required("-o", outputVcf) +
    optional("--minSampleDepth", minSampleDepth) +
    optional("--minTotalDepth", minTotalDepth) +
    optional("--minAlternateDepth", minAlternateDepth) +
    optional("--minSamplesPass", minSamplesPass) +
    conditional(filterRefCalls, "--filterRefCalls")
}

object VcfFilter extends ToolCommand {
  case class Args(inputVcf: File = null, outputVcf: File = null,
                  minSampleDepth: Int = -1, minTotalDepth: Int = -1,
                  minAlternateDepth: Int = -1, minSamplesPass: Int = 0,
                  minBamAlternateDepth: Int = 0, filterRefCalls: Boolean = false) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputVcf = x)
    }
    opt[File]('o', "outputVcf") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputVcf = x)
    } text ("output file, default to stdout")
    opt[Int]("minSampleDepth") unbounded () action { (x, c) =>
      c.copy(minSampleDepth = x)
    }
    opt[Int]("minTotalDepth") unbounded () action { (x, c) =>
      c.copy(minTotalDepth = x)
    }
    opt[Int]("minAlternateDepth") unbounded () action { (x, c) =>
      c.copy(minAlternateDepth = x)
    }
    opt[Int]("minSamplesPass") unbounded () action { (x, c) =>
      c.copy(minSamplesPass = x)
    }
    opt[Int]("minBamAlternateDepth") unbounded () action { (x, c) =>
      c.copy(minBamAlternateDepth = x)
    }
    opt[Unit]("filterRefCalls") unbounded () action { (x, c) =>
      c.copy(filterRefCalls = true)
    }
  }

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val reader = new VCFFileReader(commandArgs.inputVcf, false)
    val header = reader.getFileHeader
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputVcf).build)
    writer.writeHeader(header)

    val bamADFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-AD-")) yield line.getID).toList
    val bamDPFields = (for (line <- header.getInfoHeaderLines if line.getID.startsWith("BAM-DP-")) yield line.getID).toList

    for (record <- reader) {
      val genotypes = for (genotype <- record.getGenotypes) yield {
        val DP = if (genotype.hasDP) genotype.getDP else -1
        val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
        DP >= commandArgs.minSampleDepth &&
          (if (!AD.isEmpty) AD.tail.count(_ >= commandArgs.minAlternateDepth) > 0 else true) &&
          !(commandArgs.filterRefCalls && genotype.isHomRef)
      }

      val bamADvalues = (for (field <- bamADFields) yield {
        record.getAttribute(field, new ArrayList) match {
          case t: ArrayList[_] if t.length > 1 => {
            for (i <- 1 until t.size) yield {
              t(i) match {
                case a: Int    => a > commandArgs.minBamAlternateDepth
                case a: String => a.toInt > commandArgs.minBamAlternateDepth
                case _         => false
              }
            }
          }
          case _ => List(false)
        }
      }).flatten

      if (record.getAttributeAsInt("DP", -1) >= commandArgs.minTotalDepth &&
        genotypes.count(_ == true) >= commandArgs.minSamplesPass &&
        (commandArgs.minBamAlternateDepth <= 0 || bamADvalues.count(_ == true) >= commandArgs.minSamplesPass))
        writer.add(record)
    }
    reader.close
    writer.close
  }
}
\ No newline at end of file
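As a standalone illustration (not part of this commit) of the per-genotype test that VcfFilter.main applies, assuming the AD array is laid out as reference depth followed by alternate-allele depths. GenotypeFilterDemo and its parameter values are invented for the example.

object GenotypeFilterDemo extends App {
  // Mirrors the genotype check in VcfFilter.main: depth, alternate depth, optional hom-ref filtering.
  def genotypePasses(dp: Int, ad: List[Int], isHomRef: Boolean,
                     minSampleDepth: Int, minAlternateDepth: Int, filterRefCalls: Boolean): Boolean =
    dp >= minSampleDepth &&
      (if (ad.nonEmpty) ad.tail.count(_ >= minAlternateDepth) > 0 else true) &&
      !(filterRefCalls && isHomRef)

  // AD = List(refDepth, altDepth, ...): only the alternate depths (the tail) are tested.
  println(genotypePasses(dp = 20, ad = List(12, 8), isHomRef = false,
    minSampleDepth = 8, minAlternateDepth = 2, filterRefCalls = true)) // true
  println(genotypePasses(dp = 20, ad = List(20, 0), isHomRef = true,
    minSampleDepth = 8, minAlternateDepth = 2, filterRefCalls = true)) // false
}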
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala
deleted 100644 → 0

/**
 * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
 * @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
 */
package nl.lumc.sasc.biopet.tools

import java.io.{ File, IOException }
import scala.collection.JavaConverters._

import htsjdk.samtools.AlignmentBlock
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMFileReader.QueryInterval
import htsjdk.samtools.SAMFileWriterFactory
import htsjdk.samtools.SAMRecord
import htsjdk.tribble.AbstractFeatureReader.getFeatureReader
import htsjdk.tribble.Feature
import htsjdk.tribble.BasicFeature
import htsjdk.tribble.bed.BEDCodec
import htsjdk.tribble.index.interval.{ Interval, IntervalTree }
import orestes.bloomfilter.HashProvider.HashMethod
import orestes.bloomfilter.{ BloomFilter, FilterBuilder }
import org.apache.commons.io.FilenameUtils.getExtension
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }

import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable

// TODO: finish implementation for usage in pipelines
/**
 * WipeReads function class for usage in Biopet pipelines
 *
 * @param root Configuration object for the pipeline
 */
class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {

  javaMainClass = getClass.getName

  @Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true)
  var inputBAM: File = _

  @Output(doc = "Output BAM", shortName = "o", required = true)
  var outputBAM: File = _
}

object WipeReads extends ToolCommand {

  /** Function to check whether one feature contains the other */
  private def contains(feat1: Feature, feat2: Feature): Boolean =
    if (feat1.getChr != feat2.getChr) false
    else feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getEnd

  /** Function to check whether two features overlap each other */
  private def overlaps(feat1: Feature, feat2: Feature): Boolean =
    if (feat1.getChr != feat2.getChr) false
    else feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getStart

  /** Function to merge two overlapping intervals
   * NOTE: we assume subtypes of Feature has a constructor with (chr, start, end) signature */
  private def merge[T <: Feature](feat1: T, feat2: T): T = {
    if (feat1.getChr != feat2.getChr)
      throw new IllegalArgumentException("Can not merge features from different chromosomes")
    if (contains(feat1, feat2)) feat1
    // FIXME: can we avoid casting here?
    else if (overlaps(feat1, feat2))
      new BasicFeature(feat1.getChr, feat1.getStart, feat2.getEnd).asInstanceOf[T]
    else
      throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects")
  }

  /** Function to create an iterator yielding non-overlapped Feature objects */
  private def mergeOverlappingIntervals[T <: Feature](ri: Iterator[T]): Iterator[T] =
    ri.toList
      .sortBy(x => (x.getChr, x.getStart, x.getEnd))
      .foldLeft(List.empty[T]) {
        (acc, i) => acc match {
          case head :: tail if overlaps(head, i) => merge(head, i) :: tail
          case _                                 => i :: acc
        }
      }
      .toIterator

  /**
   * Function to create iterator over intervals from input interval file
   *
   * @param inFile input interval file
   */
  def makeFeatureFromFile(inFile: File): Iterator[Feature] = {
    logger.info("Parsing interval file ...")

    /** Function to create iterator from BED file */
    def makeFeatureFromBED(inFile: File): Iterator[Feature] =
      asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator).asScala

    // TODO: implementation
    /** Function to create iterator from refFlat file */
    def makeFeatureFromRefFlat(inFile: File): Iterator[Feature] = ???
    // convert coordinate to 1-based fully closed
    // parse chrom, start blocks, end blocks, strands

    // TODO: implementation
    /** Function to create iterator from GTF file */
    def makeFeatureFromGTF(inFile: File): Iterator[Feature] = ???
    // convert coordinate to 1-based fully closed
    // parse chrom, start blocks, end blocks, strands

    // detect interval file format from extension
    val iterFunc: (File => Iterator[Feature]) =
      if (getExtension(inFile.toString.toLowerCase) == "bed")
        makeFeatureFromBED
      else
        throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)

    mergeOverlappingIntervals(iterFunc(inFile))
  }

  def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = {
    val optimalArraySize = FilterBuilder.optimalM(bloomSize, bloomFp)
    // we are currently limited to maximum integer size
    // if the optimal array size equals or exceeds it, we assume
    // that it's a result of a truncation and return false
    optimalArraySize <= Int.MaxValue
  }

  // TODO: set minimum fraction for overlap
  /**
   * Function to create function to check SAMRecord for exclusion in filtered BAM file.
   *
   * The returned function evaluates all filtered-in SAMRecord to false.
   *
   * @param iv iterator yielding RawInterval objects
   * @param inBAM input BAM file
   * @param inBAMIndex input BAM file index
   * @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
   * @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
   * @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
   * @param bloomSize expected size of elements to contain in the Bloom filter
   * @param bloomFp expected Bloom filter false positive rate
   * @return function that checks whether a SAMRecord or String is to be excluded
   */
  def makeFilterOutFunction(iv: Iterator[Feature],
                            inBAM: File, inBAMIndex: File = null,
                            filterOutMulti: Boolean = true,
                            minMapQ: Int = 0, readGroupIDs: Set[String] = Set(),
                            bloomSize: Long, bloomFp: Double): (SAMRecord => Boolean) = {

    logger.info("Building set of reads to exclude ...")

    // TODO: implement optional index creation
    /** Function to check for BAM file index and return a SAMFileReader given a File */
    def prepIndexedInputBAM(): SAMFileReader =
      if (inBAMIndex != null)
        new SAMFileReader(inBAM, inBAMIndex)
      else {
        val sfr = new SAMFileReader(inBAM)
        sfr.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT)
        if (!sfr.hasIndex)
          throw new IllegalStateException("Input BAM file must be indexed")
        else
          sfr
      }

    /**
     * Function to query intervals from a BAM file
     *
     * The function still works when only either one of the interval or BAM
     * file contig is prepended with "chr"
     *
     * @param inBAM BAM file to query as SAMFileReader
     * @param feat feature object containing query
     * @return QueryInterval wrapped in Option
     */
    def monadicMakeQueryInterval(inBAM: SAMFileReader, feat: Feature): Option[QueryInterval] =
      if (inBAM.getFileHeader.getSequenceIndex(feat.getChr) > -1)
        Some(inBAM.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd))
      else if (feat.getChr.startsWith("chr")
        && inBAM.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1)
        Some(inBAM.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd))
      else if (!feat.getChr.startsWith("chr")
        && inBAM.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1)
        Some(inBAM.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd))
      else
        None

    /** function to make IntervalTree from our RawInterval objects
     *
     * @param ifeat iterable over feature objects
     * @return IntervalTree objects, filled with intervals from RawInterval
     */
    def makeIntervalTree[T <: Feature](ifeat: Iterable[T]): IntervalTree = {
      val ivt = new IntervalTree
      for (iv <- ifeat)
        ivt.insert(new Interval(iv.getStart, iv.getEnd))
      ivt
    }

    /**
     * Function to ensure that a SAMRecord overlaps our target regions
     *
     * This is required because htsjdk's queryOverlap method does not take into
     * account the SAMRecord splicing structure
     *
     * @param rec SAMRecord to check
     * @param ivtm mutable mapping of a chromosome and its interval tree
     * @return
     */
    def alignmentBlockOverlaps(rec: SAMRecord, ivtm: Map[String, IntervalTree]): Boolean =
      // if SAMRecord is not spliced, assume queryOverlap has done its job
      // otherwise check for alignment block overlaps in our interval list
      // using raw SAMString to bypass cigar string decoding
      if (rec.getSAMString.split("\t")(5).contains("N")) {
        for (ab: AlignmentBlock <- rec.getAlignmentBlocks.asScala) {
          if (!ivtm(rec.getReferenceName).findOverlapping(
            new Interval(ab.getReferenceStart, ab.getReferenceStart + ab.getLength - 1)).isEmpty)
            return true
        }
        false
      } else
        true

    /** filter function for read IDs */
    val rgFilter =
      if (readGroupIDs.size == 0)
        (r: SAMRecord) => true
      else
        (r: SAMRecord) => readGroupIDs.contains(r.getReadGroup.getReadGroupId)

    /** function to get set element */
    val SAMRecordElement =
      if (filterOutMulti)
        (r: SAMRecord) => r.getReadName
      else
        (r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString

    val SAMRecordMateElement =
      (r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString

    val firstBAM = prepIndexedInputBAM()

    /* NOTE: the interval vector here should be bypass-able if we can make
       the BAM query intervals with Interval objects. This is not possible
       at the moment since we can not retrieve star and end coordinates
       of an Interval, so we resort to our own RawInterval vector
    */
    lazy val intervals = iv.toVector
    lazy val intervalTreeMap: Map[String, IntervalTree] = intervals
      .groupBy(x => x.getChr)
      .map({ case (key, value) => (key, makeIntervalTree(value)) })
    lazy val queryIntervals = intervals
      .flatMap(x => monadicMakeQueryInterval(firstBAM, x))
      // makeQueryInterval only accepts a sorted QueryInterval list
      .sortBy(x => (x.referenceIndex, x.start, x.end))
      .toArray

    val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala
      // ensure spliced reads have at least one block overlapping target region
      .filter(x => alignmentBlockOverlaps(x, intervalTreeMap))
      // filter for MAPQ on target region reads
      .filter(x => x.getMappingQuality >= minMapQ)
      // filter on specific read group IDs
      .filter(x => rgFilter(x))

    val filteredOutSet: BloomFilter[String] = new FilterBuilder(bloomSize.toInt, bloomFp)
      .hashFunction(HashMethod.Murmur3KirschMitzenmacher)
      .buildBloomFilter()

    for (rec <- filteredRecords) {
      logger.debug("Adding read " + rec.getReadName + " to set ...")
      if ((!filterOutMulti) && rec.getReadPairedFlag) {
        filteredOutSet.add(SAMRecordElement(rec))
        filteredOutSet.add(SAMRecordMateElement(rec))
      } else
        filteredOutSet.add(SAMRecordElement(rec))
    }

    if (filterOutMulti)
      (rec: SAMRecord) => filteredOutSet.contains(rec.getReadName)
    else
      (rec: SAMRecord) => {
        if (rec.getReadPairedFlag)
          filteredOutSet.contains(SAMRecordElement(rec)) &&
            filteredOutSet.contains(SAMRecordMateElement(rec))
        else
          filteredOutSet.contains(SAMRecordElement(rec))
      }
  }

  // TODO: implement stats file output
  /**
   * Function to filter input BAM and write its output to the filesystem
   *
   * @param filterFunc filter function that evaluates true for excluded SAMRecord
   * @param inBAM input BAM file
   * @param outBAM output BAM file
   * @param writeIndex whether to write index for output BAM file
   * @param async whether to write asynchronously
   * @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file
   */
  def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File,
                       writeIndex: Boolean = true, async: Boolean = true,
                       filteredOutBAM: File = null) = {