Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
6a83f9d3
Commit
6a83f9d3
authored
Oct 13, 2014
by
wyleung
Browse files
Merge from master
parents
2a22fa63
9cde95b2
Changes
47
Hide whitespace changes
Inline
Side-by-side
.gitattributes
0 → 100644
View file @
6a83f9d3
*.bam binary
*.bam.bai binary
biopet-framework/examples/biopet-defaults.json
0 → 100644
View file @
6a83f9d3
{
  "genotypegvcfs": { "scattercount": 100 },
  "variantannotator": { "scattercount": 10 },
  "realignertargetcreator": { "scattercount": 30 },
  "combinevariants": { "scattercount": 10 },
  "printreads_temp": { "scattercount": 30 },
  "indelrealigner": { "scattercount": 30 },
  "haplotypecaller": { "scattercount": 100 },
  "unifiedgenotyper": { "scattercount": 100 },
  "baserecalibrator": { "scattercount": 30 },
  "basty": {
    "haplotypecaller": { "scattercount": 20 },
    "unifiedgenotyper": { "scattercount": 1 },
    "multisample": {
      "unifiedgenotyper": { "scattercount": 100 }
    },
    "baserecalibrator": { "scattercount": 1 },
    "indelrealigner": { "scattercount": 1 },
    "printreads_temp": { "scattercount": 1 },
    "realignertargetcreator": { "scattercount": 1 },
    "genotypegvcfs": { "scattercount": 1 },
    "combinevariants": { "scattercount": 1 }
  }
}
biopet-framework/pom.xml
View file @
6a83f9d3
...
...
@@ -23,6 +23,10 @@
<name>
BioJava repository
</name>
<url>
http://www.biojava.org/download/maven/
</url>
</repository>
<repository>
<id>
orestes-bloom-filter
</id>
<url>
https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo
</url>
</repository>
</repositories>
<dependencies>
<dependency>
...
...
@@ -31,6 +35,11 @@
<version>
6.8
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.scalatest
</groupId>
<artifactId>
scalatest_2.11
</artifactId>
<version>
2.2.1
</version>
</dependency>
<dependency>
<groupId>
org.scala-lang
</groupId>
<artifactId>
scala-library
</artifactId>
...
...
@@ -56,6 +65,16 @@
<artifactId>
biojava3-sequencing
</artifactId>
<version>
3.1.0
</version>
</dependency>
<dependency>
<groupId>
com.baqend
</groupId>
<artifactId>
bloom-filter
</artifactId>
<version>
1.02
</version>
</dependency>
<dependency>
<groupId>
com.github.scopt
</groupId>
<artifactId>
scopt_2.10
</artifactId>
<version>
3.2.0
</version>
</dependency>
</dependencies>
<build>
<resources>
...
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala
View file @
6a83f9d3
...
...
@@ -16,40 +16,77 @@ import nl.lumc.sasc.biopet.pipelines.yamsvp.Yamsvp
object
BiopetExecutable
{
val
pipelines
:
Map
[
String
,
PipelineCommand
]
=
Map
(
"flexiprep"
->
Flexiprep
,
"mapping"
->
Mapping
,
"gentrap"
->
Gentrap
,
"bam-metrics"
->
BamMetrics
,
"gatk-benchmark-genotyping"
->
GatkBenchmarkGenotyping
,
"gatk-genotyping"
->
GatkGenotyping
,
"gatk-variantcalling"
->
GatkVariantcalling
,
"gatk-pipeline"
->
GatkPipeline
,
"gatk-variant-recalibration"
->
GatkVariantRecalibration
,
"gatk-vcf-sample-compare"
->
GatkVcfSampleCompare
,
"sage"
->
Sage
,
"basty"
->
Basty
,
"yamsvp"
->
Yamsvp
)
val
modules
:
Map
[
String
,
List
[
MainCommand
]]
=
Map
(
"pipeline"
->
List
(
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
flexiprep
.
Flexiprep
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
mapping
.
Mapping
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gentrap
.
Gentrap
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
bammetrics
.
BamMetrics
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkBenchmarkGenotyping
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkGenotyping
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkVariantcalling
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkPipeline
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkVariantRecalibration
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
gatk
.
GatkVcfSampleCompare
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
sage
.
Sage
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
basty
.
Basty
),
"tool"
->
List
(
nl
.
lumc
.
sasc
.
biopet
.
core
.
apps
.
WipeReads
,
nl
.
lumc
.
sasc
.
biopet
.
core
.
apps
.
BiopetFlagstat
))
/**
* @param args the command line arguments
*/
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
if
(
args
.
isEmpty
)
{
System
.
err
.
println
(
pipelineList
)
System
.
exit
(
1
)
def
toBulletedList
(
m
:
List
[
MainCommand
],
kind
:
String
=
""
,
bullet
:
String
=
"-"
)
=
"Available %ss:\n "
.
format
(
kind
)
+
bullet
+
" "
+
m
.
map
(
x
=>
x
.
commandName
).
sorted
.
mkString
(
"\n "
+
bullet
+
" "
)
def
usage
(
module
:
String
=
null
)
:
String
=
{
if
(
module
!=
null
)
checkModule
(
module
)
val
usage
:
String
=
{
val
set
=
if
(
module
==
null
)
modules
.
keySet
else
Set
(
module
)
val
u
=
for
(
m
<-
set
)
yield
toBulletedList
(
modules
(
m
),
m
)
u
.
mkString
(
"\n\n"
)
}
"""
|Usage: java -jar BiopetFramework.jar {%s} <name> [args]
|
|%s
|
|Questions or comments? Email sasc@lumc.nl or check out the project page at https://git.lumc.nl/biopet/biopet.git
"""
.
stripMargin
.
format
(
modules
.
keys
.
mkString
(
","
),
usage
)
}
else
if
(
pipelines
.
contains
(
args
.
head
))
pipelines
(
args
.
head
).
main
(
args
.
tail
)
else
{
System
.
err
.
println
(
"Pipeline '"
+
args
.
head
+
"' does not exist"
)
System
.
err
.
println
(
pipelineList
)
System
.
exit
(
1
)
def
checkModule
(
module
:
String
)
{
if
(!
modules
.
contains
(
module
))
{
System
.
err
.
println
(
s
"ERROR: module '$module' does not exist\n"
+
usage
())
System
.
exit
(
1
)
}
}
def
pipelineList
:
String
=
{
val
pipelinesArray
=
for
((
k
,
v
)
<-
pipelines
)
yield
k
"Available pipelines:"
+
pipelinesArray
.
mkString
(
"\n- "
,
"\n- "
,
"\n"
)
+
"please supply a valid pipeline"
def
getCommand
(
module
:
String
,
name
:
String
)
:
MainCommand
=
{
checkModule
(
module
)
val
command
=
modules
(
module
).
find
(
p
=>
p
.
commandName
.
toLowerCase
==
name
.
toLowerCase
)
if
(
command
==
None
)
{
System
.
err
.
println
(
s
"ERROR: command '$name' does not exist in module '$module'\n"
+
usage
(
module
))
System
.
exit
(
1
)
}
return
command
.
get
}
args
match
{
case
Array
(
module
,
name
,
passArgs
@
_
*)
=>
{
getCommand
(
module
,
name
).
main
(
passArgs
.
toArray
)
}
case
Array
(
module
)
=>
{
System
.
err
.
println
(
usage
(
module
))
System
.
exit
(
1
)
}
case
_
=>
{
System
.
err
.
println
(
usage
())
System
.
exit
(
1
)
}
}
}
}
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/MainCommand.scala
0 → 100644
View file @
6a83f9d3
package nl.lumc.sasc.biopet.core

/**
 * Base trait for anything runnable from the Biopet command line.
 *
 * Implementors provide `main`; `commandName` is derived from the runtime
 * class name. Scala `object`s have a trailing '$' (or '$'-separated parts)
 * in their simple class name, hence the split on '$' taking the last part.
 */
trait MainCommand {

  /** Command name as presented to the user, e.g. `object Flexiprep` -> "Flexiprep". */
  lazy val commandName = this.getClass.getSimpleName.split("\\$").last

  /**
   * Entry point of the command.
   *
   * Declared with an explicit `: Unit` result type; the original used the
   * deprecated procedure syntax (`def main(args: Array[String])`).
   *
   * @param args command line arguments (program name excluded)
   */
  def main(args: Array[String]): Unit
}
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala
View file @
6a83f9d3
...
...
@@ -2,14 +2,14 @@ package nl.lumc.sasc.biopet.core
import
org.broadinstitute.gatk.queue.util.Logging
trait
PipelineCommand
extends
Logging
{
val
pipeline
=
""
trait
PipelineCommand
extends
MainCommand
with
Logging
{
def
pipeline
=
"/"
+
getClass
.
getName
.
stripSuffix
(
"$"
).
replaceAll
(
"\\."
,
"/"
)
+
".class"
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
var
argv
:
Array
[
String
]
=
Array
()
//argv ++= Array("-S", tempFile.getAbsolutePath)
argv
++=
Array
(
"-S"
,
pipeline
)
argv
++=
args
return
BiopetQCommandLine
.
main
(
argv
)
BiopetQCommandLine
.
main
(
argv
)
}
}
\ No newline at end of file
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/ToolCommand.scala
0 → 100644
View file @
6a83f9d3
package nl.lumc.sasc.biopet.core

import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
import org.apache.log4j.spi.LoggingEvent
import java.io.File

/**
 * Base trait for Biopet command line tools.
 *
 * Wires up a log4j logger named after the command (INFO on stderr by
 * default) and a scopt option parser base class carrying the logging
 * flags shared by all tools.
 */
trait ToolCommand extends MainCommand {

  /** Base class for a tool's parsed command line arguments. */
  abstract class AbstractArgs {
  }

  /**
   * Base option parser; registers the logging options shared by all tools.
   * Concrete tools subclass this and add their own options.
   */
  abstract class AbstractOptParser extends scopt.OptionParser[Args](commandName) {
    // --log_nostderr: detach the default stderr appender installed below
    opt[Unit]("log_nostderr") foreach { _ =>
      logger.removeAppender(stderrAppender)
    } text ("No output to stderr")
    // --log_file <file>: additionally log to the given file
    opt[File]("log_file") foreach { x =>
      logger.addAppender(new WriterAppender(logLayout, new java.io.PrintStream(x)))
    } text ("Log file") valueName ("<file>")
    // -l / --log_level: set the logger level; values validated below
    opt[String]('l', "log_level") foreach { x =>
      x.toLowerCase match {
        case "debug" => logger.setLevel(org.apache.log4j.Level.DEBUG)
        case "info"  => logger.setLevel(org.apache.log4j.Level.INFO)
        case "warn"  => logger.setLevel(org.apache.log4j.Level.WARN)
        case "error" => logger.setLevel(org.apache.log4j.Level.ERROR)
        case _       =>
      }
    } text ("Log level") validate { x =>
      x match {
        case "debug" | "info" | "warn" | "error" => success
        case _ => failure("Log level must be <debug/info/warn/error>")
      }
    }
    // -h / --help: print scopt's generated usage text and exit
    opt[Unit]('h', "help") foreach { _ =>
      System.err.println(this.usage); sys.exit(1)
    } text ("Print usage")
  }

  // Concrete tools bind these to their own Args / OptParser subclasses.
  type Args <: AbstractArgs
  type OptParser <: AbstractOptParser

  protected val logger = Logger.getLogger(commandName)

  // Layout producing: "LEVEL  [yyyy-MM-dd HH:mm:ss] [loggerName] message\n"
  private val logLayout = new DateLayout() {
    val ignoresThrowable = false
    def format(event: org.apache.log4j.spi.LoggingEvent): String = {
      val calendar: Calendar = Calendar.getInstance
      calendar.setTimeInMillis(event.getTimeStamp)
      val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
      val formattedDate: String = formatter.format(calendar.getTime)
      var logLevel = event.getLevel.toString
      // pad the level name to 6 characters so log columns line up
      while (logLevel.size < 6) logLevel += " "
      logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
    }
  }

  private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)

  // Trait construction side effect: default to INFO level on stderr.
  logger.setLevel(org.apache.log4j.Level.INFO)
  logger.addAppender(stderrAppender)
}
\ No newline at end of file
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/apps/BiopetFlagstat.scala
View file @
6a83f9d3
...
...
@@ -4,6 +4,7 @@ import htsjdk.samtools.SAMFileReader
import
htsjdk.samtools.SAMRecord
import
java.io.File
import
nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import
nl.lumc.sasc.biopet.core.ToolCommand
import
nl.lumc.sasc.biopet.core.config.Configurable
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
org.broadinstitute.gatk.queue.util.Logging
...
...
@@ -22,10 +23,10 @@ class BiopetFlagstat(val root: Configurable) extends BiopetJavaCommandLineFuncti
override
val
defaultVmem
=
"8G"
memoryLimit
=
Option
(
4.0
)
override
def
commandLine
=
super
.
commandLine
+
required
(
input
)
+
" > "
+
required
(
output
)
override
def
commandLine
=
super
.
commandLine
+
required
(
"I"
,
input
)
+
" > "
+
required
(
output
)
}
object
BiopetFlagstat
extends
Logging
{
object
BiopetFlagstat
extends
ToolCommand
{
def
apply
(
root
:
Configurable
,
input
:
File
,
output
:
File
)
:
BiopetFlagstat
=
{
val
flagstat
=
new
BiopetFlagstat
(
root
)
flagstat
.
input
=
input
...
...
@@ -38,12 +39,32 @@ object BiopetFlagstat extends Logging {
flagstat
.
output
=
new
File
(
outputDir
,
input
.
getName
.
stripSuffix
(
".bam"
)
+
".biopetflagstat"
)
return
flagstat
}
case
class
Args
(
inputFile
:
File
=
null
,
region
:
Option
[
String
]
=
None
)
extends
AbstractArgs
class
OptParser
extends
AbstractOptParser
{
opt
[
File
](
'I'
,
"inputFile"
)
required
()
valueName
(
"<file>"
)
action
{
(
x
,
c
)
=>
c
.
copy
(
inputFile
=
x
)
}
text
(
"out is a required file property"
)
opt
[
String
](
'r'
,
"region"
)
valueName
(
"<chr:start-stop>"
)
action
{
(
x
,
c
)
=>
c
.
copy
(
region
=
Some
(
x
))
}
text
(
"out is a required file property"
)
}
/**
* @param args the command line arguments
*/
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
val
inputSam
=
new
SAMFileReader
(
new
File
(
args
(
0
)))
val
argsParser
=
new
OptParser
val
commandArgs
:
Args
=
argsParser
.
parse
(
args
,
Args
())
getOrElse
sys
.
exit
(
1
)
val
inputSam
=
new
SAMFileReader
(
commandArgs
.
inputFile
)
val
iterSam
=
if
(
commandArgs
.
region
==
None
)
inputSam
.
iterator
else
{
val
regionRegex
=
"""(.*):(.*)-(.*)"""
.
r
commandArgs
.
region
.
get
match
{
case
regionRegex
(
chr
,
start
,
stop
)
=>
inputSam
.
query
(
chr
,
start
.
toInt
,
stop
.
toInt
,
false
)
case
_
=>
sys
.
error
(
"Region wrong format"
)
}
}
val
flagstatCollector
=
new
FlagstatCollector
flagstatCollector
.
loadDefaultFunctions
val
m
=
10
...
...
@@ -89,10 +110,11 @@ object BiopetFlagstat extends Logging {
flagstatCollector
.
addFunction
(
"Mate in same strand"
,
record
=>
record
.
getReadPairedFlag
&&
record
.
getReadNegativeStrandFlag
&&
record
.
getMateNegativeStrandFlag
&&
record
.
getReferenceIndex
==
record
.
getMateReferenceIndex
)
flagstatCollector
.
addFunction
(
"Mate on other chr"
,
record
=>
record
.
getReadPairedFlag
&&
record
.
getReferenceIndex
!=
record
.
getMateReferenceIndex
)
for
(
record
<-
inputSam
.
iterator
)
{
logger
.
info
(
"Start reading file: "
+
commandArgs
.
inputFile
)
for
(
record
<-
iterSam
)
{
if
(
flagstatCollector
.
readsCount
%
1
e6
==
0
&&
flagstatCollector
.
readsCount
>
0
)
System
.
err
.
println
(
"Reads prosessed: "
+
flagstatCollector
.
readsCount
)
logger
.
info
(
"Reads prosessed: "
+
flagstatCollector
.
readsCount
)
flagstatCollector
.
loadRecord
(
record
)
}
...
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/apps/WipeReads.scala
0 → 100644
View file @
6a83f9d3
/**
* Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*/
package
nl.lumc.sasc.biopet.core.apps
import
java.io.
{
File
,
IOException
}
import
scala.collection.JavaConverters._
import
scala.io.Source
import
htsjdk.samtools.AlignmentBlock
import
htsjdk.samtools.SAMFileReader
import
htsjdk.samtools.SAMFileReader.QueryInterval
import
htsjdk.samtools.SAMFileWriterFactory
import
htsjdk.samtools.SAMRecord
import
htsjdk.tribble.index.interval.
{
Interval
,
IntervalTree
}
import
orestes.bloomfilter.HashProvider.HashMethod
import
orestes.bloomfilter.
{
BloomFilter
,
FilterBuilder
}
import
org.apache.commons.io.FilenameUtils.getExtension
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
import
nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import
nl.lumc.sasc.biopet.core.MainCommand
import
nl.lumc.sasc.biopet.core.config.Configurable
// TODO: finish implementation for usage in pipelines
/**
 * WipeReads function class for usage in Biopet pipelines
 *
 * @param root Configuration object for the pipeline
 */
class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction {

  // java main class for the wrapped command line: the companion object below
  javaMainClass = getClass.getName

  @Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true)
  var inputBAM: File = _

  @Output(doc = "Output BAM", shortName = "o", required = true)
  var outputBAM: File = _
}
object
WipeReads
extends
MainCommand
{
/** Container type for command line flags */
type OptionMap = Map[String, Any]
/** Container class for interval parsing results */
case class RawInterval(chrom: String, start: Int, end: Int) {

  require(start <= end, s"Start coordinate $start is larger than end coordinate $end")

  /** Function to check whether one interval contains the other */
  def contains(that: RawInterval): Boolean =
    if (this.chrom != that.chrom)
      false
    else
      this.start <= that.start && this.end >= that.end

  /** Function to check whether two intervals overlap each other */
  // NOTE(review): this predicate is asymmetric — it only detects overlap when
  // this.start <= that.start. mergeOverlappingIntervals sorts by (chrom, start, end)
  // before calling it, which makes that ordering hold there; confirm before reusing
  // it in unsorted contexts.
  def overlaps(that: RawInterval): Boolean =
    if (this.chrom != that.chrom)
      false
    else
      this.start <= that.start && this.end >= that.start

  /** Function to merge two overlapping intervals */
  def merge(that: RawInterval): RawInterval = {
    if (this.chrom != that.chrom)
      throw new IllegalArgumentException("Can not merge RawInterval objects from different chromosomes")
    if (contains(that))
      this
    else if (overlaps(that))
      // keeps this.start — assumes this.start <= that.start (see overlaps note)
      RawInterval(this.chrom, this.start, that.end)
    else
      throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects")
  }
}
/**
 * Function to create an iterator yielding non-overlapped RawInterval objects
 *
 * Sorts by (chrom, start, end), then folds, merging each interval into the
 * most recently accumulated one when they overlap.
 *
 * NOTE(review): the fold prepends to the accumulator, so the returned iterator
 * yields intervals in reverse (descending) sorted order — confirm downstream
 * consumers do not rely on ascending order.
 *
 * @param ri iterator yielding RawInterval objects
 * @return iterator yielding RawInterval objects
 */
def mergeOverlappingIntervals(ri: Iterator[RawInterval]): Iterator[RawInterval] =
  ri.toList
    .sortBy(x => (x.chrom, x.start, x.end))
    .foldLeft(List.empty[RawInterval]) {
      (acc, i) => acc match {
        // head is the last interval added; sorting guarantees head.start <= i.start
        case head :: tail if head.overlaps(i) => head.merge(i) :: tail
        case _                                => i :: acc
      }}
    .toIterator
/**
 * Function to create iterator over intervals from input interval file
 *
 * Dispatches on the file extension (currently only ".bed" is supported) and
 * returns the parsed intervals with overlaps merged.
 *
 * @param inFile input interval file
 */
def makeRawIntervalFromFile(inFile: File): Iterator[RawInterval] = {

  /** Function to create iterator from BED file */
  def makeRawIntervalFromBED(inFile: File): Iterator[RawInterval] =
    // BED file coordinates are 0-based, half open so we need to do some conversion
    Source.fromFile(inFile)
      .getLines()
      .filterNot(_.trim.isEmpty)
      // NOTE(review): String.matches anchors the whole string, so this pattern
      // ("^track | ^browser ") is unlikely to match real "track ..."/"browser ..."
      // header lines with trailing content; verify against actual BED headers.
      .dropWhile(_.matches("^track | ^browser "))
      .map(line => line.trim.split("\t") match {
        // only the first three BED columns are used; extra columns are ignored
        case Array(chrom, start, end, _*) => new RawInterval(chrom, start.toInt + 1, end.toInt)
      })

  // TODO: implementation
  /** Function to create iterator from refFlat file */
  def makeRawIntervalFromRefFlat(inFile: File): Iterator[RawInterval] = ???
  // convert coordinate to 1-based fully closed
  // parse chrom, start blocks, end blocks, strands

  // TODO: implementation
  /** Function to create iterator from GTF file */
  def makeRawIntervalFromGTF(inFile: File): Iterator[RawInterval] = ???
  // convert coordinate to 1-based fully closed
  // parse chrom, start blocks, end blocks, strands

  // detect interval file format from extension
  val iterFunc: (File => Iterator[RawInterval]) =
    if (getExtension(inFile.toString.toLowerCase) == "bed")
      makeRawIntervalFromBED
    else
      throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)

  mergeOverlappingIntervals(iterFunc(inFile))
}
// TODO: set minimum fraction for overlap
/**
* Function to create function to check SAMRecord for exclusion in filtered BAM file.
*
* The returned function evaluates all filtered-in SAMRecord to false.
*
* @param iv iterator yielding RawInterval objects
* @param inBAM input BAM file
* @param inBAMIndex input BAM file index
* @param filterOutMulti whether to filter out reads with same name outside target region (default: true)
* @param minMapQ minimum MapQ of reads in target region to filter out (default: 0)
* @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs)
* @param bloomSize expected size of elements to contain in the Bloom filter
* @param bloomFp expected Bloom filter false positive rate
* @return function that checks whether a SAMRecord or String is to be excluded
*/
def
makeFilterOutFunction
(
iv
:
Iterator
[
RawInterval
],
inBAM
:
File
,
inBAMIndex
:
File
=
null
,
filterOutMulti
:
Boolean
=
true
,
minMapQ
:
Int
=
0
,
readGroupIDs
:
Set
[
String
]
=
Set
(),
bloomSize
:
Int
=
100000000
,
bloomFp
:
Double
=
1
e
-
10
)
:
(
SAMRecord
=>
Boolean
)
=
{
// TODO: implement optional index creation
/** Function to check for BAM file index and return a SAMFileReader given a File */
def
prepIndexedInputBAM
()
:
SAMFileReader
=
if
(
inBAMIndex
!=
null
)
new
SAMFileReader
(
inBAM
,
inBAMIndex
)
else
{
val
sfr
=
new
SAMFileReader
(
inBAM
)
sfr
.
setValidationStringency
(
SAMFileReader
.
ValidationStringency
.
LENIENT
)
if
(!
sfr
.
hasIndex
)
throw
new
IllegalStateException
(
"Input BAM file must be indexed"
)
else
sfr
}
/**
* Function to query intervals from a BAM file
*
* The function still works when only either one of the interval or BAM
* file contig is prepended with "chr"
*
* @param inBAM BAM file to query as SAMFileReader
* @param ri raw interval object containing query
* @return QueryInterval wrapped in Option
*/
def
monadicMakeQueryInterval
(
inBAM
:
SAMFileReader
,
ri
:
RawInterval
)
:
Option
[
QueryInterval
]
=
if
(
inBAM
.
getFileHeader
.
getSequenceIndex
(
ri
.
chrom
)
>
-
1
)
Some
(
inBAM
.
makeQueryInterval
(
ri
.
chrom
,
ri
.
start
,
ri
.
end
))
else
if
(
ri
.
chrom
.
startsWith
(
"chr"
)
&&
inBAM
.
getFileHeader
.
getSequenceIndex
(
ri
.
chrom
.
substring
(
3
))
>
-
1
)
Some
(
inBAM
.
makeQueryInterval
(
ri
.
chrom
.
substring
(
3
),
ri
.
start
,
ri
.
end
))
else
if
(!
ri
.
chrom
.
startsWith
(
"chr"
)
&&
inBAM
.
getFileHeader
.
getSequenceIndex
(
"chr"
+
ri
.
chrom
)
>
-
1
)
Some
(
inBAM
.
makeQueryInterval
(
"chr"
+
ri
.
chrom
,
ri
.
start
,
ri
.
end
))
else
None
/** function to make IntervalTree from our RawInterval objects
*
* @param ri iterable over RawInterval objects
* @return IntervalTree objects, filled with intervals from RawInterval
*/
def
makeIntervalTree
(
ri
:
Iterable
[
RawInterval
])
:
IntervalTree
=
{
val
ivt
=
new
IntervalTree
for
(
iv
:
RawInterval
<-
ri
)
ivt
.
insert
(
new
Interval
(
iv
.
start
,
iv
.
end
))
ivt
}
/**
* Function to ensure that a SAMRecord overlaps our target regions
*
* This is required because htsjdk's queryOverlap method does not take into
* account the SAMRecord splicing structure
*
* @param rec SAMRecord to check
* @param ivtm mutable mapping of a chromosome and its interval tree
* @return
*/
def
alignmentBlockOverlaps
(
rec
:
SAMRecord
,
ivtm
:
Map
[
String
,
IntervalTree
])
:
Boolean
=
// if SAMRecord is not spliced, assume queryOverlap has done its job
// otherwise check for alignment block overlaps in our interval list
// using raw SAMString to bypass cigar string decoding
if
(
rec
.
getSAMString
.
split
(
"\t"
)(
5
).
contains
(
"N"
))
{
for
(
ab
:
AlignmentBlock
<-
rec.getAlignmentBlocks.asScala
)
{
if
(!
ivtm
(
rec
.
getReferenceName
).
findOverlapping
(
new
Interval
(
ab
.
getReferenceStart
,
ab
.
getReferenceStart
+
ab
.
getLength
-
1
)).
isEmpty
)
return
true
}
false
}
else
true
/** filter function for read IDs */
val
rgFilter
=