Project: biopet / biopet

Commit 53d48f23
authored Nov 04, 2014 by Peter van 't Hof
Merge 'develop' into 'feature-basty_generate_fasta'
parent e29d7fcf
Changes: 30

biopet-framework/examples/shark_apps.json  0 → 100644 (new file)

{
  "bwa": {
    "exe": "/usr/local/bwa/bwa-0.7.10/bwa"
  },
  "seqtk": {
    "exe": "/data/DIV5/SASC/common/programs/seqtk/seqtk/seqtk"
  },
  "sickle": {
    "exe": "/data/DIV5/SASC/common/programs/sickle/sickle-1.33/sickle"
  },
  "clever": {
    "exe": "/data/DIV5/SASC/common/programs/clever/clever-toolkit-v2.0rc3/bin/clever",
    "version_exe": "/data/DIV5/SASC/common/programs/clever/clever-toolkit-v2.0rc3/bin/ctk-version"
  },
  "pindel": {
    "exe": "/data/DIV5/SASC/common/programs/pindel/pindel-0.2.5/pindel"
  },
  "breakdancerconfig": {
    "exe": "/data/DIV5/SASC/common/programs/breakdancer/breakdancer-v1.4.4/lib/breakdancer-max1.4.4/bam2cfg.pl"
  },
  "breakdancercaller": {
    "exe": "/data/DIV5/SASC/common/programs/breakdancer/breakdancer-v1.4.4/bin/breakdancer-max"
  },
  "fastqc": {
    "exe": "/usr/local/FastQC/FastQC_v0.10.1/fastqc"
  },
  "seqstat": {
    "exe": "/data/DIV5/SASC/common/programs/dQual/fastq-seqstat"
  },
  "stampy": {
    "exe": "/usr/local/stampy/stampy-1.0.23/stampy.py"
  },
  "sambamba": {
    "exe": "/data/DIV5/SASC/common/programs/sambamba/sambamba-0.4.7/build/sambamba"
  }
}
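The file maps tool names to executable paths on the Shark cluster. A minimal sketch of how a per-tool "exe" entry could be looked up once such a file has been parsed into a map; the parsing step and the helper name are assumptions for illustration, not Biopet API:

// Sketch only: resolve a tool's executable path from a config map shaped
// like shark_apps.json (JSON parsing itself is deliberately left out).
val toolConfig: Map[String, Map[String, String]] = Map(
  "bwa" -> Map("exe" -> "/usr/local/bwa/bwa-0.7.10/bwa")
)

def executableFor(tool: String): Option[String] =
  toolConfig.get(tool).flatMap(_.get("exe"))

// executableFor("bwa") == Some("/usr/local/bwa/bwa-0.7.10/bwa")
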
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala

...
@@ -92,7 +92,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurable
   protected def versionCommand: String = null
   protected val versionRegex: Regex = null
   protected val versionExitcode = List(0) // Can select multiple
-  def getVersion: String = {
+  private def getVersionInternal: String = {
     if (versionCommand == null || versionRegex == null) return "N/A"
     val stdout = new StringBuffer()
     val stderr = new StringBuffer()
...
@@ -114,6 +114,12 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurable
     return "N/A"
   }

+  def getVersion: String = {
+    if (!BiopetCommandLineFunctionTrait.versionCache.contains(executable))
+      BiopetCommandLineFunctionTrait.versionCache += executable -> getVersionInternal
+    return BiopetCommandLineFunctionTrait.versionCache(executable)
+  }
+
   def getThreads(default: Int): Int = {
     val maxThreads: Int = config("maxthreads", default = 8)
     val threads: Int = config("threads", default = default)
...
@@ -128,3 +134,8 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurable
     else return maxThreads
   }
 }
+
+object BiopetCommandLineFunctionTrait {
+  import scala.collection.mutable.Map
+  private val versionCache: Map[String, String] = Map()
+}
\ No newline at end of file
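The new getVersion memoizes each tool's version lookup in the companion object's versionCache, keyed by executable path, so the version command only has to run once per executable. A standalone sketch of the same caching pattern; the object and method names here are illustrative, not the Biopet API:

import scala.collection.mutable

// Sketch only: cache an expensive lookup (e.g. running "tool --version")
// per executable path, mirroring the companion-object cache in the diff.
object VersionCache {
  private val cache: mutable.Map[String, String] = mutable.Map()

  def versionOf(executable: String)(run: => String): String =
    cache.getOrElseUpdate(executable, run)
}

// The first call evaluates `run`; later calls for the same path hit the cache:
// VersionCache.versionOf("/usr/local/bwa/bwa-0.7.10/bwa") { "0.7.10" }
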
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala

 package nl.lumc.sasc.biopet.core

 import java.util.Properties
 import org.apache.log4j.Logger

-object BiopetExecutable {
+object BiopetExecutable extends Logging {

   val modules: Map[String, List[MainCommand]] = Map(
     "pipeline" -> List(
...
@@ -109,4 +110,17 @@ object BiopetExecutable {
     prop.load(getClass.getClassLoader.getResourceAsStream("git.properties"))
     prop.getProperty("git.commit.id.abbrev")
   }

+  def checkDirtyBuild(logger: Logger) {
+    val prop = new Properties()
+    prop.load(getClass.getClassLoader.getResourceAsStream("git.properties"))
+    val describeShort = prop.getProperty("git.commit.id.describe-short")
+    if (describeShort.endsWith("-dirty")) {
+      logger.warn("**********************************************************")
+      logger.warn("* This JAR was built while there are uncommited changes. *")
+      logger.warn("* Reproducible results are *not* guaranteed. *")
+      logger.warn("**********************************************************")
+    }
+  }
+  checkDirtyBuild(logger)
 }
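checkDirtyBuild reads the git.properties resource bundled into the JAR and warns when git describe reports a dirty working tree. A hedged sketch of that lookup using an in-memory properties string; the sample value is made up, and a real build would generate git.properties via a git metadata plugin:

import java.io.StringReader
import java.util.Properties

// Sketch only: simulate the git.properties lookup with an invented value.
val props = new Properties()
props.load(new StringReader("git.commit.id.describe-short=v0.2.0-12-g53d48f2-dirty"))

val describeShort = props.getProperty("git.commit.id.describe-short")
if (describeShort != null && describeShort.endsWith("-dirty"))
  println("Warning: built from a working tree with uncommitted changes")
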
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/Logging.scala  0 → 100644 (new file)

package nl.lumc.sasc.biopet.core

import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout

trait Logging {
  protected val logger = Logger.getLogger(getClass.getSimpleName.split("\\$").last)

  private[core] val logLayout = new DateLayout() {
    val ignoresThrowable = false
    def format(event: org.apache.log4j.spi.LoggingEvent): String = {
      val calendar: Calendar = Calendar.getInstance
      calendar.setTimeInMillis(event.getTimeStamp)
      val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
      val formattedDate: String = formatter.format(calendar.getTime)
      var logLevel = event.getLevel.toString
      while (logLevel.size < 6) logLevel += " "
      logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
    }
  }

  private[core] val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)

  logger.setLevel(org.apache.log4j.Level.INFO)
  logger.addAppender(stderrAppender)
}
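Classes that previously wired up log4j by hand can now mix in this trait and get a logger named after themselves, already attached to a stderr appender at INFO level. A minimal usage sketch (the object name is hypothetical):

// Sketch only: mix in the new Logging trait to obtain a ready-to-use logger.
object ExampleTool extends Logging {
  def main(args: Array[String]): Unit =
    logger.info("ExampleTool started")
}
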
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/PipelineCommand.scala

 package nl.lumc.sasc.biopet.core

-import org.broadinstitute.gatk.queue.util.Logging
+import org.broadinstitute.gatk.queue.util.{ Logging => GatkLogging }

-trait PipelineCommand extends MainCommand with Logging {
+trait PipelineCommand extends MainCommand with GatkLogging {

   def pipeline = "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/ToolCommand.scala

 package nl.lumc.sasc.biopet.core

 import java.text.SimpleDateFormat
 import java.util.Calendar
 import org.apache.log4j.Logger
 import org.apache.log4j.WriterAppender
 import org.apache.log4j.helpers.DateLayout
 import org.apache.log4j.spi.LoggingEvent
 import java.io.File

-trait ToolCommand extends MainCommand {
+trait ToolCommand extends MainCommand with Logging {

   abstract class AbstractArgs {
   }
...
@@ -45,22 +40,4 @@ trait ToolCommand extends MainCommand {
   type Args <: AbstractArgs
   type OptParser <: AbstractOptParser

-  protected val logger = Logger.getLogger(commandName)
-
-  private val logLayout = new DateLayout() {
-    val ignoresThrowable = false
-    def format(event: org.apache.log4j.spi.LoggingEvent): String = {
-      val calendar: Calendar = Calendar.getInstance
-      calendar.setTimeInMillis(event.getTimeStamp)
-      val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-      val formattedDate: String = formatter.format(calendar.getTime)
-      var logLevel = event.getLevel.toString
-      while (logLevel.size < 6) logLevel += " "
-      logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
-    }
-  }
-
-  private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)
-  logger.setLevel(org.apache.log4j.Level.INFO)
-  logger.addAppender(stderrAppender)
 }
\ No newline at end of file
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/config/Configurable.scala

...
@@ -2,6 +2,7 @@ package nl.lumc.sasc.biopet.core.config
 import java.io.File
 import org.broadinstitute.gatk.queue.util.Logging
+import scala.language.implicitConversions

 trait Configurable extends Logging {
   val root: Configurable
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala

...
@@ -86,4 +86,4 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
     optional("-t", t) + optional("-z", z) + required("-T", threads)
   }
-}
\ No newline at end of file
+}
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala

...
@@ -52,6 +52,14 @@ class MarkDuplicates(val root: Configurable) extends Picard {
   @Argument(doc = "OPTICAL_DUPLICATE_PIXEL_DISTANCE", required = false)
   var opticalDuplicatePixelDistance: Option[Int] = config("opticalDuplicatePixelDistance")

+  @Output(doc = "Bam Index", required = true)
+  private var outputIndex: File = _
+
+  override def afterGraph {
+    super.afterGraph
+    if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
+  }
+
   override def commandLine = super.commandLine +
     repeat("INPUT=", input, spaceSeparated = false) +
     required("OUTPUT=", output, spaceSeparated = false) +
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SortSam.scala

...
@@ -16,6 +16,14 @@ class SortSam(val root: Configurable) extends Picard {
   @Argument(doc = "Sort order of output file Required. Possible values: {unsorted, queryname, coordinate} ", required = true)
   var sortOrder: String = _

+  @Output(doc = "Bam Index", required = true)
+  private var outputIndex: File = _
+
+  override def afterGraph {
+    super.afterGraph
+    if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
+  }
+
   override def commandLine = super.commandLine +
     required("INPUT=", input, spaceSeparated = false) +
     required("OUTPUT=", output, spaceSeparated = false) +
...
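Both Picard wrappers now also declare the BAM index as an @Output, derived by swapping the .bam suffix for .bai, so the pipeline can track the index file produced when createIndex is set. A one-line sketch of that path derivation (the file name is invented):

import java.io.File

// Sketch only: derive the companion index path the way the wrappers above do.
def indexFor(bam: File): File =
  new File(bam.getAbsolutePath.stripSuffix(".bam") + ".bai")

// indexFor(new File("/tmp/example.bam")).getName == "example.bai"
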
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala

...
@@ -3,7 +3,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk
 import nl.lumc.sasc.biopet.core.MultiSampleQScript
 import nl.lumc.sasc.biopet.core.PipelineCommand
 import nl.lumc.sasc.biopet.core.config.Configurable
-import htsjdk.samtools.SAMFileReader
+import htsjdk.samtools.SamReaderFactory
 import scala.collection.JavaConversions._
 import java.io.File
 import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs }
...
@@ -186,7 +186,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScript {
           libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
         } else {
           var readGroupOke = true
-          val inputSam = new SAMFileReader(bamFile)
+          val inputSam = SamReaderFactory.makeDefault.open(bamFile)
           val header = inputSam.getFileHeader.getReadGroups
           for (readGroup <- inputSam.getFileHeader.getReadGroups) {
             if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same")
...
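The SAMFileReader constructor is replaced here (and in several tools below) by the htsjdk SamReaderFactory API. A minimal sketch of the factory idiom as used after this commit; the input path is hypothetical:

import java.io.File
import htsjdk.samtools.{ SamReader, SamReaderFactory, ValidationStringency }

// Sketch only: open a BAM through the factory that supersedes the
// deprecated `new SAMFileReader(file)` constructor.
val reader: SamReader = SamReaderFactory.makeDefault
  .validationStringency(ValidationStringency.SILENT)
  .open(new File("example.bam"))
val header = reader.getFileHeader
reader.close()
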
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkVariantcalling.scala

...
@@ -10,6 +10,7 @@ import org.broadinstitute.gatk.queue.QScript
 import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
 import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
 import scala.collection.SortedMap
+import scala.language.reflectiveCalls

 class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
   def this() = this(null)
...
@@ -83,6 +84,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
     if (variantcalling) {
       var mergBuffer: SortedMap[String, File] = SortedMap()
+      def mergeList = mergBuffer map { case (key, file) => TaggedFile(removeNoneVariants(file), "name=" + key) }

       if (sampleID != null && (useHaplotypecaller.get || config("joint_genotyping", default = false).getBoolean)) {
         val hcGvcf = new HaplotypeCaller(this)
...
@@ -140,10 +142,17 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
       mergBuffer += ("9.raw" -> scriptOutput.rawFilterVcfFile)

       if (useAllelesOption.get) {
+        val tempFile = if (mergeList.toList.size > 1) {
+          val allelesTemp = CombineVariants(this, mergeList.toList, outputDir + outputName + ".alleles_temp.vcf.gz")
+          allelesTemp.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
+          add(allelesTemp, isIntermediate = true)
+          allelesTemp.out
+        } else mergeList.toList.head
         val alleleOnly = new CommandLineFunction {
-          @Input val input: File = scriptOutput.rawFilterVcfFile
+          @Input val input: File = tempFile
           @Output val output: File = outputDir + "raw.allele_only.vcf.gz"
-          @Output val outputindex: File = outputDir + "raw.allele_only.vcf.gz.tbi"
+          @Output val outputindex: File = outputDir + "raw.allele__temp_only.vcf.gz.tbi"
           def commandLine = "zcat " + input + " | cut -f1,2,3,4,5,6,7,8 | bgzip -c > " + output + " && tabix -pvcf " + output
         }
         add(alleleOnly, isIntermediate = true)
...
@@ -180,8 +189,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
         sv.out
       }
-      def mergeList = mergBuffer map { case (key, file) => TaggedFile(removeNoneVariants(file), "name=" + key) }
       val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
       cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
       add(cvFinal)
       scriptOutput.finalVcfFile = cvFinal.out
     }
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala

 package nl.lumc.sasc.biopet.tools

-import htsjdk.samtools.SAMFileReader
-import htsjdk.samtools.SAMSequenceRecord
+import htsjdk.samtools.{ SAMSequenceRecord, SamReaderFactory }
 import java.io.File
 import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
 import nl.lumc.sasc.biopet.core.ToolCommand
...
@@ -68,7 +67,7 @@ object BedToInterval extends ToolCommand {
     val writer = new PrintWriter(commandArgs.outputFile)
-    val inputSam = new SAMFileReader(commandArgs.bamFile)
+    val inputSam = SamReaderFactory.makeDefault.open(commandArgs.bamFile)
     val refs = for (SQ <- inputSam.getFileHeader.getSequenceDictionary.getSequences.toArray) yield {
       val record = SQ.asInstanceOf[SAMSequenceRecord]
       writer.write("@SQ\tSN:" + record.getSequenceName + "\tLN:" + record.getSequenceLength + "\n")
...
@@ -80,10 +79,10 @@ object BedToInterval extends ToolCommand {
     val bedFile = Source.fromFile(commandArgs.inputFile)
     for (
       line <- bedFile.getLines;
-      val split = line.split("\t")
-      if split.size >= 3;
-      val chr = split(0);
-      val start = split(1);
-      val stop = split(2)
-      if start forall Character.isDigit
-      if stop forall Character.isDigit
+      split = line.split("\t")
+      if split.size >= 3;
+      chr = split(0);
+      start = split(1);
+      stop = split(2)
+      if start forall Character.isDigit
+      if stop forall Character.isDigit
     ) {
       if (!refsMap.contains(chr)) throw new IllegalStateException("Chr '" + chr + "' in bed file not found in bam file")
       writer.write(chr + "\t" + start + "\t" + stop + "\t")
...
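The BED parsing loop keeps the same guards but drops the val keyword from the mid-comprehension assignments, which is the form Scala expects (the val there was deprecated). A small self-contained sketch of the same idea, using the brace form of a for comprehension:

// Sketch only: value definitions inside a for comprehension are written
// without `val`, as in the rewritten BED-parsing loop above.
val lines = List("chr1\t100\t200", "garbage", "chr2\t5\t50")
for {
  line <- lines
  split = line.split("\t")
  if split.size >= 3
  if split(1) forall Character.isDigit
} println(split(0) + ":" + split(1) + "-" + split(2))
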
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BiopetFlagstat.scala

 package nl.lumc.sasc.biopet.tools

-import htsjdk.samtools.SAMFileReader
-import htsjdk.samtools.SAMRecord
+import htsjdk.samtools.{ SAMRecord, SamReaderFactory }
 import java.io.File
 import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
 import nl.lumc.sasc.biopet.core.ToolCommand
...
@@ -57,7 +56,7 @@ object BiopetFlagstat extends ToolCommand {
     val argsParser = new OptParser
     val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
-    val inputSam = new SAMFileReader(commandArgs.inputFile)
+    val inputSam = SamReaderFactory.makeDefault.open(commandArgs.inputFile)
     val iterSam = if (commandArgs.region == None) inputSam.iterator else {
       val regionRegex = """(.*):(.*)-(.*)""".r
       commandArgs.region.get match {
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/CheckAllelesVcfInBam.scala

 package nl.lumc.sasc.biopet.tools

-import htsjdk.samtools.SAMFileReader
-import htsjdk.samtools.QueryInterval
-import htsjdk.samtools.SAMRecord
+import htsjdk.samtools.{ QueryInterval, SamReaderFactory, SAMRecord, SamReader }
 import htsjdk.variant.variantcontext.VariantContext
 import htsjdk.variant.variantcontext.VariantContextBuilder
 import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
...
@@ -69,7 +67,8 @@ object CheckAllelesVcfInBam extends ToolCommand {
     if (commandArgs.bamFiles.size != commandArgs.samples.size)
       logger.warn("Number of samples is diffrent then number of bam files, left over will be removed")
-    val bamReaders: Map[String, SAMFileReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => new SAMFileReader(x)): _*)
+    val samReaderFactory = SamReaderFactory.makeDefault
+    val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
     val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))

     val reader = new VCFFileReader(commandArgs.inputFile, false)
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala

...
@@ -21,6 +21,9 @@ object ExtractAlignedFastq extends ToolCommand {
   type FastqInput = (FastqRecord, Option[FastqRecord])

+  /** function to get FastqRecord ID */
+  def fastqId(rec: FastqRecord) = rec.getReadHeader.split(" ")(0)
+
   /**
    * Function to create iterator over Interval given input interval string
    *
...
@@ -90,10 +93,10 @@ object ExtractAlignedFastq extends ToolCommand {
     }

     val queries: Array[QueryInterval] = iv.toList
-      // sort Interval
-      .sortBy(x => (x.getSequence, x.getStart, x.getEnd))
       // transform to QueryInterval
       .map(x => new QueryInterval(getSequenceIndex(x.getSequence), x.getStart, x.getEnd))
+      // sort Interval
+      .sortBy(x => (x.referenceIndex, x.start, x.end))
       // cast to array
       .toArray
...
@@ -113,11 +116,12 @@ object ExtractAlignedFastq extends ToolCommand {
     )

     (pair: FastqInput) => pair._2 match {
-      case None => selected.contains(pair._1.getReadHeader)
+      case None => selected.contains(fastqId(pair._1))
       case Some(x) =>
-        require(commonSuffixLength < pair._1.getReadHeader.length)
-        require(commonSuffixLength < x.getReadHeader.length)
-        selected.contains(pair._1.getReadHeader.dropRight(commonSuffixLength))
+        val rec1Id = fastqId(pair._1)
+        require(commonSuffixLength < rec1Id.length)
+        require(commonSuffixLength < fastqId(x).length)
+        selected.contains(rec1Id.dropRight(commonSuffixLength))
     }
   }
...
@@ -224,6 +228,7 @@ object ExtractAlignedFastq extends ToolCommand {
       minMapQ = commandArgs.minMapQ,
+      commonSuffixLength = commandArgs.commonSuffixLength)

     logger.info("Writing to output file(s) ...")

     (commandArgs.inputFastq2, commandArgs.outputFastq2) match {
       case (None, None) => extractReads(memFunc,
...
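The new fastqId helper keys reads by the header text before the first space, so the mate-pair checks above compare read IDs rather than full headers. A tiny sketch (the header string is invented):

// Sketch only: mirror of the fastqId idea, splitting the header on the
// first space and keeping the ID part.
def fastqId(readHeader: String): String = readHeader.split(" ")(0)

// fastqId("read1/1 simulated from chr1:100-200") == "read1/1"
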
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/FindRepeatsPacBio.scala

-/*
- * Copyright 2014 pjvan_thof.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
 package nl.lumc.sasc.biopet.tools

-import htsjdk.samtools.QueryInterval
-import htsjdk.samtools.SAMFileReader
-import htsjdk.samtools.ValidationStringency
-import htsjdk.samtools.SAMRecord
+import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReaderFactory, ValidationStringency }
 import java.io.File
 import nl.lumc.sasc.biopet.core.ToolCommand
 import scala.io.Source
...
@@ -44,8 +25,9 @@ object FindRepeatsPacBio extends ToolCommand {
     val argsParser = new OptParser
     val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
-    val bamReader = new SAMFileReader(commandArgs.inputBam)
-    bamReader.setValidationStringency(ValidationStringency.SILENT)
+    val bamReader = SamReaderFactory.makeDefault
+      .validationStringency(ValidationStringency.SILENT)
+      .open(commandArgs.inputBam)
     val bamHeader = bamReader.getFileHeader

     val header = List("chr", "startPos", "stopPos", "Repeat_seq", "repeatLength",
...
@@ -55,7 +37,7 @@ object FindRepeatsPacBio extends ToolCommand {
     for (
       bedLine <- Source.fromFile(commandArgs.inputBed).getLines;
-      val values = bedLine.split("\t"); if values.size >= 3
+      values = bedLine.split("\t"); if values.size >= 3
     ) {
       val interval = new QueryInterval(bamHeader.getSequenceIndex(values(0)), values(1).toInt, values(2).toInt)
       val bamIter = bamReader.query(Array(interval), false)
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MpileupToVcf.scala

...
@@ -115,7 +115,7 @@ object MpileupToVcf extends ToolCommand {
     class Counts(var forward: Int, var reverse: Int)
     for (
       line <- inputStream;
-      val values = line.split("\t");
+      values = line.split("\t");
       if values.size > 5
     ) {
       val chr = values(0)
...
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala

...
@@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools
 import java.io.File
 import scala.collection.JavaConverters._
+import scala.io.Source
 import scala.math.{ max, min }

 import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink }
...
@@ -73,7 +74,7 @@ object WipeReads extends ToolCommand {
    *
    * @param inFile input interval file
    */
-  def makeIntervalFromFile(inFile: File): List[Interval] = {
+  def makeIntervalFromFile(inFile: File, gtfFeatureType: String = "exon"): List[Interval] = {
     logger.info("Parsing interval file ...")
...
@@ -83,20 +84,62 @@ object WipeReads extends ToolCommand {
       .asScala
       .map(x => new Interval(x.getChr, x.getStart, x.getEnd))

-    /** Function to create iterator from refFlat file */
-    def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ???
-    // convert coordinate to 1-based fully closed
-    // parse chrom, start blocks, end blocks, strands
+    /**
+     * Parses a refFlat file to yield Interval objects
+     *
+     * Format description:
+     * http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat
+     *
+     * @param inFile input refFlat file
+     */
+    def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] =
+      Source.fromFile(inFile)
+        // read each line
+        .getLines()
+        // skip all empty lines
+        .filterNot(x => x.trim.isEmpty)
+        // split per column
+        .map(line => line.trim.split("\t"))
+        // take chromosome and exonEnds and exonStars
+        .map(x => (x(2), x.reverse.take(2)))
+        // split starts and ends based on comma
+        .map(x => (x._1, x._2.map(y => y.split(","))))
+        // zip exonStarts and exonEnds, note the index was reversed because we did .reverse above
+        .map(x => (x._1, x._2(1).zip(x._2(0))))
+        // make Intervals, accounting for the fact that refFlat coordinates are 0-based
+        .map(x => x._2.map(y => new Interval(x._1, y._1.toInt + 1, y._2.toInt)))
+        // flatten sublist
+        .flatten

-    /** Function to create iterator from GTF file */
-    def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ???
-    // convert coordinate to 1-based fully closed
-    // parse chrom, start blocks, end blocks, strands
+    /**
+     * Parses a GTF file to yield Interval objects
+     *
+     * @param inFile input GTF file
+     * @return
+     */
+    def makeIntervalFromGtf(inFile: File): Iterator[Interval] =
+      Source.fromFile(inFile)
+        // read each line
+        .getLines()
+        // skip all empty lines
+        .filterNot(x => x.trim.isEmpty)
+        // skip all UCSC track lines and/or ensembl comment lines
+        .dropWhile(x => x.matches("^track | ^browser | ^#"))
+        // split to columns
+        .map(x => x.split("\t"))
+        // exclude intervals whose type is different from the supplied one
+        .filter(x => x(2) == gtfFeatureType)
+        // and finally create the interval objects
+        .map(x => new Interval(x(0), x(3).toInt, x(4).toInt))

     // detect interval file format from extension
     val iterFunc: (File => Iterator[Interval]) =
       if (getExtension(inFile.toString.toLowerCase) == "bed") makeIntervalFromBed
       else if (getExtension(inFile.toString.toLowerCase) == "refflat") makeIntervalFromRefFlat
       else if (getExtension(inFile.toString.toLowerCase) == "gtf") makeIntervalFromGtf
       else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
...
@@ -107,7 +150,7 @@ object WipeReads extends ToolCommand {
         acc match {
           case head :: tail if x.intersects(head) =>
             new Interval(x.getSequence, min(x.getStart, head.getStart), max(x.getEnd, head.getEnd)) :: tail
-          case _ => x :: acc
+          case _ => x :: acc
         }
       })
...
@@ -150,12 +193,10 @@ object WipeReads extends ToolCommand {
       else if (iv.getSequence.startsWith("chr")
         && getIndex(iv.getSequence.substring(3)) > -1) {
         logger.warn("Removing 'chr' prefix from interval " + iv.toString)
         Some(new QueryInterval(getIndex(iv.getSequence.substring(3)), iv.getStart, iv.getEnd))
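makeIntervalFromRefFlat keeps only the chrom column and the last two columns (exonStarts and exonEnds), splits them on commas, zips them, and shifts the 0-based starts by one. A short illustrative sketch of that column handling; the refFlat record below is invented:

// Sketch only: the refFlat line is made up for illustration.
val line = "GENE1\tNM_0001\tchr1\t+\t100\t900\t150\t850\t2\t100,500,\t300,900,"

val cols = line.trim.split("\t")
val chrom = cols(2)
// the last two columns are exonStarts and exonEnds (0-based, comma separated)
val Array(ends, starts) = cols.reverse.take(2).map(_.split(","))
val exons = starts.zip(ends).map { case (s, e) => (chrom, s.toInt + 1, e.toInt) }
// exons: Array(("chr1", 101, 300), ("chr1", 501, 900))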