Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
b7690fca
Commit
b7690fca
authored
Jun 27, 2014
by
Peter van 't Hof
Browse files
Added chunking for mapping
parent
6090fa33
Changes
4
Hide whitespace changes
Inline
Side-by-side
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/apps/FastqSplitter.scala
0 → 100644
View file @
b7690fca
package
nl.lumc.sasc.biopet.core.apps
import
java.io.BufferedInputStream
import
java.io.File
import
java.io.FileInputStream
import
java.io.PrintWriter
import
java.util.zip.GZIPInputStream
import
nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import
scala.io.Source
import
nl.lumc.sasc.biopet.core.config.Configurable
import
org.broadinstitute.sting.commandline._
class
FastqSplitter
(
val
root
:
Configurable
)
extends
BiopetJavaCommandLineFunction
{
javaMainClass
=
"nl.lumc.sasc.biopet.core.apps.FastqSplitter"
@Input
(
doc
=
"Input fastq"
,
shortName
=
"input"
,
required
=
true
)
var
input
:
File
=
_
@Output
(
doc
=
"Output fastq files"
,
shortName
=
"output"
,
required
=
true
)
var
output
:
List
[
File
]
=
Nil
override
def
commandLine
=
super
.
commandLine
+
required
(
input
)
+
repeat
(
output
)
}
object
FastqSplitter
{
/**
* @param args the command line arguments
*/
def
main
(
args
:
Array
[
String
])
:
Unit
=
{
val
groupsize
=
100
val
input
=
new
File
(
args
.
head
)
val
output
:
Array
[
PrintWriter
]
=
new
Array
[
PrintWriter
](
args
.
tail
.
size
)
for
(
t
<-
1
to
args
.
tail
.
size
)
output
(
t
-
1
)
=
new
PrintWriter
(
args
(
t
))
val
inputStream
=
{
if
(
input
.
getName
.
endsWith
(
".gz"
)
||
input
.
getName
.
endsWith
(
".gzip"
))
Source
.
fromInputStream
(
new
GZIPInputStream
(
new
BufferedInputStream
(
new
FileInputStream
(
input
)))).
bufferedReader
else
Source
.
fromFile
(
input
).
bufferedReader
}
while
(
inputStream
.
ready
)
{
for
(
writter
<-
output
)
{
for
(
t
<-
1
to
groupsize
)
{
for
(
t
<-
1
to
(
4
))
{
if
(
inputStream
.
ready
)
{
writter
.
write
(
inputStream
.
readLine
+
"\n"
)
}
}
}
writter
.
flush
}
}
for
(
writter
<-
output
)
writter
.
close
}
}
biopet-framework/src/main/scala/nl/lumc/sasc/biopet/function/Cat.scala
0 → 100644
View file @
b7690fca
package
nl.lumc.sasc.biopet.function
import
nl.lumc.sasc.biopet.core._
import
nl.lumc.sasc.biopet.core.config._
import
org.broadinstitute.sting.commandline._
import
java.io.File
class
Cat
(
val
root
:
Configurable
)
extends
BiopetCommandLineFunction
{
@Input
(
doc
=
"Input file"
,
required
=
true
)
var
input
:
List
[
File
]
=
Nil
@Output
(
doc
=
"Unzipped file"
,
required
=
true
)
var
output
:
File
=
_
executeble
=
config
(
"exe"
,
"cat"
)
def
cmdLine
=
required
(
executeble
)
+
repeat
(
input
)
+
" > "
+
required
(
output
)
}
object
Cat
{
def
apply
(
root
:
Configurable
,
input
:
List
[
File
],
output
:
File
)
:
Cat
=
{
val
cat
=
new
Cat
(
root
)
cat
.
input
=
input
cat
.
output
=
output
return
cat
}
}
\ No newline at end of file
flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala
View file @
b7690fca
...
...
@@ -58,8 +58,15 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
}
def
biopetScript
()
{
runInitial
Fastqc
()
runInitial
Jobs
()
if
(
paired
)
runTrimClip
(
outputFiles
(
"fastq_input_R1"
),
outputFiles
(
"fastq_input_R2"
),
outputDir
)
else
runTrimClip
(
outputFiles
(
"fastq_input_R1"
),
outputDir
)
runFinalize
(
List
(
outputFiles
(
"output_R1"
)),
if
(
outputFiles
.
contains
(
"output_R2"
))
List
(
outputFiles
(
"output_R2"
))
else
List
())
}
def
runInitialJobs
()
{
outputFiles
+=
(
"fastq_input_R1"
->
extractIfNeeded
(
input_R1
,
outputDir
))
if
(
paired
)
outputFiles
+=
(
"fastq_input_R2"
->
extractIfNeeded
(
input_R2
,
outputDir
))
...
...
@@ -69,13 +76,6 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
addSha1sum
(
outputFiles
(
"fastq_input_R1"
),
"sha1_R1"
)
if
(
paired
)
addSha1sum
(
outputFiles
(
"fastq_input_R2"
),
"sha1_R2"
)
if
(
paired
)
runTrimClip
(
outputFiles
(
"fastq_input_R1"
),
outputFiles
(
"fastq_input_R2"
),
outputDir
)
else
runTrimClip
(
outputFiles
(
"fastq_input_R1"
),
outputDir
)
runFinalize
(
List
(
outputFiles
(
"output_R1"
)),
if
(
outputFiles
.
contains
(
"output_R2"
))
List
(
outputFiles
(
"output_R2"
))
else
List
())
}
def
runInitialFastqc
()
{
var
fastqc_R1
=
runFastqc
(
input_R1
,
outputDir
+
"/"
+
R1_name
+
".fastqc/"
)
outputFiles
+=
(
"fastqc_R1"
->
fastqc_R1
.
output
)
outputFiles
+=
(
"qualtype_R1"
->
getQualtype
(
fastqc_R1
,
R1_name
))
...
...
@@ -115,12 +115,13 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
def
runTrimClip
(
R1_in
:
File
,
R2_in
:
File
,
outDir
:
String
)
{
runTrimClip
(
R1_in
,
R2_in
,
outDir
,
""
)
}
def
runTrimClip
(
R1_in
:
File
,
R2_in
:
File
,
outDir
:
String
,
chunk
:
String
)
{
def
runTrimClip
(
R1_in
:
File
,
R2_in
:
File
,
outDir
:
String
,
chunkarg
:
String
)
:
(
File
,
File
)
=
{
val
chunk
=
if
(
chunkarg
.
isEmpty
||
chunkarg
.
endsWith
(
"_"
))
chunkarg
else
chunkarg
+
"_"
var
results
:
Map
[
String
,
File
]
=
Map
()
var
R1
:
File
=
new
File
(
R1_in
)
var
R2
:
File
=
new
File
(
R2_in
)
if
(!
skipClip
)
{
// Adapter clipping
val
cutadapt_R1
=
new
Cutadapt
(
this
)
if
(!
skipTrim
||
paired
)
cutadapt_R1
.
isIntermediate
=
true
...
...
@@ -157,12 +158,12 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
sickle
.
input_R1
=
R1
sickle
.
output_R1
=
swapExt
(
outDir
,
R1
,
R1_ext
,
".trim"
+
R1_ext
)
if
(
outputFiles
.
contains
(
"qualtype_R1"
))
sickle
.
qualityTypeFile
=
outputFiles
(
"qualtype_R1"
)
if
(!
skipClip
)
sickle
.
deps
:+=
outputFiles
(
chunk
+
"fastq_input_R1"
)
if
(!
skipClip
)
sickle
.
deps
:+=
R1_in
if
(
paired
)
{
sickle
.
input_R2
=
R2
sickle
.
output_R2
=
swapExt
(
outDir
,
R2
,
R2_ext
,
".trim"
+
R2_ext
)
sickle
.
output_singles
=
swapExt
(
outDir
,
R2
,
R2_ext
,
".trim.singles"
+
R1_ext
)
if
(!
skipClip
)
sickle
.
deps
:+=
outputFiles
(
chunk
+
"fastq_input_R2"
)
if
(!
skipClip
)
sickle
.
deps
:+=
R2_in
}
sickle
.
output_stats
=
swapExt
(
outDir
,
R1
,
R1_ext
,
".trim.stats"
)
add
(
sickle
)
...
...
@@ -170,8 +171,9 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
if
(
paired
)
R2
=
sickle
.
output_R2
}
outputFiles
+=
(
"output_R1"
->
R1
)
if
(
paired
)
outputFiles
+=
(
"output_R2"
->
R2
)
outputFiles
+=
(
chunk
+
"output_R1"
->
R1
)
if
(
paired
)
outputFiles
+=
(
chunk
+
"output_R2"
->
R2
)
return
(
R1
,
R2
)
}
def
runFinalize
(
fastq_R1
:
List
[
File
],
fastq_R2
:
List
[
File
])
{
...
...
@@ -182,10 +184,17 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
for
(
file
<-
fastq_R1
)
R1
=
file
for
(
file
<-
fastq_R2
)
R2
=
file
}
else
{
throw
new
IllegalStateException
(
"Not yet inplemented"
)
// For chuncking
R1
=
new
File
(
outputDir
+
R1_name
+
".qc"
+
R1_ext
)
logger
.
debug
(
fastq_R1
)
add
(
Cat
(
this
,
fastq_R1
,
R1
),
true
)
if
(
paired
)
{
logger
.
debug
(
fastq_R2
)
R2
=
new
File
(
outputDir
+
R2_name
+
".qc"
+
R2_ext
)
add
(
Cat
(
this
,
fastq_R2
,
R2
),
true
)
}
}
if
(!
config
(
"skip_native_link"
,
false
))
{
if
(
fastq_R1
.
length
==
1
&&
!
config
(
"skip_native_link"
,
false
))
{
val
lnR1
=
new
Ln
(
this
)
lnR1
.
in
=
R1
R1
=
new
File
(
outputDir
+
R1_name
+
".qc"
+
R1_ext
)
...
...
@@ -200,6 +209,9 @@ class Flexiprep(val root:Configurable) extends QScript with BiopetQScript {
}
}
outputFiles
+=
(
"output_R1"
->
R1
)
if
(
paired
)
outputFiles
+=
(
"output_R2"
->
R2
)
if
(!
skipTrim
||
!
skipClip
)
{
addSeqstat
(
R1
,
"seqstat_qc_R1"
)
if
(
paired
)
addSeqstat
(
R2
,
"seqstat_qc_R2"
)
...
...
mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
View file @
b7690fca
...
...
@@ -4,16 +4,19 @@ import nl.lumc.sasc.biopet.function._
import
nl.lumc.sasc.biopet.function.aligners._
import
java.util.Date
import
nl.lumc.sasc.biopet.core._
import
nl.lumc.sasc.biopet.core.apps.FastqSplitter
import
nl.lumc.sasc.biopet.core.config._
import
nl.lumc.sasc.biopet.pipelines.flexiprep._
import
org.broadinstitute.sting.queue.QScript
import
org.broadinstitute.sting.queue.extensions.gatk._
import
org.broadinstitute.sting.queue.extensions.picard.MarkDuplicates
import
org.broadinstitute.sting.queue.extensions.picard.MergeSamFiles
import
org.broadinstitute.sting.queue.extensions.picard.SortSam
import
org.broadinstitute.sting.queue.extensions.picard.AddOrReplaceReadGroups
import
org.broadinstitute.sting.queue.function._
import
scala.util.parsing.json._
import
org.broadinstitute.sting.utils.variant._
import
scala.math._
class
Mapping
(
val
root
:
Configurable
)
extends
QScript
with
BiopetQScript
{
qscript
=>
...
...
@@ -40,6 +43,9 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
@Argument
(
doc
=
"Reference"
,
shortName
=
"R"
,
required
=
false
)
var
referenceFile
:
File
=
_
@Argument
(
doc
=
"auto chuning"
,
shortName
=
"chunking"
,
required
=
false
)
var
chunking
:
Boolean
=
false
// Readgroup items
@Argument
(
doc
=
"Readgroup ID"
,
shortName
=
"RGID"
,
required
=
false
)
var
RGID
:
String
=
_
...
...
@@ -68,9 +74,8 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
@Argument
(
doc
=
"Readgroup predicted insert size"
,
shortName
=
"RGPI"
,
required
=
false
)
var
RGPI
:
Int
=
_
var
paired
:
Boolean
=
false
var
numberChunks
=
0
def
init
()
{
for
(
file
<-
configfiles
)
globalConfig
.
loadConfigFile
(
file
)
...
...
@@ -99,42 +104,101 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
if
(
RGDS
==
null
&&
configContains
(
"RGDS"
))
RGDS
=
config
(
"RGDS"
)
if
(
outputName
==
null
)
outputName
=
RGID
if
(!
chunking
&&
numberChunks
>
1
)
chunking
=
true
if
(!
chunking
)
chunking
=
config
(
"chunking"
,
false
)
if
(
chunking
)
{
val
chunkSize
:
Int
=
config
(
"chunksize"
,
(
1
<<
30
))
val
filesize
=
if
(
input_R1
.
getName
.
endsWith
(
".gz"
)
||
input_R1
.
getName
.
endsWith
(
".gzip"
))
input_R1
.
length
*
3
else
input_R1
.
length
if
(
numberChunks
==
0
&&
configContains
(
"numberchunks"
))
numberChunks
=
config
(
"numberchunks"
)
else
if
(
numberChunks
==
0
)
numberChunks
=
ceil
(
filesize
.
toDouble
/
chunkSize
).
toInt
logger
.
debug
(
"Chunks: "
+
numberChunks
)
}
}
def
biopetScript
()
{
var
fastq_R1
:
String
=
input_R1
var
fastq_R2
:
String
=
if
(
paired
)
input_R2
else
""
var
fastq_R1
:
File
=
input_R1
var
fastq_R2
:
File
=
if
(
paired
)
input_R2
else
""
val
flexiprep
=
new
Flexiprep
(
this
)
var
bamFiles
:
List
[
File
]
=
Nil
var
fastq_R1_output
:
List
[
File
]
=
Nil
var
fastq_R2_output
:
List
[
File
]
=
Nil
def
removeGz
(
file
:
String
)
:
String
=
{
if
(
file
.
endsWith
(
".gz"
))
return
file
.
substring
(
0
,
file
.
lastIndexOf
(
".gz"
))
else
if
(
file
.
endsWith
(
".gzip"
))
return
file
.
substring
(
0
,
file
.
lastIndexOf
(
".gzip"
))
else
return
file
}
var
chunks
:
Map
[
String
,
(
String
,
String
)]
=
Map
()
if
(
chunking
)
for
(
t
<-
1
to
numberChunks
)
{
chunks
+=
(
"chunk_"
+
t
->
(
removeGz
(
outputDir
+
"chunk_"
+
t
+
"/"
+
fastq_R1
.
getName
),
if
(
paired
)
removeGz
(
outputDir
+
"chunk_"
+
t
+
"/"
+
fastq_R2
.
getName
)
else
""
))
}
else
chunks
+=
(
""
->
(
fastq_R1
,
fastq_R2
))
if
(
chunking
)
{
val
fastSplitter_R1
=
new
FastqSplitter
(
this
)
fastSplitter_R1
.
input
=
fastq_R1
fastSplitter_R1
.
memoryLimit
=
4
fastSplitter_R1
.
jobResourceRequests
:+=
"h_vmem=8G"
for
((
chunk
,
fastqfile
)
<-
chunks
)
fastSplitter_R1
.
output
:+=
fastqfile
.
_1
add
(
fastSplitter_R1
)
if
(
paired
)
{
val
fastSplitter_R2
=
new
FastqSplitter
(
this
)
fastSplitter_R2
.
input
=
fastq_R2
fastSplitter_R2
.
memoryLimit
=
4
fastSplitter_R2
.
jobResourceRequests
:+=
"h_vmem=8G"
for
((
chunk
,
fastqfile
)
<-
chunks
)
fastSplitter_R2
.
output
:+=
fastqfile
.
_2
add
(
fastSplitter_R2
)
}
}
for
((
chunk
,
fastqfile
)
<-
chunks
)
{
var
R1
=
fastqfile
.
_1
var
R2
=
fastqfile
.
_2
if
(!
skipFlexiprep
)
{
flexiprep
.
input_R1
=
fastq_R1
if
(
paired
)
flexiprep
.
input_R2
=
fastq_R2
flexiprep
.
outputDir
=
outputDir
+
"flexiprep/"
flexiprep
.
init
flexiprep
.
runInitialJobs
//flexiprep.biopetScript
val
flexiout
=
flexiprep
.
runTrimClip
(
R1
,
R2
,
outputDir
+
chunk
,
chunk
)
logger
.
debug
(
chunk
+
" - "
+
flexiout
)
R1
=
flexiout
.
_1
if
(
paired
)
R2
=
flexiout
.
_2
fastq_R1_output
:+=
R1
fastq_R2_output
:+=
R2
}
val
chunkDir
=
if
(
chunk
.
isEmpty
)
outputDir
else
outputDir
+
chunk
+
"/"
if
(
aligner
==
"bwa"
)
{
val
bwaCommand
=
new
Bwa
(
this
)
bwaCommand
.
R1
=
R1
if
(
paired
)
bwaCommand
.
R2
=
R2
bwaCommand
.
RG
=
getReadGroup
bwaCommand
.
output
=
new
File
(
chunkDir
+
outputName
+
".sam"
)
add
(
bwaCommand
,
isIntermediate
=
true
)
bamFiles
:+=
addSortSam
(
List
(
bwaCommand
.
output
),
swapExt
(
chunkDir
,
bwaCommand
.
output
,
".sam"
,
".bam"
),
chunkDir
)
}
else
if
(
aligner
==
"star"
)
{
val
starCommand
=
Star
(
this
,
R1
,
if
(
paired
)
R2
else
null
,
outputDir
,
isIntermediate
=
true
)
add
(
starCommand
)
bamFiles
:+=
addAddOrReplaceReadGroups
(
List
(
starCommand
.
outputSam
),
new
File
(
chunkDir
+
outputName
+
".bam"
),
chunkDir
)
}
else
if
(
aligner
==
"star-2pass"
)
{
val
star2pass
=
Star
.
_2pass
(
this
,
R1
,
if
(
paired
)
R2
else
null
,
chunkDir
,
isIntermediate
=
true
)
addAll
(
star2pass
.
_2
)
bamFiles
:+=
addAddOrReplaceReadGroups
(
List
(
star2pass
.
_1
),
new
File
(
chunkDir
+
outputName
+
".bam"
),
chunkDir
)
}
else
throw
new
IllegalStateException
(
"Option Alginer: '"
+
aligner
+
"' is not valid"
)
}
if
(!
skipFlexiprep
)
{
val
flexiprep
=
new
Flexiprep
(
this
)
flexiprep
.
input_R1
=
fastq_R1
if
(
paired
)
flexiprep
.
input_R2
=
fastq_R2
flexiprep
.
outputDir
=
outputDir
+
"flexiprep/"
flexiprep
.
init
flexiprep
.
biopetScript
flexiprep
.
runFinalize
(
fastq_R1_output
,
fastq_R2_output
)
addAll
(
flexiprep
.
functions
)
// Add function of flexiprep to curent function pool
fastq_R1
=
flexiprep
.
outputFiles
(
"output_R1"
)
if
(
paired
)
fastq_R2
=
flexiprep
.
outputFiles
(
"output_R2"
)
}
var
bamFile
:
File
=
null
if
(
aligner
==
"bwa"
)
{
val
bwaCommand
=
new
Bwa
(
this
)
bwaCommand
.
R1
=
fastq_R1
if
(
paired
)
bwaCommand
.
R2
=
fastq_R2
bwaCommand
.
RG
=
getReadGroup
bwaCommand
.
output
=
new
File
(
outputDir
+
outputName
+
".sam"
)
add
(
bwaCommand
,
isIntermediate
=
true
)
bamFile
=
addSortSam
(
List
(
bwaCommand
.
output
),
swapExt
(
outputDir
,
bwaCommand
.
output
,
".sam"
,
".bam"
),
outputDir
)
}
else
if
(
aligner
==
"star"
)
{
val
starCommand
=
Star
(
this
,
fastq_R1
,
if
(
paired
)
fastq_R2
else
null
,
outputDir
,
isIntermediate
=
true
)
add
(
starCommand
)
bamFile
=
addAddOrReplaceReadGroups
(
List
(
starCommand
.
outputSam
),
new
File
(
outputDir
+
outputName
+
".bam"
),
outputDir
)
}
else
if
(
aligner
==
"star-2pass"
)
{
val
star2pass
=
Star
.
_2pass
(
this
,
fastq_R1
,
if
(
paired
)
fastq_R2
else
null
,
outputDir
,
isIntermediate
=
true
)
addAll
(
star2pass
.
_2
)
bamFile
=
addAddOrReplaceReadGroups
(
List
(
star2pass
.
_1
),
new
File
(
outputDir
+
outputName
+
".bam"
),
outputDir
)
}
else
throw
new
IllegalStateException
(
"Option Alginer: '"
+
aligner
+
"' is not valid"
)
if
(!
skipMarkduplicates
)
bamFile
=
addMarkDuplicates
(
List
(
bamFile
),
swapExt
(
outputDir
,
bamFile
,
".bam"
,
".dedup.bam"
),
outputDir
)
var
bamFile
=
""
if
(!
skipMarkduplicates
)
bamFile
=
addMarkDuplicates
(
bamFiles
,
new
File
(
outputDir
+
outputName
+
".dedup.bam"
),
outputDir
)
if
(
skipMarkduplicates
&&
chunking
)
bamFile
=
addMergeBam
(
bamFiles
,
new
File
(
outputDir
+
outputName
+
".bam"
),
outputDir
)
outputFiles
+=
(
"finalBamFile"
->
bamFile
)
}
...
...
@@ -152,6 +216,22 @@ class Mapping(val root:Configurable) extends QScript with BiopetQScript {
return
sortSam
.
output
}
def
addMergeBam
(
inputSam
:
List
[
File
],
outputFile
:
File
,
dir
:
String
)
:
File
=
{
val
mergeSam
=
new
MergeSamFiles
mergeSam
.
input
=
inputSam
mergeSam
.
createIndex
=
true
mergeSam
.
output
=
outputFile
mergeSam
.
memoryLimit
=
2
mergeSam
.
nCoresRequest
=
2
mergeSam
.
assumeSorted
=
true
mergeSam
.
USE_THREADING
=
true
mergeSam
.
jobResourceRequests
:+=
"h_vmem=4G"
if
(!
skipMarkduplicates
)
mergeSam
.
isIntermediate
=
true
add
(
mergeSam
)
return
mergeSam
.
output
}
def
addAddOrReplaceReadGroups
(
inputSam
:
List
[
File
],
outputFile
:
File
,
dir
:
String
)
:
File
=
{
val
addOrReplaceReadGroups
=
new
AddOrReplaceReadGroups
addOrReplaceReadGroups
.
input
=
inputSam
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment