Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
biopet.biopet
Commits
a748525c
Commit
a748525c
authored
Nov 29, 2016
by
Peter van 't Hof
Browse files
Merge remote-tracking branch 'remotes/origin/develop' into fix-BIOPET-374
parents
6e3f759c
39070750
Changes
35
Hide whitespace changes
Inline
Side-by-side
biopet-package/src/main/scala/nl/lumc/sasc/biopet/BiopetExecutableMain.scala
View file @
a748525c
...
...
@@ -14,7 +14,6 @@
*/
package
nl.lumc.sasc.biopet
import
nl.lumc.sasc.biopet.pipelines.generateindexes.GenerateIndexes
import
nl.lumc.sasc.biopet.utils.
{
BiopetExecutable
,
MainCommand
}
object
BiopetExecutableMain
extends
BiopetExecutable
{
...
...
@@ -38,7 +37,8 @@ object BiopetExecutableMain extends BiopetExecutable {
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
shiva
.
ShivaVariantcalling
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
basty
.
Basty
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
shiva
.
Shiva
,
GenerateIndexes
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
generateindexes
.
DownloadGenomes
,
nl
.
lumc
.
sasc
.
biopet
.
pipelines
.
generateindexes
.
GenerateIndexes
)
def
tools
:
List
[
MainCommand
]
=
BiopetToolsExecutable
.
tools
...
...
biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/DownloadNcbiAssembly.scala
0 → 100644
View file @
a748525c
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package
nl.lumc.sasc.biopet.extensions.tools
import
java.io.File
import
nl.lumc.sasc.biopet.core.ToolCommandFunction
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
org.broadinstitute.gatk.utils.commandline.
{
Input
,
Output
}
/**
 * Queue wrapper around the `nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly` tool:
 * downloads an NCBI assembly as a FASTA file plus the NCBI assembly report.
 *
 * NOTE(review): the original scaladoc said "@deprecated Use picard.util.BedToIntervalList
 * instead", which is boilerplate copied from an unrelated wrapper — this tool does not
 * overlap with BedToIntervalList and is not deprecated; the bogus tag has been removed.
 */
class DownloadNcbiAssembly(val root: Configurable) extends ToolCommandFunction {
  /** The tool object whose main class is invoked on the command line. */
  def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly

  @Output(doc = "Output fasta file", required = true)
  var output: File = _

  // NCBI assembly report written next to the FASTA (passed as --report, required by cmdLine).
  var outputReport: File = _

  // RefSeq assembly accession to download, e.g. "GCF_000844745.1" (passed as -a, required).
  var assemblyId: String = null

  // Optional report column to use for contig names (passed as --nameHeader).
  var nameHeader: Option[String] = None

  // "column=regex" filter conditions forwarded to the tool; each entry becomes one
  // --mustHaveOne / --mustNotHave argument.
  var mustHaveOne: List[String] = Nil
  var mustNotHave: List[String] = Nil

  // Download + report parsing is light on CPU but benefits from headroom for the JVM.
  override def defaultCoreMemory = 4.0

  override def cmdLine = super.cmdLine +
    required("-a", assemblyId) +
    required("--report", outputReport) +
    required("-o", output) +
    optional("--nameHeader", nameHeader) +
    repeat("--mustHaveOne", mustHaveOne) +
    repeat("--mustNotHave", mustNotHave)
}
biopet-tools-package/src/main/scala/nl/lumc/sasc/biopet/BiopetToolsExecutable.scala
View file @
a748525c
...
...
@@ -50,5 +50,6 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfToTsv
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VcfWithVcf
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
VepNormalizer
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
WipeReads
)
nl
.
lumc
.
sasc
.
biopet
.
tools
.
WipeReads
,
nl
.
lumc
.
sasc
.
biopet
.
tools
.
DownloadNcbiAssembly
)
}
biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssembly.scala
0 → 100644
View file @
a748525c
package
nl.lumc.sasc.biopet.tools
import
java.io.
{
File
,
PrintWriter
}
import
nl.lumc.sasc.biopet.utils.ToolCommand
import
scala.io.Source
/**
* Created by pjvan_thof on 21-9-16.
*/
/**
 * Command-line tool that downloads all (optionally filtered) contigs of an NCBI
 * assembly as FASTA via the NCBI eutils efetch service, driven by the assembly
 * report fetched from the NCBI FTP site.
 */
object DownloadNcbiAssembly extends ToolCommand {

  /**
   * Parsed command-line arguments.
   *
   * @param assemblyId       RefSeq assembly accession, e.g. "GCF_000844745.1"
   * @param outputFile       FASTA output file
   * @param reportFile       optional copy of the NCBI assembly report
   * @param contigNameHeader optional report column used to rename contigs in the FASTA headers
   * @param mustHaveOne      (column, value) conditions; a contig is kept if at least one matches
   * @param mustNotHave      (column, value) conditions; a contig is dropped if any matches
   */
  case class Args(assemblyId: String = null,
                  outputFile: File = null,
                  reportFile: Option[File] = None,
                  contigNameHeader: Option[String] = None,
                  mustHaveOne: List[(String, String)] = List(),
                  mustNotHave: List[(String, String)] = List()) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    // NOTE(review): the long option name "assembly id" contains a space and therefore
    // cannot be used as "--assembly id" on a command line; only the short form -a is
    // usable. Kept as-is because renaming it would change the public interface.
    opt[String]('a', "assembly id") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(assemblyId = x)
    } text "refseq ID from NCBI"
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(outputFile = x)
    } text "output Fasta file"
    opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
      c.copy(reportFile = Some(x))
    } text "where to write report from ncbi"
    opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
      c.copy(contigNameHeader = Some(x))
    } text
      """
        | What column to use from the NCBI report for the name of the contigs.
        | All columns in the report can be used but this are the most common field to choose from:
        | - 'Sequence-Name': Name of the contig within the assembly
        | - 'UCSC-style-name': Name of the contig used by UCSC ( like hg19 )
        | - 'RefSeq-Accn': Unique name of the contig at RefSeq (default for NCBI)""".stripMargin
    // NOTE(review): despite the "<column_name=regex>" valueName, both filters below are
    // applied with plain string equality (== / !=), not regex matching — see main().
    opt[(String, String)]("mustHaveOne") unbounded () valueName "<column_name=regex>" action { (x, c) =>
      c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, at least 1 should be true"
    opt[(String, String)]("mustNotHave") unbounded () valueName "<column_name=regex>" action { (x, c) =>
      c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, all should be false"
  }

  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)

    logger.info(s"Reading ${cmdargs.assemblyId} from NCBI")
    // Fetch the assembly report; close the reader as soon as the lines are materialized.
    val reader = Source.fromURL(
      s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
    val assemblyReport =
      try reader.getLines().toList
      finally reader.close()

    // Optionally write a verbatim copy of the report (fix: writer is now closed even
    // if println throws).
    cmdargs.reportFile.foreach { file =>
      val writer = new PrintWriter(file)
      try assemblyReport.foreach(writer.println)
      finally writer.close()
    }

    // The last '#'-prefixed line of the report is the tab-separated column header row;
    // map each column name to its index.
    val headers = assemblyReport
      .filter(_.startsWith("#"))
      .last
      .stripPrefix("# ")
      .split("\t")
      .zipWithIndex
      .toMap
    // Column index used to rename contigs, if a name header was requested
    // (throws if the requested column does not exist in the report).
    val nameId = cmdargs.contigNameHeader.map(x => headers(x))
    val lengthId = headers.get("Sequence-Length")

    val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    val fastaWriter = new PrintWriter(cmdargs.outputFile)
    // Fix: ensure the FASTA writer is closed even when a download fails mid-way,
    // so a partial output file is at least flushed and not left open.
    try {
      // All non-comment report rows, split into their tab-separated columns.
      val allContigs = assemblyReport.filter(!_.startsWith("#")).map(_.split("\t"))
      val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)

      logger.info(s"${allContigs.size} contigs found")
      totalLength.foreach(l => logger.info(s"Total length: ${l}"))

      // Keep a contig when no mustNotHave condition matches AND (no mustHaveOne
      // conditions were given OR at least one of them matches). Comparison is
      // string equality, not regex.
      val filterContigs = allContigs
        .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
        .filter(values =>
          cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) ||
            cmdargs.mustHaveOne.isEmpty)
      val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)

      logger.info(s"${filterContigs.size} contigs left after filtering")
      filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))

      filterContigs.foreach { values =>
        // Column 6 is RefSeq-Accn, column 4 GenBank-Accn in the NCBI assembly report
        // layout; fall back to GenBank when RefSeq is "na" — assumes the standard
        // report column order (TODO confirm against report format versions).
        val id = if (values(6) == "na") values(4) else values(6)
        logger.info(s"Start download ${id}")
        val fastaReader = Source.fromURL(
          s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
        try {
          fastaReader
            .getLines()
            // Optionally rewrite the ">" header prefix to ">{requested column value} ".
            .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
            .foreach(fastaWriter.println)
        } finally fastaReader.close()
      }
      logger.info("Downloading complete")
    } finally fastaWriter.close()
  }
}
biopet-tools/src/test/resources/GCF_000844745.1.report
0 → 100644
View file @
a748525c
# Assembly name: ViralProj14444
# Organism name: Ageratum yellow vein betasatellite (viruses)
# Taxid: 185750
# BioProject: PRJNA14444
# Submitter: Stanley J., Department of Virus Research, John Innes Centre, Norwich Research Park, Colney, Norwich, NR4 7UH, UNITED KINGDOM
# Date: 2000-6-14
# Assembly type: n/a
# Release type: major
# Assembly level: Complete Genome
# Genome representation: full
# RefSeq assembly accession: GCF_000844745.1
#
## Assembly-Units:
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
## GCF_000844735.1 Primary assembly
#
# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
# unlocalized scaffolds.
# Unplaced scaffolds are listed at the end.
# RefSeq is equal or derived from GenBank object.
#
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
Unknown assembled-molecule Unknown Segment na <> NC_003403.1 Primary assembly 1347 na
\ No newline at end of file
biopet-tools/src/test/resources/NC_003403.1.fasta
0 → 100644
View file @
a748525c
>NC_003403.1 Ageratum yellow vein virus-associated DNA beta, complete genome
ACCGGTGGCGAGCGGGGAATTTGGGTTCCAGGTGGGACCCACACCTGGAACAGCCTGGAAAGAACGTGAG
TGGGCCAAATAAGTGGGCCGTTGAATTGGGCTTTTTATTCAAAACTATCTCAAATTTACGAAGCCCATTA
TTATTCATTTGAGAAAAATTACATTAAACATAGTCCGTATAAATATACATTTATACGGTTACATTCTTGT
ATACATGGTATTCGTCGTTTACTTGGATATCTATTACTGGAGCTTCCTGTTGCATCATGATATCTATTAT
TTCAACCATGTCTTCTGATTTGAACTCTTCTATTGTTGAATCTCTATACATGATCCTTAGTATGTTCTTG
ATACCTTCTTCTAAAGCATTAAAATTGAATGGTGGTATGATACCGTGATGTCCGTATGGAATGGAGAATT
TGTTCTTGACTAGTGCTGGTGACCTTGTTGAAAACACTTCTACGTCAACTCTGAATTTTATTTCTGATCT
GAACTTGACGTTGATGATGAACAATATTCCTTTATCGTTGGTATATGATATAGTCATGATTGTTTGATGG
AAATATATTCATTTGATGTGCATTTATAATTGGATGTGTCTGTTGTATGTGGTTGTGATTAATGATCGTG
TCAAATTTATGTTCTTGTATTAATTAGTGTATGTGTGACTGTATTGATGTTGATGTTTGTGATTATAATT
GAGAAAAGAAGATATTCATTCATTTGAATAAGAAAAGAAAGAAAAAAAAAAGAAAACTATATACATAAAA
AAAACGAATAATAAAAATAACTATCTATTCAGAAAAAATGGGAGCGCAGCGAAAACGAAAACAAAAATCG
ATATCAAAGAAAATCCTGATAAATTAAATAGACAGAATTGAAAAGGAAAAAAGTAAAAAAAGATAAAAGT
TATTACATTAAAAAAACTAAAGAAAAAAAAAATAAAAAAACTAAAAATGATAAATAAAAAAATGACGTCA
TCTGATGTCATCACAGTTAAAAACTATTTACTGCGGTTAAAATTTAATATAATTAACCACTTGGGTAAAT
ATATTGTCTCCAATAGGTAAATTGTCTCCAATATATCGGAGACATATTGGAGACTCTGGAGACACTTATA
TGCTAATTTCCCATTTTACCCTTACTAATGTGACTGTAAGGCGCGTGGGAGTGCTCTGAAAAAGTAGACC
TTCTCTCTCCAAAACTCCCCGGAGAAGACGATACAGGCTGATCCCGGCATCAATTCGCGACACGCGCGGC
GGTGTGTACCCCTGGGAGGGTAGAAACCACTACGCTACGCAGCAGCCTTAGCTACGCCGGAGCTTAGCTC
GCCACCGTAATAATATT
biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/DownloadNcbiAssemblyTest.scala
0 → 100644
View file @
a748525c
package
nl.lumc.sasc.biopet.tools
import
java.io.File
import
java.nio.file.Paths
import
org.scalatest.Matchers
import
org.scalatest.testng.TestNGSuite
import
org.testng.annotations.Test
import
scala.io.Source
/**
* Created by pjvanthof on 03/10/16.
*/
/**
 * Integration test for [[DownloadNcbiAssembly]]: downloads assembly
 * GCF_000844745.1 from NCBI and compares the resulting FASTA and report
 * against checked-in reference resources. Requires network access.
 */
class DownloadNcbiAssemblyTest extends TestNGSuite with Matchers {

  /** Resolves a classpath test resource to an absolute filesystem path. */
  private def resourcePath(p: String): String =
    Paths.get(getClass.getResource(p).toURI).toString

  /** Reads every line of a file into a list (helper for the comparisons below). */
  private def linesOf(file: File): List[String] =
    Source.fromFile(file).getLines().toList

  @Test
  def testNC_003403_1: Unit = {
    val fastaOut = File.createTempFile("test.", ".fasta")
    val reportOut = File.createTempFile("test.", ".report")
    fastaOut.deleteOnExit()
    reportOut.deleteOnExit()

    DownloadNcbiAssembly.main(
      Array("-a", "GCF_000844745.1",
            "-o", fastaOut.getAbsolutePath,
            "--report", reportOut.getAbsolutePath))

    linesOf(fastaOut) shouldBe linesOf(new File(resourcePath("/NC_003403.1.fasta")))
    linesOf(reportOut) shouldBe linesOf(new File(resourcePath("/GCF_000844745.1.report")))
  }
}
biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/config/Configurable.scala
View file @
a748525c
...
...
@@ -103,11 +103,12 @@ trait Configurable extends ImplicitConversions {
freeVar
:
Boolean
=
true
,
sample
:
String
=
null
,
library
:
String
=
null
,
extraSubPath
:
List
[
String
]
=
Nil
,
path
:
List
[
String
]
=
null
)
:
ConfigValue
=
{
val
s
=
if
(
sample
!=
null
||
defaultSample
.
isEmpty
)
sample
else
defaultSample
.
get
val
l
=
if
(
library
!=
null
||
defaultLibrary
.
isEmpty
)
library
else
defaultLibrary
.
get
val
m
=
if
(
namespace
!=
null
)
namespace
else
configNamespace
val
p
=
if
(
path
==
null
)
getConfigPath
(
s
,
l
,
namespace
)
:::
subPath
else
p
ath
val
p
=
if
(
path
==
null
)
getConfigPath
(
s
,
l
,
namespace
)
:::
subPath
:::
extraSubPath
else
path
:::
extraSubP
ath
val
d
=
{
val
value
=
Config
.
getValueFromMap
(
internalDefaults
,
ConfigValueIndex
(
m
,
p
,
key
,
freeVar
))
if
(
value
.
isDefined
)
value
.
get
.
value
else
default
...
...
@@ -130,11 +131,12 @@ trait Configurable extends ImplicitConversions {
freeVar
:
Boolean
=
true
,
sample
:
String
=
null
,
library
:
String
=
null
,
extraSubPath
:
List
[
String
]
=
Nil
,
path
:
List
[
String
]
=
null
)
=
{
val
s
=
if
(
sample
!=
null
||
defaultSample
.
isEmpty
)
sample
else
defaultSample
.
get
val
l
=
if
(
library
!=
null
||
defaultLibrary
.
isEmpty
)
library
else
defaultLibrary
.
get
val
m
=
if
(
namespace
!=
null
)
namespace
else
configNamespace
val
p
=
if
(
path
==
null
)
getConfigPath
(
s
,
l
,
namespace
)
:::
subPath
else
p
ath
val
p
=
if
(
path
==
null
)
getConfigPath
(
s
,
l
,
namespace
)
:::
subPath
:::
extraSubPath
else
path
:::
extraSubP
ath
globalConfig
.
contains
(
m
,
p
,
key
,
freeVar
,
internalFixedValues
)
||
Config
.
getValueFromMap
(
internalDefaults
,
ConfigValueIndex
(
m
,
p
,
key
,
freeVar
)).
isDefined
}
...
...
gears/src/main/scala/nl/lumc/sasc/biopet/pipelines/gears/Gears.scala
View file @
a748525c
...
...
@@ -141,12 +141,20 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs
class
Library
(
libId
:
String
)
extends
AbstractLibrary
(
libId
)
{
lazy
val
flexiprep
=
new
Flexiprep
(
qscript
)
flexiprep
.
sampleId
=
Some
(
sampleId
)
flexiprep
.
libId
=
Some
(
libId
)
flexiprep
.
inputR1
=
config
(
"R1"
)
flexiprep
.
inputR2
=
config
(
"R2"
)
flexiprep
.
outputDir
=
new
File
(
libDir
,
"flexiprep"
)
lazy
val
inputR1
:
File
=
config
(
"R1"
)
lazy
val
inputR2
:
Option
[
File
]
=
config
(
"R2"
)
lazy
val
skipFlexiprep
:
Boolean
=
config
(
"skip_flexiprep"
,
default
=
false
)
lazy
val
flexiprep
=
if
(
skipFlexiprep
)
None
else
Some
(
new
Flexiprep
(
qscript
))
flexiprep
.
foreach
(
_
.
sampleId
=
Some
(
sampleId
))
flexiprep
.
foreach
(
_
.
libId
=
Some
(
libId
))
flexiprep
.
foreach
(
_
.
inputR1
=
inputR1
)
flexiprep
.
foreach
(
_
.
inputR2
=
inputR2
)
flexiprep
.
foreach
(
_
.
outputDir
=
new
File
(
libDir
,
"flexiprep"
))
lazy
val
qcR1
:
File
=
flexiprep
.
map
(
_
.
fastqR1Qc
).
getOrElse
(
inputR1
)
lazy
val
qcR2
:
Option
[
File
]
=
flexiprep
.
map
(
_
.
fastqR2Qc
).
getOrElse
(
inputR2
)
val
libraryGears
:
Boolean
=
config
(
"library_gears"
,
default
=
false
)
...
...
@@ -154,17 +162,17 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs
/** Function that add library jobs */
protected
def
addJobs
()
:
Unit
=
{
inputFiles
:+=
InputFile
(
flexiprep
.
inputR1
,
config
(
"R1_md5"
))
flexiprep
.
inputR2
.
foreach
(
inputFiles
:+=
InputFile
(
_
,
config
(
"R2_md5"
)))
add
(
flexiprep
)
inputFiles
:+=
InputFile
(
inputR1
,
config
(
"R1_md5"
))
inputR2
.
foreach
(
inputFiles
:+=
InputFile
(
_
,
config
(
"R2_md5"
)))
flexiprep
.
foreach
(
add
(
_
)
)
gearsSingle
.
foreach
{
gs
=>
gs
.
sampleId
=
Some
(
sampleId
)
gs
.
libId
=
Some
(
libId
)
gs
.
outputDir
=
libDir
gs
.
fastqR1
=
List
(
addDownsample
(
flexiprep
.
fastqR1Qc
,
gs
.
outputDir
))
gs
.
fastqR2
=
flexiprep
.
fastqR2Qc
.
map
(
addDownsample
(
_
,
gs
.
outputDir
)).
toList
gs
.
fastqR1
=
List
(
addDownsample
(
qcR1
,
gs
.
outputDir
))
gs
.
fastqR2
=
qcR2
.
map
(
addDownsample
(
_
,
gs
.
outputDir
)).
toList
add
(
gs
)
}
}
...
...
@@ -187,11 +195,11 @@ class Gears(val root: Configurable) extends QScript with MultiSampleQScript { qs
val
flexipreps
=
libraries
.
values
.
map
(
_
.
flexiprep
).
toList
val
mergeR1
:
File
=
new
File
(
sampleDir
,
s
"$sampleId.R1.fq.gz"
)
add
(
Zcat
(
qscript
,
flexipreps
.
map
(
_
.
fastqR1Qc
)
)
|
new
Gzip
(
qscript
)
>
mergeR1
)
add
(
Zcat
(
qscript
,
libraries
.
values
.
map
(
_
.
qcR1
).
toList
)
|
new
Gzip
(
qscript
)
>
mergeR1
)
val
mergeR2
=
if
(
flexipreps
.
exists
(
_
.
pair
ed
))
Some
(
new
File
(
sampleDir
,
s
"$sampleId.R2.fq.gz"
))
else
None
val
mergeR2
=
if
(
libraries
.
values
.
exists
(
_
.
inputR2
.
isDefin
ed
))
Some
(
new
File
(
sampleDir
,
s
"$sampleId.R2.fq.gz"
))
else
None
mergeR2
.
foreach
{
file
=>
add
(
Zcat
(
qscript
,
flexipreps
.
flatMap
(
_
.
fastqR2Qc
)
)
|
new
Gzip
(
qscript
)
>
file
)
add
(
Zcat
(
qscript
,
libraries
.
values
.
flatMap
(
_
.
qcR2
).
toList
)
|
new
Gzip
(
qscript
)
>
file
)
}
gearsSingle
.
fastqR1
=
List
(
addDownsample
(
mergeR1
,
gearsSingle
.
outputDir
))
...
...
generate-indexes/pom.xml
View file @
a748525c
...
...
@@ -41,7 +41,7 @@
</dependency>
<dependency>
<groupId>
nl.lumc.sasc
</groupId>
<artifactId>
BiopetExtensions
</artifactId>
<artifactId>
Biopet
Tools
Extensions
</artifactId>
<version>
${project.version}
</version>
</dependency>
<dependency>
...
...
generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/DownloadGenomes.scala
0 → 100644
View file @
a748525c
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package
nl.lumc.sasc.biopet.pipelines.generateindexes
import
java.io.File
import
java.util
import
nl.lumc.sasc.biopet.core.extensions.Md5sum
import
nl.lumc.sasc.biopet.core.
{
BiopetQScript
,
PipelineCommand
}
import
nl.lumc.sasc.biopet.extensions._
import
nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import
nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta
import
nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly
import
nl.lumc.sasc.biopet.utils.ConfigUtils
import
nl.lumc.sasc.biopet.utils.config.Configurable
import
org.broadinstitute.gatk.queue.QScript
import
scala.collection.JavaConversions._
import
scala.language.reflectiveCalls
class
DownloadGenomes
(
val
root
:
Configurable
)
extends
QScript
with
BiopetQScript
{
def
this
()
=
this
(
null
)
@Argument
(
required
=
true
)
var
referenceConfigFiles
:
List
[
File
]
=
Nil
var
referenceConfig
:
Map
[
String
,
Any
]
=
_
override
def
fixedValues
=
Map
(
"gffread"
->
Map
(
"T"
->
true
))
override
def
defaults
=
Map
(
"normalizefasta"
->
Map
(
"line_length"
->
60
))
val
downloadAnnotations
:
Boolean
=
config
(
"download_annotations"
,
default
=
false
)
/** This is executed before the script starts */
def
init
()
:
Unit
=
{
if
(
referenceConfig
==
null
)
referenceConfig
=
referenceConfigFiles
.
foldLeft
(
Map
[
String
,
Any
]())((
a
,
b
)
=>
ConfigUtils
.
mergeMaps
(
a
,
ConfigUtils
.
fileToConfigMap
(
b
)))
}
/** Method where jobs must be added */
def
biopetScript
()
:
Unit
=
{
for
((
speciesName
,
c
)
<-
referenceConfig
)
yield
speciesName
->
{
val
speciesConfig
=
ConfigUtils
.
any2map
(
c
)
val
speciesDir
=
new
File
(
outputDir
,
speciesName
)
for
((
genomeName
,
c
)
<-
speciesConfig
)
yield
genomeName
->
{
var
configDeps
:
List
[
File
]
=
Nil
val
genomeConfig
=
ConfigUtils
.
any2map
(
c
)
val
genomeDir
=
new
File
(
speciesDir
,
genomeName
)
val
fastaFile
=
new
File
(
genomeDir
,
"reference.fa"
)
val
downloadFastaFile
=
new
File
(
genomeDir
,
"download.reference.fa"
)
genomeConfig
.
get
(
"ncbi_assembly_id"
)
match
{
case
Some
(
assemblyID
:
String
)
=>
val
downloadAssembly
=
new
DownloadNcbiAssembly
(
this
)
downloadAssembly
.
assemblyId
=
assemblyID
downloadAssembly
.
output
=
downloadFastaFile
downloadAssembly
.
outputReport
=
new
File
(
genomeDir
,
s
"$speciesName-$genomeName.assembly.report"
)
downloadAssembly
.
nameHeader
=
genomeConfig
.
get
(
"ncbi_assembly_header_name"
).
map
(
_
.
toString
)
downloadAssembly
.
mustHaveOne
=
genomeConfig
.
get
(
"ncbi_assembly_must_have_one"
)
.
map
(
_
.
asInstanceOf
[
util.ArrayList
[
util.LinkedHashMap
[
String
,
String
]]])
.
getOrElse
(
new
util
.
ArrayList
()).
flatMap
(
x
=>
x
.
map
(
y
=>
y
.
_1
+
"="
+
y
.
_2
))
.
toList
downloadAssembly
.
mustNotHave
=
genomeConfig
.
get
(
"ncbi_assembly_must_not_have"
)
.
map
(
_
.
asInstanceOf
[
util.ArrayList
[
util.LinkedHashMap
[
String
,
String
]]])
.
getOrElse
(
new
util
.
ArrayList
()).
flatMap
(
x
=>
x
.
map
(
y
=>
y
.
_1
+
"="
+
y
.
_2
))
.
toList
downloadAssembly
.
isIntermediate
=
true
add
(
downloadAssembly
)
case
_
=>
val
fastaUris
=
genomeConfig
.
getOrElse
(
"fasta_uri"
,
throw
new
IllegalArgumentException
(
s
"No fasta_uri found for $speciesName - $genomeName"
))
match
{
case
a
:
Traversable
[
_
]
=>
a
.
map
(
_
.
toString
).
toArray
case
a
:
util.ArrayList
[
_
]
=>
a
.
map
(
_
.
toString
).
toArray
case
a
=>
Array
(
a
.
toString
)
}
val
fastaFiles
=
for
(
fastaUri
<-
fastaUris
)
yield
{
val
curl
=
new
Curl
(
this
)
curl
.
url
=
fastaUri
curl
.
output
=
if
(
fastaUris
.
length
>
1
||
fastaUri
.
endsWith
(
".gz"
))
{
curl
.
isIntermediate
=
true
new
File
(
genomeDir
,
new
File
(
fastaUri
).
getName
)
}
else
fastaFile
add
(
curl
)
add
(
Md5sum
(
this
,
curl
.
output
,
genomeDir
))
curl
.
output
}
val
fastaCat
=
new
FastaMerging
(
this
)
fastaCat
.
output
=
downloadFastaFile
fastaCat
.
isIntermediate
=
true
if
(
fastaUris
.
length
>
1
||
fastaFiles
.
exists
(
_
.
getName
.
endsWith
(
".gz"
)))
{
fastaFiles
.
foreach
{
file
=>
if
(
file
.
getName
.
endsWith
(
".gz"
))
{
val
zcat
=
new
Zcat
(
this
)
zcat
.
appending
=
true
zcat
.
input
:+=
file
zcat
.
output
=
downloadFastaFile
fastaCat
.
cmds
:+=
zcat
fastaCat
.
input
:+=
file
}
else
{
val
cat
=
new
Cat
(
this
)
cat
.
appending
=
true
cat
.
input
:+=
file
cat
.
output
=
downloadFastaFile
fastaCat
.
cmds
:+=
cat
fastaCat
.
input
:+=
file
}
}
add
(
fastaCat
)
configDeps
:+=
fastaCat
.
output
}
}
val
normalizeFasta
=
new
NormalizeFasta
(
this
)
normalizeFasta
.
input
=
downloadFastaFile
normalizeFasta
.
output
=
fastaFile
add
(
normalizeFasta
)
val
generateIndexes
=
new
GenerateIndexes
(
this
)
generateIndexes
.
fastaFile
=
fastaFile
generateIndexes
.
speciesName
=
speciesName
generateIndexes
.
genomeName
=
genomeName
generateIndexes
.
outputDir
=
genomeDir
//generateIndexes.fastaUris = fastaUris
//TODO: add gtf file
add
(
generateIndexes
)
if
(
downloadAnnotations
)
{
val
annotationDir
=
new
File
(
genomeDir
,
"annotation"
)
def
getAnnotation
(
tag
:
String
)
:
Map
[
String
,
Map
[
String
,
Any
]]
=
(
genomeConfig
.
get
(
tag
)
match
{
case
Some
(
s
:
Map
[
_
,
_
])
=>
s
.
map
(
x
=>
x
.
_2
match
{
case
o
:
Map
[
_
,
_
]
=>
x
.
_1
.
toString
->
o
.
map
(
x
=>
(
x
.
_1
.
toString
,
x
.
_2
))
case
_
=>
throw
new
IllegalStateException
(
s
"values in the tag $tag should be json objects"
)
})
case
None
=>
Map
()
case
x
=>
throw
new
IllegalStateException
(
s
"tag $tag should be an object with objects, now $x"
)
})
// Download vep caches
getAnnotation
(
"vep"
).
foreach
{
case
(
version
,
vep
)
=>
val
vepDir
=
new
File
(
annotationDir
,
"vep"
+
File
.
separator
+
version
)
val
curl
=
new
Curl
(
this
)
curl
.
url
=
vep
(
"cache_uri"
).
toString
curl
.
output
=
new
File
(
vepDir
,
new
File
(
curl
.
url
).
getName
)
curl
.
isIntermediate
=
true
add
(
curl
)
val
tar
=
new
TarExtract
(
this
)
tar
.
inputTar
=
curl
.
output
tar
.
outputDir
=
vepDir
add
(
tar
)
}
getAnnotation
(
"dbsnp"
).
foreach
{
case
(
version
,
dbsnp
)
=>
val
dbpsnpDir
=
new
File
(
annotationDir
,
"dbsnp"
)
val
contigMap
=
dbsnp
.
get
(
"dbsnp_contig_map"
).
map
(
_
.
asInstanceOf
[
Map
[
String
,
Any
]])
val
contigSed
=
contigMap
.
map
{
map
=>
val
sed
=
new
Sed
(
this
)
sed
.
expressions
=
map
.
map
(
x
=>
s
"""s/^${x._1}\t/${x._2}\t/"""
).
toList
sed
}