Skip to content
Snippets Groups Projects
Commit 618df913 authored by Ruben Vorderman's avatar Ruben Vorderman
Browse files

initial commit

parents
No related branches found
No related tags found
No related merge requests found
# Copyright Sequencing Analysis Support Core - Leiden University Medical Center 2017
# Bioconda installs
# Create a conda environment at a fixed prefix and install packages into it.
#
# Inputs:
#   requirements - package specifications, passed space-separated to conda
#   prefix       - path handed to `conda create --prefix`
#   condaPath    - optional conda executable; "conda" is used when unset
# Outputs:
#   condaEnvPath - the prefix path, exposed as a File
task installPrefix {
    String prefix
    Array[String] requirements
    String? condaPath

    command {
        ${default="conda" condaPath} create --json -q --yes \
            --override-channels \
            --channel bioconda --channel conda-forge \
            --channel defaults --channel r \
            --prefix ${prefix} \
            ${sep=' ' requirements}
    }

    output {
        File condaEnvPath = prefix
    }
}
# Copyright Sequencing Analysis Support Core - Leiden University Medical Center 2017
#
# Tasks from centrifuge
# Run centrifuge-download for a sequence database.
#
# Inputs:
#   libraryPath           - output directory, passed with -o
#   domain                - optional list, passed comma-separated with -d
#   executable            - command to run (default "centrifuge-download")
#   condaEnvironment      - optional environment to `source activate` first
#   seqTaxMapPath         - optional file that the command's stdout is
#                           appended to
#   database              - positional database argument (default "refseq")
#   assemblyLevel         - optional value for -a
#   refseqCategory        - optional value for -c
#   taxIds                - optional list, passed comma-separated with -t
#   filterUnplaced        - adds -u when true
#   maskLowComplexRegions - adds -m when true
#   downloadRnaSeqs       - adds -r when true
#   modifyHeader          - adds -l when true
#   downloadGiMap         - adds -g when true
# Outputs:
#   seqTaxMap  - the file at seqTaxMapPath
#   library    - libraryPath
#   fastaFiles - every file matching libraryPath/*/*.fna
#
# NOTE(review): the seqTaxMap output is built from seqTaxMapPath although
# that input is optional; presumably callers always set it - confirm.
task download {
    String libraryPath
    Array[String]? domain
    String? executable = "centrifuge-download"
    File? condaEnvironment
    String? seqTaxMapPath
    String? database = "refseq"
    String? assemblyLevel
    String? refseqCategory
    Array[String]? taxIds
    Boolean? filterUnplaced = false
    Boolean? maskLowComplexRegions = false
    Boolean? downloadRnaSeqs = false
    Boolean? modifyHeader = false
    Boolean? downloadGiMap = false

    # This will use centrifuge-download to download.
    # The bash statement at the beginning makes sure the parent directory of
    # seqTaxMapPath exists: mkdir -p creates the full path and rm -d removes
    # the leaf directory again, leaving only its parents behind.
    #
    # The -t flag and its value are emitted together, so nothing at all is
    # passed when taxIds is not set. A bare quoted empty string would reach
    # the executable as an empty positional argument.
    command {
        ${'if [ ! -f ' + seqTaxMapPath +
        ' ]; then mkdir -p ' + seqTaxMapPath +
        '; rm -d ' + seqTaxMapPath +
        '; fi' }
        ${"source activate " + condaEnvironment}
        ${executable} \
        -o ${libraryPath} \
        ${true='-d ' false='' defined(domain)}${sep=',' domain} \
        ${'-a "' + assemblyLevel + '"'} \
        ${"-c " + refseqCategory} \
        ${true='-t ' false='' defined(taxIds)}${sep=',' taxIds} \
        ${true='-r' false='' downloadRnaSeqs} \
        ${true='-u' false='' filterUnplaced} \
        ${true='-m' false='' maskLowComplexRegions} \
        ${true='-l' false='' modifyHeader} \
        ${true='-g' false='' downloadGiMap} \
        ${database} ${">> " + seqTaxMapPath}
    }

    output {
        File seqTaxMap = "${seqTaxMapPath}"
        File library = libraryPath
        Array[File] fastaFiles = glob(libraryPath + "/*/*.fna")
    }
}
# Run centrifuge-download with the "taxonomy" argument.
#
# Inputs:
#   centrifugeTaxonomyDir - output directory, passed with -o
#   executable            - command to run (default "centrifuge-download")
#   condaEnvironment      - optional environment to `source activate` first
# Outputs:
#   taxonomyTree - centrifugeTaxonomyDir/nodes.dmp
#   nameTable    - centrifugeTaxonomyDir/names.dmp
task downloadTaxonomy {
    String centrifugeTaxonomyDir
    File? condaEnvironment
    String? executable = "centrifuge-download"

    command {
        ${"source activate " + condaEnvironment}
        ${executable} -o ${centrifugeTaxonomyDir} taxonomy
    }

    output {
        File nameTable = centrifugeTaxonomyDir + "/names.dmp"
        File taxonomyTree = centrifugeTaxonomyDir + "/nodes.dmp"
    }
}
# Run centrifuge-build on one FASTA file.
#
# Required inputs:
#   conversionTable    - passed with --conversion-table
#   taxonomyTree       - passed with --taxonomy-tree
#   inputFasta         - positional reference input
#   centrifugeIndexBase - positional index base name
# Optional inputs:
#   condaEnvironment          - environment to `source activate` first
#   centrifugeBuildExecutable - command to run (default "centrifuge-build")
#   largeIndex / noAuto / noDiffCover / noRef / justRef
#                             - add --large-index / --noauto / --nodc /
#                               --noref / --justref when true
#   bMax, bMaxDivn, offRate, fTabChars, seed, kmerCount
#                             - values for --bmax, --bmaxdivn, --offrate,
#                               --ftabchars, --seed, --kmer-count
#   nameTable, sizeTable      - values for --name-table, --size-table
#   threads                   - value for --threads (default 1); also used
#                               as the runtime cpu request
#
# This task declares no outputs.
task build {
    File conversionTable
    File taxonomyTree
    File inputFasta
    String centrifugeIndexBase
    File? condaEnvironment
    String? centrifugeBuildExecutable = "centrifuge-build"
    #Boolean? c = false
    Boolean? largeIndex = false
    Boolean? noAuto = false
    Int? bMax
    Int? bMaxDivn
    Boolean? noDiffCover = false
    Boolean? noRef = false
    Boolean? justRef = false
    Int? offRate
    Int? fTabChars
    File? nameTable
    File? sizeTable
    Int? seed
    Int? threads = 1
    Int? kmerCount

    # mkdir -p followed by rm -d creates any missing parent directories of
    # centrifugeIndexBase and then removes the leaf directory again.
    # Every optional "flag + value" expression carries a trailing space
    # inside the flag literal so that flag and value stay separate words.
    command {
        mkdir -p ${centrifugeIndexBase}
        rm -d ${centrifugeIndexBase}
        ${"source activate " + condaEnvironment}
        ${centrifugeBuildExecutable} \
        ${true='--large-index' false='' largeIndex} \
        ${true='--noauto' false='' noAuto} \
        ${'--bmax ' + bMax} \
        ${'--bmaxdivn ' + bMaxDivn} \
        ${true='--nodc' false='' noDiffCover} \
        ${true='--noref' false='' noRef} \
        ${true='--justref' false='' justRef} \
        ${'--offrate ' + offRate} \
        ${'--ftabchars ' + fTabChars} \
        ${'--name-table ' + nameTable } \
        ${'--size-table ' + sizeTable} \
        ${'--seed ' + seed} \
        ${'--kmer-count ' + kmerCount} \
        ${'--threads ' + threads} \
        --conversion-table ${conversionTable} \
        --taxonomy-tree ${taxonomyTree} \
        ${inputFasta} \
        ${centrifugeIndexBase}
    }

    runtime {
        cpu: select_first([threads])
    }
}
# Compute an MD5 digest for an Object.
#
# the_object is serialized to a file with write_object(); that file is piped
# through md5sum and sed removes the " -" that follows the digest.
# Output md5sum is the command's stdout read with read_string().
task objectMd5 {
    Object the_object

    command {
        cat ${write_object(the_object)} |
            md5sum - |
            sed -e 's/ -//'
    }

    output {
        String md5sum = read_string(stdout())
    }
}
# Compute an MD5 digest for a Map[String,String].
#
# The map is serialized to a file with write_map(); that file is piped
# through md5sum and sed removes the " -" that follows the digest.
# Output md5sum is the command's stdout read with read_string().
task mapMd5 {
    Map[String,String] map

    command {
        cat ${write_map(map)} |
            md5sum - |
            sed -e 's/ -//'
    }

    output {
        String md5sum = read_string(stdout())
    }
}
# Compute an MD5 digest for an array of strings.
#
# The elements are joined with "," and echoed into md5sum; sed removes the
# " -" that follows the digest. Output md5sum is the command's stdout read
# with read_string().
task stringArrayMd5 {
    Array[String] stringArray

    # The joined string is double-quoted so the shell neither splits it on
    # whitespace nor expands glob characters in it; otherwise the digest
    # could vary with the contents of the working directory.
    command {
        set -eu -o pipefail
        echo "${sep=',' stringArray}" | md5sum - | sed -e 's/ -//'
    }

    output {
        String md5sum = read_string(stdout())
    }
}
# Concatenate a list of files into a single file.
#
# Inputs:
#   fileList         - files to concatenate, in the given order
#   combinedFilePath - destination path of the combined file
#   unzip            - when true the files are read with zcat instead of cat
#                      (default false)
# Outputs:
#   combinedFile - the file at combinedFilePath
task concatenateTextFiles {
    Array[File] fileList
    String combinedFilePath
    Boolean? unzip = false

    # mkdir -p followed by rm -d creates any missing parent directories of
    # combinedFilePath and then removes the leaf directory again.
    command {
        mkdir -p ${combinedFilePath}
        rm -d ${combinedFilePath}
        ${true='zcat' false='cat' unzip} ${sep=' ' fileList} > ${combinedFilePath}
    }

    output {
        File combinedFile = combinedFilePath
    }
}
# Flatten an array of string arrays into a single array of strings.
#
# The nested array is rendered into the command joined with ", ", the shell
# splits the echoed text into words, and tr deletes the characters
# " [ ] and , from each word before printing it on its own line.
# flattenedArray holds the lines of stdout.
#
# NOTE(review): the result relies on how the engine renders the inner
# arrays as text - confirm against the engine in use. Because the loop
# iterates over shell words, an element containing whitespace ends up as
# several entries, and any of the deleted characters inside an element
# are lost.
# inspired by https://gatkforums.broadinstitute.org/wdl/discussion/9616/is-there-a-way-to-flatten-arrays
task flattenStringArray {
# Nested array to flatten.
Array[Array[String]] arrayList
command {
for line in $(echo ${sep=', ' arrayList}) ; \
do echo $line | tr -d '"[],' ; done
}
output {
# One entry per line printed by the command.
Array[String] flattenedArray = read_lines(stdout())
}
}
# Append one string to an array of strings.
#
# The command echoes the array elements joined with the '\n' separator,
# then a line break, then the extra string; out_array holds the lines of
# stdout.
#
# NOTE(review): the echoed text sits inside double quotes in the shell, so
# values containing double quotes, backticks or dollar signs are presumably
# not passed through unchanged - confirm. With an empty array the text
# starts with the line break, which presumably yields an empty first
# entry - confirm.
task appendToStringArray {
# Existing elements, kept in order.
Array[String] array
# Value added after the existing elements.
String string
command {
echo "${sep='\n' array}
${string}"
}
output {
# One entry per line printed by the command.
Array[String] out_array = read_lines(stdout())
}
}
\ No newline at end of file
ncbi.wdl 0 → 100644
# Run ncbi-genome-download and verify the MD5 checksums of the downloaded
# files.
#
# Inputs:
#   outputPath       - value for --output-folder; also the root of the
#                      checksum verification and of every output glob
#   section          - value for --section (default "refseq")
#   format           - value for --format (default "all")
#   assemblyLevel    - value for --assembly-level (default "all")
#   taxId            - optional value for --taxid
#   refseqCategory   - optional value for --refseq-category
#   humanReadable    - adds --human-readable when true
#   ncbiBaseUri      - optional value for --uri
#   parallel         - optional value for --parallel
#   retries          - optional value for --retries
#   verbose          - adds --verbose when true (default true)
#   debug            - adds --debug when true
#   domain           - positional argument (default "all")
#   executable       - command to run (default "ncbi-genome-download")
#   condaEnvironment - optional environment to `source activate` first
# Outputs:
#   One Array[File] per file-name suffix, each globbed four directory
#   levels below outputPath.
task genomeDownload {
    String outputPath
    String? section = "refseq"
    String? format = "all"
    String? assemblyLevel = "all"
    String? taxId
    String? refseqCategory
    Boolean? humanReadable
    String? ncbiBaseUri
    Int? parallel
    Int? retries
    Boolean? verbose=true
    Boolean? debug
    String? domain = "all"
    String? executable = "ncbi-genome-download"
    File? condaEnvironment

    command {
        set -e -o pipefail
        ${"source activate " + condaEnvironment}
        ${executable} \
        ${"--section " + section} \
        ${"--format " + format} \
        ${"--assembly-level " + assemblyLevel } \
        ${"--taxid " + taxId } \
        ${"--refseq-category " + refseqCategory} \
        ${"--output-folder " + outputPath } \
        ${true="--human-readable" false="" humanReadable} \
        ${"--uri " + ncbiBaseUri } \
        ${"--parallel " + parallel } \
        ${"--retries " + retries } \
        ${true="--verbose" false="" verbose } \
        ${true="--debug" false="" debug } \
        ${domain}

        # Check md5sums for all downloaded files.
        # Each folder is handled in a subshell so the cd does not leak into
        # the next iteration.
        for folder in "$(realpath ${outputPath})"/*/*/*
        do
            (
            cd "$folder"
            # Collect the MD5SUMS entries of the files that are present.
            # File names are matched literally, not as regular expressions.
            md5sums="$(
                for file in *
                do
                    if [[ ! $file == "MD5SUMS" ]]
                    then
                        grep -F -- "$file" MD5SUMS
                    fi
                done
            )"
            # Quoted so that every checksum entry stays on its own line,
            # which is what md5sum -c expects.
            echo "$md5sums" | md5sum -c
            )
        done
    }

    output {
        Array[File] fastaGzFiles = glob(outputPath + "/*/*/*/*_genomic.fna.gz")
        Array[File] genbankGzFiles = glob(outputPath + "/*/*/*/*_genomic.gbff.gz")
        Array[File] featuresGzFiles = glob(outputPath + "/*/*/*/*_feature_table.txt.gz")
        Array[File] gffGzFiles = glob(outputPath + "/*/*/*/*_genomic.gff.gz")
        Array[File] proteinFastaGzFiles = glob(outputPath + "/*/*/*/*_protein.faa.gz")
        Array[File] genpeptGzFiles = glob(outputPath + "/*/*/*/*_protein.gpff.gz")
        Array[File] wgsGzFiles = glob(outputPath + "/*/*/*/*_wgsmaster.gbff.gz")
        Array[File] cdsFastaGzFiles = glob(outputPath + "/*/*/*/*_cds_from_genomic.fna.gz")
        Array[File] rnaFastaGzFiles = glob(outputPath + "/*/*/*/*_rna_from_genomic.fna.gz")
        Array[File] assemblyReportFiles = glob(outputPath + "/*/*/*/*_assembly_report.txt")
        Array[File] assemblyStatsFiles = glob(outputPath + "/*/*/*/*_assembly_stats.txt")
    }
}
# Fetch the NCBI nt FASTA archive with rsync, verify it and optionally
# decompress it.
#
# Inputs:
#   libraryPath   - base directory; files are placed in libraryPath/nt
#   seqTaxMapPath - declared input that the command and outputs shown here
#                   do not reference
#   unzip         - when true nt.gz is decompressed to nt.fna (default true)
# Outputs:
#   ntFileGz   - libraryPath/nt/nt.gz
#   library    - libraryPath
#   ntFastas   - every *.fna file in libraryPath/nt
#   ntFastasGz - every nt*.gz file in libraryPath/nt
task downloadNtFasta {
    String libraryPath
    String seqTaxMapPath
    Boolean? unzip = true

    # Derived locations.
    String ntDir = libraryPath + "/nt"
    String ntFilePath = ntDir + "/nt.fna"

    command {
        set -e -o pipefail
        mkdir -p ${ntDir}
        rsync -av --partial rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz* ${ntDir}
        (cd ${ntDir} && md5sum -c nt.gz.md5)
        # Only unzip when necessary
        if ${true='true' false='false' unzip}; then
            zcat ${ntDir}/nt.gz > ${ntFilePath}
        fi
    }

    output {
        File library = libraryPath
        File ntFileGz = ntDir + "/nt.gz"
        # The arrays allow for multiple downloads later and keep the
        # pipeline logic simple.
        Array[File] ntFastas = glob(ntDir + "/*.fna")
        Array[File] ntFastasGz = glob(ntDir + "/nt*.gz")
    }
}
# Fetch the NCBI nucleotide accession2taxid tables with rsync, verify them
# and derive a two-column map from each one.
#
# Inputs:
#   downloadDir - directory receiving the downloads and the derived files
#   gzip        - when true the derived files are gzip-compressed and get a
#                 ".gz" suffix (default false)
# Outputs:
#   seqTaxMaps   - every *.seqtaxmap file in downloadDir
#   seqTaxMapsGz - every *.seqtaxmap.gz file in downloadDir
task downloadAccessionToTaxId {
    String downloadDir
    Boolean gzip = false

    command {
        set -e -o pipefail
        mkdir -p ${downloadDir}
        rsync -av --partial rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* ${downloadDir}
        (cd ${downloadDir} && md5sum -c *.md5)
        # For each table: drop the first line and keep columns 2 and 3.
        for file in ${downloadDir}/nucl_*.accession2taxid.gz; do
            zcat $file |
                tail -n +2 |
                cut -f 2,3 ${true="| gzip " false='' gzip}> $file.seqtaxmap${true='.gz' false='' gzip}
        done
    }

    output {
        Array[File] seqTaxMaps = glob(downloadDir + "/*.seqtaxmap")
        Array[File] seqTaxMapsGz = glob(downloadDir + "/*.seqtaxmap.gz")
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment