initial commit

618df913 · Ruben Vorderman · 618df913 · 618df913 · 618df913 · 618df913
Commit 618df913 authored 7 years ago by Ruben Vorderman
--- a/bioconda.wdl
+++ b/bioconda.wdl
+# Copyright Sequencing Analysis Support Core - Leiden University Medical Center 2017
+# Bioconda installs
+# Create a conda environment under a caller-chosen prefix.
+#
+# Inputs:
+#   requirements  package specs, handed space-separated to `conda create`
+#   prefix        location passed to --prefix
+#   condaPath     optional conda executable; "conda" is used when unset
+#
+# Output:
+#   condaEnvPath  the prefix, exposed as a File
+task installPrefix {
+    String? condaPath
+    String prefix
+    Array[String] requirements
+
+    command {
+        ${default="conda" condaPath} create \
+            --json -q --yes \
+            --override-channels \
+            --channel bioconda \
+            --channel conda-forge \
+            --channel defaults \
+            --channel r \
+            --prefix ${prefix} \
+            ${sep=' ' requirements}
+    }
+
+    output {
+        File condaEnvPath = prefix
+    }
+}
--- a/centrifuge.wdl
+++ b/centrifuge.wdl
+# Copyright Sequencing Analysis Support Core - Leiden University Medical Center 2017
+#
+# Tasks from centrifuge
+# Run centrifuge-download for one database and collect what it wrote.
+#
+# Inputs (an optional value only contributes its flag when it is set):
+#   libraryPath            passed to -o
+#   domain                 joined with ',' and passed to -d
+#   executable             command to run (default "centrifuge-download")
+#   condaEnvironment       when set, `source activate` is run on it first
+#   seqTaxMapPath          file that the command's stdout is appended to
+#   database               positional database argument (default "refseq")
+#   assemblyLevel          passed, double-quoted, to -a
+#   refseqCategory         passed to -c
+#   taxIds                 joined with ',' and passed to -t
+#   downloadRnaSeqs        adds -r
+#   filterUnplaced         adds -u
+#   maskLowComplexRegions  adds -m
+#   modifyHeader           adds -l
+#   downloadGiMap          adds -g
+#
+# Outputs:
+#   seqTaxMap   the file at seqTaxMapPath
+#   library     libraryPath
+#   fastaFiles  every file matching libraryPath/*/*.fna
+task download {
+    String libraryPath
+    Array[String]? domain
+    String? executable = "centrifuge-download"
+    File? condaEnvironment
+    String? seqTaxMapPath
+    String? database = "refseq"
+    String? assemblyLevel
+    String? refseqCategory
+    Array[String]? taxIds
+    Boolean? filterUnplaced = false
+    Boolean? maskLowComplexRegions = false
+    Boolean? downloadRnaSeqs = false
+    Boolean? modifyHeader = false
+    Boolean? downloadGiMap = false
+    # This will use centrifuge-download to download.
+    # The bash statement at the beginning makes sure the parent directory
+    # of seqTaxMapPath exists: `mkdir -p` creates the full path (parents
+    # included) as directories and `rm -d` then removes the empty leaf
+    # again, so the redirect at the end can create it as a regular file.
+    # The statement is skipped when seqTaxMapPath is already a file.
+    #
+    # -d and -t are each emitted together with their value, so leaving
+    # domain or taxIds unset adds nothing to the command line (no stray
+    # empty argument in front of the remaining flags).
+    command {
+        ${'if [ ! -f ' + seqTaxMapPath +
+        ' ]; then mkdir -p ' + seqTaxMapPath +
+        '; rm -d ' + seqTaxMapPath +
+        '; fi' }
+        ${"source activate " + condaEnvironment}
+        ${executable} \
+        -o ${libraryPath} \
+        ${true='-d ' false='' defined(domain)}${sep=','  domain} \
+        ${'-a "' + assemblyLevel + '"'} \
+        ${"-c " + refseqCategory} \
+        ${true='-t ' false='' defined(taxIds)}${sep=',' taxIds} \
+        ${true='-r' false='' downloadRnaSeqs} \
+        ${true='-u' false='' filterUnplaced} \
+        ${true='-m' false='' maskLowComplexRegions} \
+        ${true='-l' false='' modifyHeader} \
+        ${true='-g' false='' downloadGiMap} \
+        ${database} ${">> " + seqTaxMapPath}
+    }
+    output {
+        File seqTaxMap = "${seqTaxMapPath}"
+        File library = libraryPath
+        Array[File] fastaFiles = glob(libraryPath + "/*/*.fna")
+    }
+}
+# Run `<executable> -o <centrifugeTaxonomyDir> taxonomy` and expose the two
+# dump files expected in that directory afterwards.
+#
+# Inputs:
+#   centrifugeTaxonomyDir  directory passed to -o
+#   executable             command to run (default "centrifuge-download")
+#   condaEnvironment       when set, `source activate` is run on it first
+#
+# Outputs:
+#   taxonomyTree  centrifugeTaxonomyDir/nodes.dmp
+#   nameTable     centrifugeTaxonomyDir/names.dmp
+task downloadTaxonomy {
+    File? condaEnvironment
+    String? executable = "centrifuge-download"
+    String centrifugeTaxonomyDir
+
+    command {
+        ${"source activate " + condaEnvironment}
+        ${executable} -o ${centrifugeTaxonomyDir} taxonomy
+    }
+
+    output {
+        File nameTable = centrifugeTaxonomyDir + "/names.dmp"
+        File taxonomyTree = centrifugeTaxonomyDir + "/nodes.dmp"
+    }
+}
+# Build a centrifuge index with centrifuge-build.
+#
+# Required inputs:
+#   conversionTable      passed to --conversion-table
+#   taxonomyTree         passed to --taxonomy-tree
+#   inputFasta           positional reference input
+#   centrifugeIndexBase  positional index base name (last argument)
+#
+# Optional inputs (a flag is only emitted when its value is set / true):
+#   condaEnvironment           when set, `source activate` is run on it first
+#   centrifugeBuildExecutable  command to run (default "centrifuge-build")
+#   largeIndex    --large-index        noAuto       --noauto
+#   bMax          --bmax <int>         bMaxDivn     --bmaxdivn <int>
+#   noDiffCover   --nodc               noRef        --noref
+#   justRef       --justref            offRate      --offrate <int>
+#   fTabChars     --ftabchars <int>    nameTable    --name-table <file>
+#   sizeTable     --size-table <file>  seed         --seed <int>
+#   kmerCount     --kmer-count <int>   threads      --threads <int> (default 1)
+#
+# The task declares no outputs; `threads` is also requested as the cpu count.
+task build {
+    File conversionTable
+    File taxonomyTree
+    File inputFasta
+    String centrifugeIndexBase
+    File? condaEnvironment
+    String? centrifugeBuildExecutable = "centrifuge-build"
+    #Boolean? c = false
+    Boolean? largeIndex = false
+    Boolean? noAuto = false
+    Int? bMax
+    Int? bMaxDivn
+    Boolean? noDiffCover = false
+    Boolean? noRef = false
+    Boolean? justRef = false
+    Int? offRate
+    Int? fTabChars
+    File? nameTable
+    File? sizeTable
+    Int? seed
+    Int? threads = 1
+    Int? kmerCount
+    # `mkdir -p` followed by `rm -d` creates every parent directory of the
+    # index base and then removes the empty leaf directory again, leaving
+    # the base name itself free for the index files.
+    # Each option string ends in a space so flag and value stay separate
+    # arguments (this includes --kmer-count).
+    command {
+        mkdir -p  ${centrifugeIndexBase}
+        rm -d ${centrifugeIndexBase}
+        ${"source activate " + condaEnvironment}
+        ${centrifugeBuildExecutable} \
+        ${true='--large-index' false='' largeIndex} \
+        ${true='--noauto' false='' noAuto} \
+        ${'--bmax ' + bMax} \
+        ${'--bmaxdivn ' + bMaxDivn} \
+        ${true='--nodc' false='' noDiffCover} \
+        ${true='--noref' false='' noRef} \
+        ${true='--justref' false='' justRef} \
+        ${'--offrate ' + offRate} \
+        ${'--ftabchars ' + fTabChars} \
+        ${'--name-table ' + nameTable } \
+        ${'--size-table ' + sizeTable} \
+        ${'--seed ' + seed} \
+        ${'--kmer-count ' + kmerCount} \
+        ${'--threads ' + threads} \
+        --conversion-table ${conversionTable} \
+        --taxonomy-tree ${taxonomyTree} \
+        ${inputFasta} \
+        ${centrifugeIndexBase}
+    }
+    runtime {
+        cpu: select_first([threads])
+    }
+}
--- a/common.wdl
+++ b/common.wdl
+# MD5 checksum of an Object's serialized form.
+# the_object is written to a file with write_object(); that file is hashed
+# from stdin and md5sum's trailing "  -" marker is stripped, so the output
+# `md5sum` holds only the hex digest.
+task objectMd5 {
+    Object the_object
+
+    command {
+        md5sum - < ${write_object(the_object)} | sed -e 's/  -//'
+    }
+
+    output {
+        String md5sum = read_string(stdout())
+    }
+}
+# MD5 checksum of a String-to-String map's serialized form.
+# `map` is written to a file with write_map(); that file is hashed from
+# stdin and md5sum's trailing "  -" marker is stripped, so the output
+# `md5sum` holds only the hex digest.
+task mapMd5 {
+    Map[String,String] map
+
+    command {
+        md5sum - < ${write_map(map)} | sed -e 's/  -//'
+    }
+
+    output {
+        String md5sum = read_string(stdout())
+    }
+}
+# MD5 checksum of a string array.
+# The elements are joined with ',' and printed with `echo`, so the hashed
+# data includes echo's trailing newline. The joined text is not quoted and
+# is therefore processed by the shell before echo prints it.
+# md5sum's trailing "  -" marker is stripped from the result.
+task stringArrayMd5 {
+    Array[String] stringArray
+
+    command {
+        set -eu -o pipefail
+        echo ${sep=',' stringArray} \
+            | md5sum - \
+            | sed -e 's/  -//'
+    }
+
+    output {
+        String md5sum = read_string(stdout())
+    }
+}
+# Concatenate a list of files into a single file.
+#
+# Inputs:
+#   fileList          files to concatenate, in the given order
+#   combinedFilePath  where the combined file is written
+#   unzip             use zcat instead of cat (default false)
+#
+# Output:
+#   combinedFile  the file at combinedFilePath
+task concatenateTextFiles {
+    Array[File] fileList
+    String combinedFilePath
+    Boolean? unzip = false
+
+    # mkdir -p creates the whole path, parents included, as directories;
+    # rm -d then removes the empty leaf again so that the redirect can
+    # create it as a regular file.
+    command {
+        mkdir -p ${combinedFilePath}
+        rm -d ${combinedFilePath}
+        ${true='zcat' false='cat' unzip} ${sep=' ' fileList} > ${combinedFilePath}
+    }
+
+    output {
+        File combinedFile = combinedFilePath
+    }
+}
+# Flatten an array of string arrays into a single array of strings.
+# inspired by https://gatkforums.broadinstitute.org/wdl/discussion/9616/is-there-a-way-to-flatten-arrays
+#
+# The nested array is interpolated into an `echo`, the result is split into
+# shell words, and the characters " [ ] , are deleted from each word before
+# it is printed on its own line. stdout is then read back line by line.
+# NOTE(review): this relies on how the engine renders nested arrays as text
+# and on shell word splitting - confirm against the execution engine in use.
+task flattenStringArray {
+    Array[Array[String]] arrayList
+
+    command {
+    for token in $(echo ${sep=', ' arrayList})
+    do
+        echo $token | tr -d '"[],'
+    done
+    }
+
+    output {
+        Array[String] flattenedArray = read_lines(stdout())
+    }
+}
+# Append one string to an array of strings.
+#
+# Inputs:
+#   array   existing elements
+#   string  element to add after them
+#
+# Output:
+#   out_array  the lines of stdout, as read by read_lines()
+task appendToStringArray {
+    Array[String] array
+    String string
+    # A single double-quoted echo argument spans two lines: the elements of
+    # `array` joined with the '\n' separator, a line break, then `string`.
+    # NOTE(review): the result depends on how the engine renders the '\n'
+    # separator and on how it treats the indentation of the second line
+    # inside the quotes - confirm before changing the layout of this block.
+    command {
+        echo "${sep='\n' array}
+        ${string}"
+    }
+    output {
+        Array[String] out_array = read_lines(stdout())
+    }
+}
\ No newline at end of file
--- a/ncbi.wdl
+++ b/ncbi.wdl
+# Download genomes with ncbi-genome-download and verify their checksums.
+#
+# Inputs (an optional value only contributes its flag when it is set):
+#   outputPath        passed to --output-folder; also the root for all outputs
+#   section           --section (default "refseq")
+#   format            --format (default "all")
+#   assemblyLevel     --assembly-level (default "all")
+#   taxId             --taxid
+#   refseqCategory    --refseq-category
+#   humanReadable     adds --human-readable
+#   ncbiBaseUri       --uri
+#   parallel          --parallel
+#   retries           --retries
+#   verbose           adds --verbose (default true)
+#   debug             adds --debug
+#   domain            positional argument (default "all")
+#   executable        command to run (default "ncbi-genome-download")
+#   condaEnvironment  when set, `source activate` is run on it first
+#
+# Outputs: one array per file type, each globbed four levels below
+# outputPath by file-name suffix.
+task genomeDownload {
+    String outputPath
+    String? section = "refseq"
+    String? format = "all"
+    String? assemblyLevel = "all"
+    String? taxId
+    String? refseqCategory
+    Boolean? humanReadable
+    String? ncbiBaseUri
+    Int? parallel
+    Int? retries
+    Boolean? verbose=true
+    Boolean? debug
+    String? domain = "all"
+    String? executable = "ncbi-genome-download"
+    File? condaEnvironment
+    command {
+        set -e -o pipefail
+        ${"source activate " + condaEnvironment}
+        ${executable} \
+        ${"--section " + section} \
+        ${"--format " + format} \
+        ${"--assembly-level " + assemblyLevel } \
+        ${"--taxid " + taxId } \
+        ${"--refseq-category " + refseqCategory} \
+        ${"--output-folder " + outputPath } \
+        ${true="--human-readable" false="" humanReadable} \
+        ${"--uri " + ncbiBaseUri } \
+        ${"--parallel " + parallel } \
+        ${"--retries " + retries } \
+        ${true="--verbose" false="" verbose } \
+        ${true="--debug" false="" debug } \
+        ${domain}
+        # Check md5sums for all downloaded files.
+        # For every folder three levels below the output path, collect the
+        # MD5SUMS entries of the files that are actually present and feed
+        # them to md5sum -c. The collected entries are echoed in double
+        # quotes so each one stays on its own line, which is the layout
+        # md5sum -c reads.
+        for folder in $(realpath ${outputPath})/*/*/*
+            do
+                (
+                md5sums="$(
+                    cd "$folder"
+                    for file in *
+                    do
+                        if [[ ! $file == "MD5SUMS" ]]
+                        then
+                            grep "$file" MD5SUMS
+                        fi
+                    done
+                    )"
+                cd "$folder"; echo "$md5sums" | md5sum -c)
+            done
+    }
+    output {
+        Array[File] fastaGzFiles = glob(outputPath + "/*/*/*/*_genomic.fna.gz")
+        Array[File] genbankGzFiles = glob(outputPath + "/*/*/*/*_genomic.gbff.gz")
+        Array[File] featuresGzFiles = glob(outputPath + "/*/*/*/*_feature_table.txt.gz")
+        Array[File] gffGzFiles = glob(outputPath + "/*/*/*/*_genomic.gff.gz")
+        Array[File] proteinFastaGzFiles = glob(outputPath + "/*/*/*/*_protein.faa.gz")
+        Array[File] genpeptGzFiles = glob(outputPath + "/*/*/*/*_protein.gpff.gz")
+        Array[File] wgsGzFiles = glob(outputPath + "/*/*/*/*_wgsmaster.gbff.gz")
+        Array[File] cdsFastaGzFiles = glob(outputPath + "/*/*/*/*_cds_from_genomic.fna.gz")
+        Array[File] rnaFastaGzFiles = glob(outputPath + "/*/*/*/*_rna_from_genomic.fna.gz")
+        Array[File] assemblyReportFiles = glob(outputPath + "/*/*/*/*_assembly_report.txt")
+        Array[File] assemblyStatsFiles = glob(outputPath + "/*/*/*/*_assembly_stats.txt")
+    }
+}
+# Mirror the nt FASTA archive with rsync, verify it, and optionally unpack it.
+#
+# Inputs:
+#   libraryPath    base directory; files are stored in libraryPath/nt
+#   seqTaxMapPath  declared but not referenced anywhere in this task
+#   unzip          also write the decompressed nt.fna (default true)
+#
+# Outputs:
+#   ntFileGz    libraryPath/nt/nt.gz
+#   library     libraryPath
+#   ntFastas    every *.fna in libraryPath/nt
+#   ntFastasGz  every nt*.gz in libraryPath/nt
+task downloadNtFasta {
+    String libraryPath
+    String seqTaxMapPath
+    Boolean? unzip = true
+    String ntDir = libraryPath + "/nt"
+    String ntFilePath = ntDir + "/nt.fna"
+
+    command {
+        set -e -o pipefail
+        mkdir -p ${ntDir}
+        rsync -av --partial rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz* ${ntDir}
+        (cd ${ntDir} && md5sum -c nt.gz.md5)
+        # Only unzip when necessary
+        if ${true='true' false='false' unzip}; then
+            zcat ${ntDir}/nt.gz > ${ntFilePath}
+        fi
+    }
+
+    output {
+        File library = libraryPath
+        File ntFileGz = ntDir + "/nt.gz"
+        # Arrays allow for multiple downloads later and keep the
+        # pipeline logic simple.
+        Array[File] ntFastasGz = glob(ntDir + "/nt*.gz")
+        Array[File] ntFastas = glob(ntDir + "/*.fna")
+    }
+}
+# Mirror the nucl_*.accession2taxid.gz files with rsync, verify them against
+# the downloaded *.md5 files, and derive a two-column table from each.
+#
+# For every archive the first line is dropped (tail -n +2) and columns 2 and
+# 3 are kept (cut -f 2,3). The result is written next to the archive as
+# <archive>.seqtaxmap, or gzip-compressed as <archive>.seqtaxmap.gz when
+# `gzip` is true.
+#
+# Inputs:
+#   downloadDir  directory the files are stored in
+#   gzip         compress the derived tables (default false)
+#
+# Outputs:
+#   seqTaxMaps    every *.seqtaxmap in downloadDir
+#   seqTaxMapsGz  every *.seqtaxmap.gz in downloadDir
+task downloadAccessionToTaxId {
+    String downloadDir
+    Boolean gzip = false
+
+    command {
+        set -e -o pipefail
+        mkdir -p ${downloadDir}
+        rsync -av --partial rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* ${downloadDir}
+        (cd ${downloadDir} && md5sum -c *.md5)
+        for archive in ${downloadDir}/nucl_*.accession2taxid.gz
+        do
+            zcat $archive \
+                | tail -n +2 \
+                | cut -f 2,3 ${true="| gzip " false='' gzip}> $archive.seqtaxmap${true='.gz' false='' gzip}
+        done
+    }
+
+    output {
+        Array[File] seqTaxMapsGz = glob(downloadDir + "/*.seqtaxmap.gz")
+        Array[File] seqTaxMaps = glob(downloadDir + "/*.seqtaxmap")
+    }
+}