version 1.0

# Tasks in this file fetch data from NCBI: genome assemblies via
# ncbi-genome-download, the BLAST nt FASTA archive via rsync, and the
# nucleotide accession2taxid tables via rsync.

# Runs ncbi-genome-download (or the program given in `executable`) and then
# verifies the downloaded files against the MD5SUMS file in each folder.
#
# Inputs:
#   outputPath      Passed as --output-folder; also the root used for the
#                   md5 check and for the output globs.
#   section, format, assemblyLevel, taxId, refseqCategory, ncbiBaseUri,
#   parallel, retries
#                   Each is forwarded as the matching command line option
#                   when defined.
#   humanReadable, verbose, debug
#                   When true, the corresponding flag is added.
#   domain          Positional argument placed last on the command line.
#   executable      Program to invoke (default "ncbi-genome-download").
#   preCommand      Optional shell text executed before the download.
#
# Outputs: arrays of files found three directory levels below outputPath,
# grouped by file name suffix.
task genomeDownload {
    input {
        String outputPath
        String? section = "refseq"
        String? format = "all"
        String? assemblyLevel = "all"
        String? taxId
        String? refseqCategory
        Boolean? humanReadable
        String? ncbiBaseUri
        Int? parallel
        Int? retries
        Boolean? verbose=true
        Boolean? debug
        String? domain = "all"

        String? executable = "ncbi-genome-download"
        String? preCommand
    }

    command {
        set -e -o pipefail
        ~{preCommand}
        ~{executable} \
        ~{"--section " + section} \
        ~{"--format " + format} \
        ~{"--assembly-level " + assemblyLevel } \
        ~{"--taxid " + taxId } \
        ~{"--refseq-category " + refseqCategory} \
        ~{"--output-folder " + outputPath } \
        ~{true="--human-readable" false="" humanReadable} \
        ~{"--uri " + ncbiBaseUri } \
        ~{"--parallel " + parallel } \
        ~{"--retries " + retries } \
        ~{true="--verbose" false="" verbose } \
        ~{true="--debug" false="" debug } \
        ~{domain}

        # Check md5sums for all downloaded files
        for folder in "$(realpath ~{outputPath})"/*/*/*
            do
                # Subshell so the directory change does not leak into the
                # next iteration.
                (
                cd "$folder"
                # Collect only the MD5SUMS entries that belong to files which
                # are actually present in this folder.
                md5sums="$(
                    for file in *
                    do
                        if [[ ! $file == "MD5SUMS" ]]
                        then
                            # -F: match the file name literally, not as a regex.
                            grep -F -- "$file" MD5SUMS
                        fi
                    done
                    )"
                # Quoted so that every checksum entry stays on its own line;
                # md5sum -c expects one entry per line.
                printf '%s\n' "$md5sums" | md5sum -c
                )
            done
    }

    output {
        Array[File] fastaGzFiles = glob(outputPath + "/*/*/*/*_genomic.fna.gz")
        Array[File] genbankGzFiles = glob(outputPath + "/*/*/*/*_genomic.gbff.gz")
        Array[File] featuresGzFiles = glob(outputPath + "/*/*/*/*_feature_table.txt.gz")
        Array[File] gffGzFiles = glob(outputPath + "/*/*/*/*_genomic.gff.gz")
        Array[File] proteinFastaGzFiles = glob(outputPath + "/*/*/*/*_protein.faa.gz")
        Array[File] genpeptGzFiles = glob(outputPath + "/*/*/*/*_protein.gpff.gz")
        Array[File] wgsGzFiles = glob(outputPath + "/*/*/*/*_wgsmaster.gbff.gz")
        Array[File] cdsFastaGzFiles = glob(outputPath + "/*/*/*/*_cds_from_genomic.fna.gz")
        Array[File] rnaFastaGzFiles = glob(outputPath + "/*/*/*/*_rna_from_genomic.fna.gz")
        Array[File] assemblyReportFiles = glob(outputPath + "/*/*/*/*_assembly_report.txt")
        Array[File] assemblyStatsFiles = glob(outputPath + "/*/*/*/*_assembly_stats.txt")
    }
}


# Downloads nt.gz and its companion files from the NCBI BLAST FASTA
# directory with rsync, verifies nt.gz.md5 and optionally decompresses the
# archive.
#
# Inputs:
#   libraryPath     Base directory; the download goes to libraryPath + "/nt"
#                   unless ntDir is overridden.
#   seqTaxMapPath   Not referenced by the command or the outputs of this
#                   task; kept so existing callers stay valid.
#   unzip           When true (default), nt.gz is decompressed to ntFilePath.
#   ntDir           Directory that receives the rsync download.
#   ntFilePath      Destination of the decompressed FASTA.
task downloadNtFasta{
    input {
        String libraryPath
        String seqTaxMapPath
        Boolean? unzip = true
        String ntDir = libraryPath + "/nt"
        String ntFilePath = ntDir + "/nt.fna"
    }

    command {
        set -e -o pipefail
        mkdir -p "~{ntDir}"
        # The URL is quoted so the wildcard reaches rsync untouched instead
        # of being subject to local shell globbing.
        rsync -av --partial 'rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz*' "~{ntDir}"
        (cd "~{ntDir}" && md5sum -c nt.gz.md5)
        # Only unzip when necessary
        if ~{true='true' false='false' unzip}
        then
            zcat "~{ntDir}/nt.gz" > "~{ntFilePath}"
        fi
    }

    output {
        File ntFileGz = ntDir + "/nt.gz"
        File library = libraryPath
        # Added array file to allow for multiple downloads later.
        # Also allows for easier pipeline logic.
        Array[File] ntFastas = glob(ntDir + "/*.fna")
        Array[File] ntFastasGz = glob(ntDir + "/nt*.gz")
    }
}

# Downloads the nucl_*.accession2taxid.gz tables (and companion files) from
# the NCBI taxonomy directory with rsync, verifies them against the
# downloaded .md5 files, and derives a "seqtaxmap" file from each table.
#
# Inputs:
#   downloadDir     Directory that receives the download and the derived
#                   files.
#   gzip            When true, each derived file is gzip-compressed and gets
#                   an extra ".gz" suffix (default false).
#
# Outputs:
#   seqTaxMaps      Derived files ending in ".seqtaxmap".
#   seqTaxMapsGz    Derived files ending in ".seqtaxmap.gz".
task downloadAccessionToTaxId {
    input {
        String downloadDir
        Boolean gzip = false
    }

    command {
        set -e -o pipefail
        mkdir -p "~{downloadDir}"
        # The URL is quoted so the wildcards reach rsync untouched instead
        # of being subject to local shell globbing.
        rsync -av --partial 'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz*' "~{downloadDir}"
        (cd "~{downloadDir}" && md5sum -c *.md5)
        for file in "~{downloadDir}"/nucl_*.accession2taxid.gz
        do
            # Drop the first line and keep columns 2 and 3 of each table.
            zcat "$file" | tail -n +2 | cut -f 2,3 ~{true="| gzip " false='' gzip}> "$file".seqtaxmap~{true='.gz' false='' gzip}
        done
    }

    output {
        Array[File] seqTaxMaps = glob(downloadDir + "/*.seqtaxmap")
        Array[File] seqTaxMapsGz = glob(downloadDir + "/*.seqtaxmap.gz")
    }
}