Skip to content
Snippets Groups Projects
ncbi.wdl 4.22 KiB
Newer Older
Ruben Vorderman's avatar
Ruben Vorderman committed
# WDL language version; the tasks in this file use the 1.0 `input` section and tilde-brace placeholders.
version 1.0
Ruben Vorderman's avatar
Ruben Vorderman committed

# Runs the genome download executable and then verifies every downloaded file
# against the MD5SUMS file found in the same folder.
#
# Inputs:
#   outputPath      Passed as --output-folder; all output globs are relative to it.
#   section         Passed as --section when defined (default "refseq").
#   format          Passed as --format when defined (default "all").
#   assemblyLevel   Passed as --assembly-level when defined (default "all").
#   taxId           Passed as --taxid when defined.
#   refseqCategory  Passed as --refseq-category when defined.
#   humanReadable   Adds --human-readable when true.
#   ncbiBaseUri     Passed as --uri when defined.
#   parallel        Passed as --parallel when defined.
#   retries         Passed as --retries when defined.
#   verbose         Adds --verbose when true (default true).
#   debug           Adds --debug when true (default false).
#   domain          Positional argument placed last on the command line (default "all").
#   executable      Program to invoke (default "ncbi-genome-download").
#   preCommand      Optional shell line executed before the download starts.
#
# Outputs: one array per file type, collected with globs three folder levels
# below outputPath.
task GenomeDownload {
    input {
        String outputPath
        String? section = "refseq"
        String? format = "all"
        String? assemblyLevel = "all"
        String? taxId
        String? refseqCategory
        Boolean? humanReadable
        String? ncbiBaseUri
        Int? parallel
        Int? retries
        Boolean verbose = true
        Boolean debug = false
        String? domain = "all"

        String executable = "ncbi-genome-download"
        String? preCommand
    }

    command {
        set -e -o pipefail
        ~{preCommand}
        ~{executable} \
        ~{"--section " + section} \
        ~{"--format " + format} \
        ~{"--assembly-level " + assemblyLevel} \
        ~{"--taxid " + taxId} \
        ~{"--refseq-category " + refseqCategory} \
        ~{"--output-folder " + outputPath} \
        ~{true="--human-readable" false="" humanReadable} \
        ~{"--uri " + ncbiBaseUri} \
        ~{"--parallel " + parallel} \
        ~{"--retries " + retries} \
        ~{true="--verbose" false="" verbose} \
        ~{true="--debug" false="" debug} \
        ~{domain}

        # Check md5sums for all downloaded files.
        # Each folder is handled in a subshell so the cd does not leak into
        # the next iteration.
        for folder in "$(realpath ~{outputPath})"/*/*/*
        do
            (
                cd "$folder"
                # MD5SUMS may list files that were not downloaded, so only the
                # lines belonging to files that are present are collected.
                # grep -F matches the file name literally instead of as a regex.
                md5sums="$(
                    for file in *
                    do
                        if [[ "$file" != "MD5SUMS" ]]
                        then
                            grep -F -- "$file" MD5SUMS
                        fi
                    done
                )"
                # The quotes keep one checksum entry per line; without them all
                # entries are joined on a single line and md5sum cannot parse
                # folders that contain more than one file.
                echo "$md5sums" | md5sum -c
            )
        done
    }

    output {
        Array[File] fastaGzFiles = glob(outputPath + "/*/*/*/*_genomic.fna.gz")
        Array[File] genbankGzFiles = glob(outputPath + "/*/*/*/*_genomic.gbff.gz")
        Array[File] featuresGzFiles = glob(outputPath + "/*/*/*/*_feature_table.txt.gz")
        Array[File] gffGzFiles = glob(outputPath + "/*/*/*/*_genomic.gff.gz")
        Array[File] proteinFastaGzFiles = glob(outputPath + "/*/*/*/*_protein.faa.gz")
        Array[File] genpeptGzFiles = glob(outputPath + "/*/*/*/*_protein.gpff.gz")
        Array[File] wgsGzFiles = glob(outputPath + "/*/*/*/*_wgsmaster.gbff.gz")
        Array[File] cdsFastaGzFiles = glob(outputPath + "/*/*/*/*_cds_from_genomic.fna.gz")
        Array[File] rnaFastaGzFiles = glob(outputPath + "/*/*/*/*_rna_from_genomic.fna.gz")
        Array[File] assemblyReportFiles = glob(outputPath + "/*/*/*/*_assembly_report.txt")
        Array[File] assemblyStatsFiles = glob(outputPath + "/*/*/*/*_assembly_stats.txt")
    }
}


# Fetches nt.gz and its companion files from the NCBI BLAST FASTA directory
# with rsync, verifies the md5 checksum and optionally decompresses the archive.
#
# Inputs:
#   libraryPath    Base folder; also returned as the `library` output.
#   seqTaxMapPath  Not referenced by the command or the outputs of this task;
#                  kept so existing callers keep working.
#   unzip          When true, nt.gz is decompressed to ntFilePath (default true).
#   ntDir          Download folder (default: libraryPath + "/nt").
#   ntFilePath     Target of the decompressed file (default: ntDir + "/nt.fna").
#
# Outputs:
#   ntFileGz       The downloaded ntDir/nt.gz.
#   library        libraryPath.
#   ntFastas       Every *.fna file in ntDir.
#   ntFastasGz     Every nt*.gz file in ntDir.
task DownloadNtFasta {
    input {
        String libraryPath
        String seqTaxMapPath
        Boolean unzip = true
        String ntDir = libraryPath + "/nt"
        String ntFilePath = ntDir + "/nt.fna"
    }

    command {
        set -e -o pipefail
        mkdir -p ~{ntDir}
        # The wildcard is quoted so it reaches the rsync server untouched
        # instead of being subject to local shell globbing.
        rsync -av --partial 'rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz*' ~{ntDir}
        (cd ~{ntDir} && md5sum -c nt.gz.md5)
        # Only unzip when necessary.
        # The placeholder renders to the shell builtin `true` or `false`.
        if ~{true='true' false='false' unzip}
        then
            zcat ~{ntDir}/nt.gz > ~{ntFilePath}
        fi
    }

    output {
        File ntFileGz = ntDir + "/nt.gz"
        File library = libraryPath
        # Added array file to allow for multiple downloads later.
        # Also allows for easier pipeline logic.
        Array[File] ntFastas = glob(ntDir + "/*.fna")
        Array[File] ntFastasGz = glob(ntDir + "/nt*.gz")
    }
}

# Downloads the nucl_*.accession2taxid.gz files (and their companion files
# matched by the trailing wildcard) from the NCBI taxonomy server, verifies
# the md5 checksums and derives one ".seqtaxmap" file per download.
#
# Inputs:
#   downloadDir  Folder that receives the downloads and the derived files.
#   gzip         When true the derived files are gzip-compressed and get a
#                ".gz" suffix (default false).
#
# Outputs:
#   seqTaxMaps    Derived files ending in ".seqtaxmap" (filled when gzip is false).
#   seqTaxMapsGz  Derived files ending in ".seqtaxmap.gz" (filled when gzip is true).
task DownloadAccessionToTaxId {
    input {
        String downloadDir
        Boolean gzip = false
    }

    command {
        set -e -o pipefail
        mkdir -p ~{downloadDir}
        # The wildcard is quoted so it reaches the rsync server untouched
        # instead of being subject to local shell globbing.
        rsync -av \
          --partial \
          'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz*' \
          ~{downloadDir}
        (cd ~{downloadDir} && md5sum -c *.md5)
        for file in ~{downloadDir}/nucl_*.accession2taxid.gz
        do
            # Drop the first line and keep tab-separated columns 2 and 3
            # (presumably the header line and the accession/taxid columns;
            # confirm against the NCBI file format).
            zcat "$file" | tail -n +2 | cut -f 2,3 ~{true="| gzip" false='' gzip} > \
              "$file".seqtaxmap~{true='.gz' false='' gzip}
        done
    }

    output {
        Array[File] seqTaxMaps = glob(downloadDir + "/*.seqtaxmap")
        Array[File] seqTaxMapsGz = glob(downloadDir + "/*.seqtaxmap.gz")
    }
}