Commit 38f56c02 authored by Cats

Merge remote-tracking branch 'origin/develop' into UMI-tools

parents e55983c0 9d62a9d9
@@ -11,6 +11,14 @@ that users understand how the changes affect the new version.
version 2.2.0-dev
---------------------------
+ Add `GenomicsDBImport` task for GATK.
+ Add `annotationGroups` input to `GenotypeGVCFs` to allow setting multiple
+  annotation groups. The `StandardAnnotation` group is still used by default.
+ GenotypeGVCFs: allow only one input GVCF file, since the tool itself also accepts
+  only one input file.
+ Rename HaplotypeCallerGvcf to HaplotypeCaller. Add a `gvcf` option to set
+  whether the output should be a GVCF.
+ Centrifuge: Add a Krona task specific to Centrifuge.
+ Centrifuge: Fix the Centrifuge tests, in which the index files sometimes still could not be located.
+ Update parameter_meta for TALON, Centrifuge and Minimap2.
+ Centrifuge: Fix an issue where Centrifuge Inspect did not receive the correct index file location.
@@ -37,7 +37,7 @@ task Build {
Int threads = 5
String memory = "20G"
String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
}
command {
@@ -107,7 +107,7 @@ task Classify {
Int threads = 4
String memory = "16G"
-String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
+String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
}
Map[String, String] inputFormatOptions = {"fastq": "-q", "fasta": "-f", "qseq": "--qseq", "raw": "-r", "sequences": "-c"}
@@ -184,7 +184,7 @@ task Inspect {
Int? across
String memory = "4G"
-String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
+String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
}
Map[String, String] outputOptions = {"fasta": "", "names": "--names", "summary": "--summary", "conversionTable": "--conversion-table", "taxonomyTree": "--taxonomy-tree", "nameTable": "--name-table", "sizeTable": "--size-table"}
@@ -296,45 +296,100 @@ task DownloadTaxonomy {
task Kreport {
input {
-String? preCommand
-File centrifugeOut
-Boolean inputIsCompressed
-String outputDir
-String suffix = "kreport"
-String prefix = "centrifuge"
-String indexPrefix
-Boolean? onlyUnique ## removed in 1.0.4
-Boolean? showZeros
-Boolean? isCountTable
-Int? minScore
-Int? minLength
-Int cores = 1
+File centrifugeClassification
+String outputPrefix
+Array[File]+ indexFiles
+Boolean noLCA = false
+Boolean showZeros = false
+Boolean isCountTable = false
+Int? minimumScore
+Int? minimumLength
String memory = "4G"
+String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
}
-String kreportFilePath = outputDir + "/" + prefix + "." + suffix
-command {
-set -e -o pipefail
-~{preCommand}
+command <<<
+set -e
+mkdir -p "$(dirname ~{outputPrefix})"
+indexBasename="$(basename ~{sub(indexFiles[0], "\.[0-9]\.cf", "")})"
+for file in ~{sep=" " indexFiles}
+do
+ln ${file} $PWD/"$(basename ${file})"
+done
centrifuge-kreport \
--x ~{indexPrefix} \
-~{true="--only-unique" false="" onlyUnique} \
+-x $PWD/${indexBasename} \
+~{true="--no-lca" false="" noLCA} \
~{true="--show-zeros" false="" showZeros} \
~{true="--is-count-table" false="" isCountTable} \
~{"--min-score " + minScore} \
~{"--min-length " + minLength} \
~{true="<(zcat" false="" inputIsCompressed} ~{centrifugeOut}\
~{true=")" false="" inputIsCompressed} \
> ~{kreportFilePath}
~{"--min-score " + minimumScore} \
~{"--min-length " + minimumLength} \
~{centrifugeClassification} \
> ~{outputPrefix + "_kreport.tsv"}
>>>
output {
-File kreport = kreportFilePath
+File outputKreport = outputPrefix + "_kreport.tsv"
}
runtime {
-cpu: cores
memory: memory
+docker: dockerImage
}
+parameter_meta {
+# inputs
+centrifugeClassification: {description: "File with Centrifuge classification results.", category: "required"}
+outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
+indexFiles: {description: "The files of the index for the reference genomes.", category: "required"}
+noLCA: {description: "Do not report the LCA of multiple assignments, but report count fractions at the taxa.", category: "advanced"}
+showZeros: {description: "Show clades that have zero reads.", category: "advanced"}
+isCountTable: {description: "The format of the file is taxID<tab>COUNT.", category: "advanced"}
+minimumScore: {description: "Require a minimum score for reads to be counted.", category: "advanced"}
+minimumLength: {description: "Require a minimum alignment length to the read.", category: "advanced"}
+memory: {description: "The amount of memory available to the job.", category: "advanced"}
+dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+# outputs
+outputKreport: {description: "File with kraken style report."}
+}
}
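For orientation, here is a minimal sketch of how the reworked Kreport task could be called; the import path, workflow name and all file paths are hypothetical and not part of this commit.

version 1.0

import "centrifuge.wdl" as centrifuge  # hypothetical import path

workflow KreportSketch {
    call centrifuge.Kreport {
        input:
            # Classification file produced by an earlier Classify call (hypothetical path).
            centrifugeClassification = "sample1_classification.tsv",
            outputPrefix = "results/sample1",
            # All index files must be passed; the task derives the index basename
            # from the first one and hard-links them into the working directory.
            indexFiles = ["index/ref.1.cf", "index/ref.2.cf", "index/ref.3.cf"]
    }
}

Passing the index files themselves (instead of the old indexPrefix string) lets the execution engine localize them, which is exactly what the hard-link loop in the new command section relies on.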
+task KTimportTaxonomy {
+input {
+File inputFile
+String outputPrefix
+String memory = "4G"
+String dockerImage = "biocontainers/krona:v2.7.1_cv1"
+}
+command {
+set -e
+mkdir -p "$(dirname ~{outputPrefix})"
+cat ~{inputFile} | cut -f 1,3 > kronaInput.krona
+ktImportTaxonomy kronaInput.krona
+cp taxonomy.krona.html ~{outputPrefix + "_krona.html"}
+}
+output {
+File outputKronaPlot = outputPrefix + "_krona.html"
+}
+runtime {
+memory: memory
+docker: dockerImage
+}
+parameter_meta {
+# inputs
+inputFile: {description: "File with Centrifuge classification results.", category: "required"}
+outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
+memory: {description: "The amount of memory available to the job.", category: "advanced"}
+dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+# outputs
+outputKronaPlot: {description: "Krona taxonomy plot html file."}
+}
+}
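Continuing the sketch above, the new Krona task consumes the same classification file; the paths remain hypothetical.

    call centrifuge.KTimportTaxonomy {
        input:
            inputFile = "sample1_classification.tsv",  # hypothetical Centrifuge output
            outputPrefix = "results/sample1"
    }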
@@ -723,15 +723,66 @@ task GatherBqsrReports {
}
}
+task GenomicsDBImport {
+input {
+Array[File] gvcfFiles
+Array[File] gvcfFilesIndex
+Array[File]+ intervals
+String genomicsDBWorkspacePath = "genomics_db"
+String genomicsDBTarFile = "genomics_db.tar.gz"
+String? tmpDir
+String memory = "12G"
+String javaXmx = "4G"
+String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+}
+command {
+set -e
+mkdir -p "$(dirname ~{genomicsDBWorkspacePath})"
+gatk --java-options -Xmx~{javaXmx} \
+GenomicsDBImport \
+-V ~{sep=" -V " gvcfFiles} \
+--genomicsdb-workspace-path ~{genomicsDBWorkspacePath} \
+~{"--tmp-dir " + tmpDir} \
+-L ~{sep=" -L " intervals}
+bash -c 'tar -cvzf ~{genomicsDBTarFile} ~{genomicsDBWorkspacePath}/*'
+}
+output {
+File genomicsDbTarArchive = genomicsDBTarFile
+}
+runtime {
+docker: dockerImage
+memory: memory
+}
+parameter_meta {
+gvcfFiles: {description: "The GVCF files to be merged.", category: "required"}
+gvcfFilesIndex: {description: "Indexes for the GVCF files.", category: "required"}
+intervals: {description: "The intervals over which to operate.", category: "required"}
+genomicsDBWorkspacePath: {description: "Where the genomicsDB files should be stored.", category: "advanced"}
+genomicsDBTarFile: {description: "Where the .tar file containing the genomicsDB should be stored.", category: "advanced"}
+tmpDir: {description: "Alternate temporary directory in case there is not enough space. Must be mounted when using containers.",
+category: "advanced"}
+memory: {description: "The amount of memory this job will use.", category: "advanced"}
+javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+category: "advanced"}
+dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+category: "advanced"}
+}
+}
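A minimal sketch of calling the new task, assuming it lives in an importable `gatk.wdl`; the sample and interval paths are hypothetical.

version 1.0

import "gatk.wdl" as gatk  # hypothetical import path

workflow GenomicsDBSketch {
    call gatk.GenomicsDBImport {
        input:
            gvcfFiles = ["sample1.g.vcf.gz", "sample2.g.vcf.gz"],
            gvcfFilesIndex = ["sample1.g.vcf.gz.tbi", "sample2.g.vcf.gz.tbi"],
            intervals = ["chr20.interval_list"]
    }
    # The workspace comes back as a tar archive (genomicsDbTarArchive), since a
    # GenomicsDB workspace is a directory tree and WDL outputs must be files.
}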
task GenotypeGVCFs {
input {
-Array[File]+ gvcfFiles
-Array[File]+ gvcfFilesIndex
+File gvcfFile
+File gvcfFileIndex
Array[File]+ intervals
String outputPath
File referenceFasta
File referenceFastaDict
File referenceFastaFai
+Array[String] annotationGroups = ["StandardAnnotation"]
File? dbsnpVCF
File? dbsnpVCFIndex
@@ -747,11 +798,10 @@ task GenotypeGVCFs {
GenotypeGVCFs \
-R ~{referenceFasta} \
-O ~{outputPath} \
~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \
-G StandardAnnotation \
~{"-D " + dbsnpVCF} \
~{true="-G" false="" length(annotationGroups) > 0} ~{sep=" -G " annotationGroups} \
--only-output-calls-starting-in-intervals \
-new-qual \
--V ~{sep=' -V ' gvcfFiles} \
+-V ~{gvcfFile} \
-L ~{sep=' -L ' intervals}
}
@@ -767,8 +817,8 @@ task GenotypeGVCFs {
}
parameter_meta {
-gvcfFiles: {description: "The GVCF files to be genotypes.", category: "required"}
-gvcfFilesIndex: {description: "The index of the input GVCF files.", category: "required"}
+gvcfFile: {description: "The GVCF file to be genotyped.", category: "required"}
+gvcfFileIndex: {description: "The index of the input GVCF file.", category: "required"}
intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"}
outputPath: {description: "The location to write the output VCF file to.", category: "required"}
referenceFasta: {description: "The reference fasta file which was also used for mapping.",
@@ -776,6 +826,7 @@ task GenotypeGVCFs {
referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
category: "required"}
referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+annotationGroups: {description: "Which annotation groups will be used for the annotation.", category: "advanced"}
dbsnpVCF: {description: "A dbSNP VCF.", category: "common"}
dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"}
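Continuing the sketch above, the reworked single-GVCF interface with an extra annotation group; `AS_StandardAnnotation` is one plausible addition, and all paths remain hypothetical.

    call gatk.GenotypeGVCFs {
        input:
            gvcfFile = "sample1.g.vcf.gz",
            gvcfFileIndex = "sample1.g.vcf.gz.tbi",
            intervals = ["chr20.interval_list"],
            outputPath = "results/sample1.vcf.gz",
            referenceFasta = "reference.fasta",
            referenceFastaDict = "reference.dict",
            referenceFastaFai = "reference.fasta.fai",
            # Defaults to ["StandardAnnotation"]; each entry becomes a -G flag.
            annotationGroups = ["StandardAnnotation", "AS_StandardAnnotation"]
    }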
@@ -839,20 +890,21 @@ task GetPileupSummaries {
}
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
-task HaplotypeCallerGvcf {
+task HaplotypeCaller {
input {
Array[File]+ inputBams
Array[File]+ inputBamsIndex
Array[File]+? intervalList
Array[File]+? excludeIntervalList
-String gvcfPath
+String outputPath
File referenceFasta
File referenceFastaIndex
File referenceFastaDict
-Float contamination = 0.0
+Float? contamination
File? dbsnpVCF
File? dbsnpVCFIndex
Int? ploidy
+Boolean gvcf = false
String memory = "12G"
String javaXmx = "4G"
@@ -861,23 +913,23 @@ task HaplotypeCallerGvcf {
command {
set -e
mkdir -p "$(dirname ~{gvcfPath})"
mkdir -p "$(dirname ~{outputPath})"
gatk --java-options -Xmx~{javaXmx} \
HaplotypeCaller \
-R ~{referenceFasta} \
--O ~{gvcfPath} \
+-O ~{outputPath} \
-I ~{sep=" -I " inputBams} \
~{"--sample-ploidy " + ploidy} \
~{true="-L" false="" defined(intervalList)} ~{sep=' -L ' intervalList} \
~{true="-XL" false="" defined(excludeIntervalList)} ~{sep=' -XL ' excludeIntervalList} \
~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \
-contamination ~{contamination} \
-ERC GVCF
~{"-D" + dbsnpVCF} \
~{"--contamination-fraction-per-sample-file " + contamination} \
~{true="-ERC GVCF" false="" gvcf}
}
output {
-File outputGVCF = gvcfPath
-File outputGVCFIndex = gvcfPath + ".tbi"
+File outputVCF = outputPath
+File outputVCFIndex = outputPath + ".tbi"
}
runtime {
@@ -890,8 +942,9 @@ task HaplotypeCallerGvcf {
inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"}
intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"}
-gvcfPath: {description: "The location to write the output GVCF to.", category: "required"}
+outputPath: {description: "The location to write the output to.", category: "required"}
ploidy: {description: "The ploidy with which the variants should be called.", category: "common"}
+gvcf: {description: "Whether the output should be a GVCF.", category: "common"}
referenceFasta: {description: "The reference fasta file which was also used for mapping.",
category: "required"}
referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
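Finally, a sketch of the renamed task in GVCF mode, with hypothetical paths as before:

    call gatk.HaplotypeCaller {
        input:
            inputBams = ["sample1.bam"],
            inputBamsIndex = ["sample1.bai"],
            outputPath = "results/sample1.g.vcf.gz",
            referenceFasta = "reference.fasta",
            referenceFastaIndex = "reference.fasta.fai",
            referenceFastaDict = "reference.dict",
            gvcf = true  # adds -ERC GVCF; leave at false for a plain VCF
    }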