diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54c398e0e3348935a99e70380292a0844ada88a9..3fda7d6262af5addd3dd37ac48d909a1187e6e0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,14 @@ that users understand how the changes affect the new version.
 
 version 2.2.0-dev
 ---------------------------
++ Add `GenomicsDBImport` task for GATK.
++ Add `annotationGroups` input to `GenotypeGVCFs` to allow setting multiple
+  annotation groups. The `StandardAnnotation` group is still used as the default.
++ GenotypeGVCFs: only allow one input GVCF file, as the tool also only allows
+  one input file.
++ Rename HaplotypeCallerGvcf to HaplotypeCaller. Add a `gvcf` option to set
+  whether the output should be a GVCF.
++ Centrifuge: Add Krona task specific to Centrifuge.
 + Centrifuge: Fix Centrifuge tests, where sometimes the index files could still not be located.
 + Update parameter_meta for TALON, Centrifuge and Minimap2.
 + Centrifuge: Fix issue where Centrifuge Inspect did not get the correct index files location.
diff --git a/centrifuge.wdl b/centrifuge.wdl
index 909de67b9a564fe4a248f070b4873816cb1d465f..a3e7aeaf1f181017f802dc0edd6fc6c37eb60b08 100644
--- a/centrifuge.wdl
+++ b/centrifuge.wdl
@@ -37,7 +37,7 @@ task Build {
         Int threads = 5
         String memory = "20G"
-        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
+        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
     }
 
     command {
@@ -107,7 +107,7 @@ task Classify {
         Int threads = 4
         String memory = "16G"
-        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
+        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
     }
 
     Map[String, String] inputFormatOptions = {"fastq": "-q", "fasta": "-f", "qseq": "--qseq", "raw": "-r", "sequences": "-c"}
@@ -184,7 +184,7 @@ task Inspect {
         Int? across
         String memory = "4G"
-        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he860b03_3"
+        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
     }
 
     Map[String, String] outputOptions = {"fasta": "", "names": "--names", "summary": "--summary", "conversionTable": "--conversion-table", "taxonomyTree": "--taxonomy-tree", "nameTable": "--name-table", "sizeTable": "--size-table"}
@@ -296,45 +296,100 @@ task DownloadTaxonomy {
 task Kreport {
     input {
-        String? preCommand
-        File centrifugeOut
-        Boolean inputIsCompressed
-        String outputDir
-        String suffix = "kreport"
-        String prefix = "centrifuge"
-        String indexPrefix
-        Boolean? onlyUnique ## removed in 1.0.4
-        Boolean? showZeros
-        Boolean? isCountTable
-        Int? minScore
-        Int? minLength
-
-        Int cores = 1
+        File centrifugeClassification
+        String outputPrefix
+        Array[File]+ indexFiles
+        Boolean noLCA = false
+        Boolean showZeros = false
+        Boolean isCountTable = false
+
+        Int? minimumScore
+        Int? minimumLength
+        String memory = "4G"
+        String dockerImage = "quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5"
     }
-    String kreportFilePath = outputDir + "/" + prefix + "." + suffix
-    command {
-        set -e -o pipefail
-        ~{preCommand}
+    command <<<
+        set -e
+        mkdir -p "$(dirname ~{outputPrefix})"
+        indexBasename="$(basename ~{sub(indexFiles[0], "\.[0-9]\.cf", "")})"
+        for file in ~{sep=" " indexFiles}
+        do
+            ln ${file} $PWD/"$(basename ${file})"
+        done
         centrifuge-kreport \
-        -x ~{indexPrefix} \
-        ~{true="--only-unique" false="" onlyUnique} \
+        -x $PWD/${indexBasename} \
+        ~{true="--no-lca" false="" noLCA} \
         ~{true="--show-zeros" false="" showZeros} \
         ~{true="--is-count-table" false="" isCountTable} \
-        ~{"--min-score " + minScore} \
-        ~{"--min-length " + minLength} \
-        ~{true="<(zcat" false="" inputIsCompressed} ~{centrifugeOut}\
-        ~{true=")" false="" inputIsCompressed} \
-        > ~{kreportFilePath}
+        ~{"--min-score " + minimumScore} \
+        ~{"--min-length " + minimumLength} \
+        ~{centrifugeClassification} \
+        > ~{outputPrefix + "_kreport.tsv"}
+    >>>
+
+    output {
+        File outputKreport = outputPrefix + "_kreport.tsv"
+    }
+
+    runtime {
+        memory: memory
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        centrifugeClassification: {description: "File with Centrifuge classification results.", category: "required"}
+        outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
+        indexFiles: {description: "The files of the index for the reference genomes.", category: "required"}
+        noLCA: {description: "Do not report the LCA of multiple assignments, but report count fractions at the taxa.", category: "advanced"}
+        showZeros: {description: "Show clades that have zero reads.", category: "advanced"}
+        isCountTable: {description: "The format of the file is taxID<tab>COUNT.", category: "advanced"}
+        minimumScore: {description: "Require a minimum score for reads to be counted.", category: "advanced"}
+        minimumLength: {description: "Require a minimum alignment length to the read.", category: "advanced"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputKreport: {description: "File with kraken style report."}
+    }
+}
+
+task KTimportTaxonomy {
+    input {
+        File inputFile
+        String outputPrefix
+
+        String memory = "4G"
+        String dockerImage = "biocontainers/krona:v2.7.1_cv1"
+    }
+
+    command {
+        set -e
+        mkdir -p "$(dirname ~{outputPrefix})"
+        cat ~{inputFile} | cut -f 1,3 > kronaInput.krona
+        ktImportTaxonomy kronaInput.krona
+        cp taxonomy.krona.html ~{outputPrefix + "_krona.html"}
     }
 
     output {
-        File kreport = kreportFilePath
+        File outputKronaPlot = outputPrefix + "_krona.html"
     }
 
     runtime {
-        cpu: cores
         memory: memory
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        inputFile: {description: "File with Centrifuge classification results.", category: "required"}
+        outputPrefix: {description: "Output directory path + output file prefix.", category: "required"}
+        memory: {description: "The amount of memory available to the job.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
+
+        # outputs
+        outputKronaPlot: {description: "Krona taxonomy plot html file."}
     }
 }
diff --git a/gatk.wdl b/gatk.wdl
index eb050f9a197ec5c88bcbab857a49b119a6884ae4..ff17dadcff8286e6e8a6702fa3117f8722a9596a 100644
--- a/gatk.wdl
+++ b/gatk.wdl
@@ -723,15 +723,66 @@ task GatherBqsrReports {
     }
 }
 
+task GenomicsDBImport {
+    input {
+        Array[File] gvcfFiles
+        Array[File] gvcfFilesIndex
+        Array[File]+ intervals
+        String genomicsDBWorkspacePath = "genomics_db"
+        String genomicsDBTarFile = "genomics_db.tar.gz"
+        String? tmpDir
+        String memory = "12G"
+        String javaXmx = "4G"
+        String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
+    }
+
+    command {
+        set -e
+        mkdir -p "$(dirname ~{genomicsDBWorkspacePath})"
+        gatk --java-options -Xmx~{javaXmx} \
+        GenomicsDBImport \
+        -V ~{sep=" -V " gvcfFiles} \
+        --genomicsdb-workspace-path ~{genomicsDBWorkspacePath} \
+        ~{"--tmp-dir " + tmpDir} \
+        -L ~{sep=" -L " intervals}
+        bash -c 'tar -cvzf ~{genomicsDBTarFile} ~{genomicsDBWorkspacePath}/*'
+    }
+
+    output {
+        File genomicsDbTarArchive = genomicsDBTarFile
+    }
+
+    runtime {
+        docker: dockerImage
+        memory: memory
+    }
+
+    parameter_meta {
+        gvcfFiles: {description: "The GVCF files to be merged.", category: "required"}
+        gvcfFilesIndex: {description: "The indexes for the GVCF files.", category: "required"}
+        intervals: {description: "The intervals over which to operate.", category: "required"}
+        genomicsDBWorkspacePath: {description: "Where the genomicsDB files should be stored.", category: "advanced"}
+        genomicsDBTarFile: {description: "Where the .tar file containing the genomicsDB should be stored.", category: "advanced"}
+        tmpDir: {description: "Alternate temporary directory in case there is not enough space. Must be mounted when using containers.",
+                 category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                  category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
+
 task GenotypeGVCFs {
     input {
-        Array[File]+ gvcfFiles
-        Array[File]+ gvcfFilesIndex
+        File gvcfFile
+        File gvcfFileIndex
         Array[File]+ intervals
         String outputPath
         File referenceFasta
         File referenceFastaDict
         File referenceFastaFai
+        Array[String] annotationGroups = ["StandardAnnotation"]
         File? dbsnpVCF
         File? dbsnpVCFIndex
@@ -747,11 +798,10 @@ task GenotypeGVCFs {
         GenotypeGVCFs \
         -R ~{referenceFasta} \
         -O ~{outputPath} \
-        ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \
-        -G StandardAnnotation \
+        ~{"-D " + dbsnpVCF} \
+        ~{true="-G" false="" length(annotationGroups) > 0} ~{sep=" -G " annotationGroups} \
         --only-output-calls-starting-in-intervals \
-        -new-qual \
-        -V ~{sep=' -V ' gvcfFiles} \
+        -V ~{gvcfFile} \
         -L ~{sep=' -L ' intervals}
     }
 
@@ -767,8 +817,8 @@ task GenotypeGVCFs {
     }
 
     parameter_meta {
-        gvcfFiles: {description: "The GVCF files to be genotypes.", category: "required"}
-        gvcfFilesIndex: {description: "The index of the input GVCF files.", category: "required"}
+        gvcfFile: {description: "The GVCF file to be genotyped.", category: "required"}
+        gvcfFileIndex: {description: "The index of the input GVCF file.", category: "required"}
         intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"}
         outputPath: {description: "The location to write the output VCF file to.", category: "required"}
         referenceFasta: {description: "The reference fasta file which was also used for mapping.",
@@ -776,6 +826,7 @@
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
                              category: "required"}
         referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"}
+        annotationGroups: {description: "The annotation groups that will be used for annotation.", category: "advanced"}
         dbsnpVCF: {description: "A dbSNP VCF.", category: "common"}
         dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"}
@@ -839,20 +890,21 @@ task GetPileupSummaries {
 }
 
 # Call variants on a single sample with HaplotypeCaller to produce a GVCF
-task HaplotypeCallerGvcf {
+task HaplotypeCaller {
     input {
         Array[File]+ inputBams
         Array[File]+ inputBamsIndex
         Array[File]+? intervalList
         Array[File]+? excludeIntervalList
-        String gvcfPath
+        String outputPath
         File referenceFasta
         File referenceFastaIndex
         File referenceFastaDict
-        Float contamination = 0.0
+        Float? contamination
         File? dbsnpVCF
         File? dbsnpVCFIndex
         Int? ploidy
+        Boolean gvcf = false
 
         String memory = "12G"
         String javaXmx = "4G"
@@ -861,23 +913,23 @@
     command {
         set -e
-        mkdir -p "$(dirname ~{gvcfPath})"
+        mkdir -p "$(dirname ~{outputPath})"
         gatk --java-options -Xmx~{javaXmx} \
         HaplotypeCaller \
         -R ~{referenceFasta} \
-        -O ~{gvcfPath} \
+        -O ~{outputPath} \
         -I ~{sep=" -I " inputBams} \
         ~{"--sample-ploidy " + ploidy} \
         ~{true="-L" false="" defined(intervalList)} ~{sep=' -L ' intervalList} \
         ~{true="-XL" false="" defined(excludeIntervalList)} ~{sep=' -XL ' excludeIntervalList} \
-        ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \
-        -contamination ~{contamination} \
-        -ERC GVCF
+        ~{"-D " + dbsnpVCF} \
+        ~{"-contamination " + contamination} \
+        ~{true="-ERC GVCF" false="" gvcf}
     }
 
     output {
-        File outputGVCF = gvcfPath
-        File outputGVCFIndex = gvcfPath + ".tbi"
+        File outputVCF = outputPath
+        File outputVCFIndex = outputPath + ".tbi"
     }
 
     runtime {
@@ -890,8 +942,9 @@
         inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"}
         intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
         excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"}
-        gvcfPath: {description: "The location to write the output GVCF to.", category: "required"}
+        outputPath: {description: "The location to write the output to.", category: "required"}
         ploidy: {description: "The ploidy with which the variants should be called.", category: "common"}
+        gvcf: {description: "Whether the output should be a GVCF.", category: "common"}
         referenceFasta: {description: "The reference fasta file which was also used for mapping.",
                          category: "required"}
         referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
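
For reviewers who want to see the changed interfaces in context, here is a minimal usage sketch (not part of the patch) of how the renamed HaplotypeCaller task and the single-GVCF GenotypeGVCFs task could be wired together in a WDL 1.0 workflow. The workflow name, sample file names, and import alias are hypothetical; only task inputs and outputs visible in the diff above are assumed.

version 1.0

import "gatk.wdl" as gatk

workflow HypotheticalSingleSampleCalling {
    input {
        File bam
        File bamIndex
        File referenceFasta
        File referenceFastaFai
        File referenceFastaDict
        Array[File]+ callingIntervals
    }

    # gvcf = true adds -ERC GVCF; the ".g.vcf.gz" suffix here is a convention, not enforced by the task.
    call gatk.HaplotypeCaller as haplotypeCaller {
        input:
            inputBams = [bam],
            inputBamsIndex = [bamIndex],
            outputPath = "sample.g.vcf.gz",
            referenceFasta = referenceFasta,
            referenceFastaIndex = referenceFastaFai,
            referenceFastaDict = referenceFastaDict,
            gvcf = true
    }

    # GenotypeGVCFs now takes exactly one GVCF; annotationGroups defaults to ["StandardAnnotation"].
    call gatk.GenotypeGVCFs as genotypeGvcfs {
        input:
            gvcfFile = haplotypeCaller.outputVCF,
            gvcfFileIndex = haplotypeCaller.outputVCFIndex,
            intervals = callingIntervals,
            outputPath = "sample.vcf.gz",
            referenceFasta = referenceFasta,
            referenceFastaFai = referenceFastaFai,
            referenceFastaDict = referenceFastaDict
    }

    output {
        File gvcf = haplotypeCaller.outputVCF
        File gvcfIndex = haplotypeCaller.outputVCFIndex
    }
}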