From 163290340ff4f5ed0488c69d2c194dbb3428a423 Mon Sep 17 00:00:00 2001 From: JasperBoom <jboom@infernum.nl> Date: Mon, 2 Nov 2020 11:57:06 +0100 Subject: [PATCH] Add another batch of updated tasks. --- CHANGELOG.md | 4 + centrifuge.wdl | 4 +- ncbi.wdl | 51 ++++---- pbbam.wdl | 10 +- pbmm2.wdl | 13 ++- picard.wdl | 312 +++++++++++++++++++++++-------------------------- rtg.wdl | 79 ++++++------- sambamba.wdl | 57 +++++---- 8 files changed, 257 insertions(+), 273 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c331112..f0dfaf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ that users understand how the changes affect the new version. version 5.0.0-dev --------------------------- ++ Picard: Add parameter_meta to `SortSam`. ++ pbmm2: Add parameter_meta for `sample`. ++ Centrifuge: Rename output in task `KReport` to `KrakenReport` to resolve + name collision with task name. + Bwa & bwa-mem2: Add parameter_meta for `outputHla`. + Multiqc: Removed WDL_AID excludes of "finished" & "dependencies" inputs. + Bam2fastx: Add localisation of input files to Bam2Fasta task. diff --git a/centrifuge.wdl b/centrifuge.wdl index 1637abd..07dc7f8 100644 --- a/centrifuge.wdl +++ b/centrifuge.wdl @@ -270,7 +270,7 @@ task KReport { >>> output { - File KReport = outputPrefix + "_kreport.tsv" + File KrakenReport = outputPrefix + "_kreport.tsv" } runtime { @@ -294,7 +294,7 @@ task KReport { dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - KReport: {description: "File with kraken style report."} + KrakenReport: {description: "File with kraken style report."} } } diff --git a/ncbi.wdl b/ncbi.wdl index d157d90..da753ba 100644 --- a/ncbi.wdl +++ b/ncbi.wdl @@ -23,6 +23,10 @@ version 1.0 task GenomeDownload { input { String outputPath + Boolean verbose = true + Boolean debug = false + String executable = "ncbi-genome-download" + String? section = "refseq" String? format = "all" String? assemblyLevel = "all" @@ -32,11 +36,7 @@ task GenomeDownload { String? ncbiBaseUri Int? parallel Int? retries - Boolean verbose = true - Boolean debug = false String? domain = "all" - - String executable = "ncbi-genome-download" String? preCommand } @@ -58,22 +58,22 @@ task GenomeDownload { ~{true="--debug" false ="" debug } \ ~{domain} - # Check md5sums for all downloaded files + # Check md5sums for all downloaded files. for folder in $(realpath ~{outputPath})/*/*/* - do - ( - md5sums="$( - cd $folder - for file in * - do - if [[ ! $file == "MD5SUMS" ]] - then - grep $file MD5SUMS - fi - done - )" - cd $folder; echo $md5sums | md5sum -c) - done + do + ( + md5sums="$( + cd $folder + for file in * + do + if [[ ! $file == "MD5SUMS" ]] + then + grep $file MD5SUMS + fi + done + )" + cd $folder; echo $md5sums | md5sum -c) + done } output { @@ -106,7 +106,7 @@ task DownloadNtFasta{ mkdir -p ~{ntDir} rsync -av --partial rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz* ~{ntDir} (cd ~{ntDir} && md5sum -c nt.gz.md5) - # Only unzip when necessary + # Only unzip when necessary. 
if ~{true='true' false='false' unzip} then zcat ~{ntDir}/nt.gz > ~{ntFilePath} @@ -132,15 +132,16 @@ task DownloadAccessionToTaxId { command { set -e -o pipefail mkdir -p ~{downloadDir} - rsync -av \ - --partial \ - rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* \ - ~{downloadDir} + rsync \ + -av \ + --partial \ + rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* \ + ~{downloadDir} (cd ~{downloadDir} && md5sum -c *.md5) for file in ~{downloadDir}/nucl_*.accession2taxid.gz do zcat $file | tail -n +2 | cut -f 2,3 ~{true="| gzip" false='' gzip} > \ - $file.seqtaxmap~{true='.gz' false='' gzip} + $file.seqtaxmap~{true='.gz' false='' gzip} done } diff --git a/pbbam.wdl b/pbbam.wdl index 52737a0..d271a11 100644 --- a/pbbam.wdl +++ b/pbbam.wdl @@ -18,12 +18,14 @@ version 1.0 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. task Index { input { File bamFile + String? outputBamPath - + String memory = "2G" Int timeMinutes = 1 + ceil(size(bamFile, "G") * 4) String dockerImage = "quay.io/biocontainers/pbbam:1.6.0--h5b7e6e0_0" @@ -60,11 +62,9 @@ task Index { parameter_meta { # inputs bamFile: {description: "The BAM file for which an index should be made.", category: "required"} - outputBamPath: {description: "The location where the BAM file should be written to. The index will appear alongside this link to the BAM file.", - category: "common"} + outputBamPath: {description: "The location where the BAM file should be written to. The index will appear alongside this link to the BAM file.", category: "common"} memory: {description: "The amount of memory needed for the job.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } diff --git a/pbmm2.wdl b/pbmm2.wdl index 31d4c66..5fda1c8 100644 --- a/pbmm2.wdl +++ b/pbmm2.wdl @@ -8,10 +8,10 @@ version 1.0 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -58,9 +58,10 @@ task Mapping { } parameter_meta { + # inputs presetOption: {description: "This option applies multiple options at the same time.", category: "required"} sort: {description: "Sort the output bam file.", category: "advanced"} - sample: {description: "Name of the sample"} + sample: {description: "Name of the sample.", category: "required"} referenceMMI: {description: "MMI file for the reference.", category: "required"} queryFile: {description: "BAM file with reads to align against the reference.", category: "required"} cores: {description: "The number of cores to be used.", category: "advanced"} @@ -68,7 +69,7 @@ task Mapping { timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - # output + # outputs outputAlignmentFile: {description: "Mapped bam file."} outputIndexFile: {description: "Bam index file."} } diff --git a/picard.wdl b/picard.wdl index 49db8b8..f1876f7 100644 --- a/picard.wdl +++ b/picard.wdl @@ -26,8 +26,8 @@ task BedToIntervalList { File dict String outputPath = "regions.interval_list" - String memory = "4G" String javaXmx = "3G" + String memory = "4G" Int timeMinutes = 5 String dockerImage = "quay.io/biocontainers/picard:2.20.5--0" } @@ -47,9 +47,9 @@ task BedToIntervalList { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -57,12 +57,10 @@ task BedToIntervalList { bedFile: {description: "A bed file.", category: "required"} dict: {description: "A sequence dict file.", category: "required"} outputPath: {description: "The location the output interval list should be written to.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -74,17 +72,19 @@ task CollectHsMetrics { File referenceFastaDict File referenceFastaFai File targets - File? baits String basename + File? baits + # Use the targets file as baits as a fallback, since often the baits # for a certain capture kit are not available. File baitsFile = select_first([baits, targets]) File targetsFile = targets - Int memoryMb = javaXmxMb + 512 Int javaXmxMb = 3072 - # Additional * 2 because picard multiple metrics reads the reference fasta twice. + Int memoryMb = javaXmxMb + 512 + # Additional * 2 because picard multiple metrics reads the + # reference fasta twice. 
Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -106,9 +106,9 @@ task CollectHsMetrics { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: "~{memoryMb}M" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -116,18 +116,15 @@ task CollectHsMetrics { inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"} inputBamIndex: {description: "The index of the input BAM file.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} targets: {description: "Picard interval file of the capture targets.", category: "required"} - baits: {description: "Picard interval file of the capture bait set.", category: "advanced"} basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} + baits: {description: "Picard interval file of the capture bait set.", category: "advanced"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} - javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -139,7 +136,6 @@ task CollectMultipleMetrics { File referenceFastaDict File referenceFastaFai String basename - Boolean collectAlignmentSummaryMetrics = true Boolean collectInsertSizeMetrics = true Boolean qualityScoreDistribution = true @@ -150,14 +146,13 @@ task CollectMultipleMetrics { Boolean collectSequencingArtifactMetrics = true Boolean collectQualityYieldMetrics = true - Int memoryMb = javaXmxMb + 512 Int javaXmxMb = 3072 + Int memoryMb = javaXmxMb + 512 # Additional * 2 because picard multiple metrics reads the reference fasta twice. 
Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } - command { set -e mkdir -p "$(dirname ~{basename})" @@ -173,8 +168,7 @@ task CollectMultipleMetrics { ~{true="PROGRAM=MeanQualityByCycle" false="" meanQualityByCycle} \ ~{true="PROGRAM=CollectBaseDistributionByCycle" false="" collectBaseDistributionByCycle} \ ~{true="PROGRAM=CollectGcBiasMetrics" false="" collectGcBiasMetrics} \ - ~{true="PROGRAM=CollectSequencingArtifactMetrics" false="" - collectSequencingArtifactMetrics} \ + ~{true="PROGRAM=CollectSequencingArtifactMetrics" false="" collectSequencingArtifactMetrics} \ ~{true="PROGRAM=CollectQualityYieldMetrics" false="" collectQualityYieldMetrics} } @@ -221,9 +215,9 @@ task CollectMultipleMetrics { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: "~{memoryMb}M" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -231,30 +225,21 @@ task CollectMultipleMetrics { inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"} inputBamIndex: {description: "The index of the input BAM file.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} - collectAlignmentSummaryMetrics: {description: "Equivalent to the `PROGRAM=CollectAlignmentSummaryMetrics` argument.", - category: "advanced"} - collectInsertSizeMetrics: {description: "Equivalent to the `PROGRAM=CollectInsertSizeMetrics` argument.", - category: "advanced"} - qualityScoreDistribution: {description: "Equivalent to the `PROGRAM=QualityScoreDistribution` argument.", - category: "advanced"} + collectAlignmentSummaryMetrics: {description: "Equivalent to the `PROGRAM=CollectAlignmentSummaryMetrics` argument.", category: "advanced"} + collectInsertSizeMetrics: {description: "Equivalent to the `PROGRAM=CollectInsertSizeMetrics` argument.", category: "advanced"} + qualityScoreDistribution: {description: "Equivalent to the `PROGRAM=QualityScoreDistribution` argument.", category: "advanced"} meanQualityByCycle: {description: "Equivalent to the `PROGRAM=MeanQualityByCycle` argument.", category: "advanced"} - collectBaseDistributionByCycle: {description: "Equivalent to the `PROGRAM=CollectBaseDistributionByCycle` argument.", - category: "advanced"} + collectBaseDistributionByCycle: {description: "Equivalent to the `PROGRAM=CollectBaseDistributionByCycle` argument.", category: "advanced"} collectGcBiasMetrics: {description: "Equivalent to the `PROGRAM=CollectGcBiasMetrics` argument.", category: "advanced"} - collectSequencingArtifactMetrics: {description: "Equivalent to the `PROGRAM=CollectSequencingArtifactMetrics` argument.", - category: "advanced"} - collectQualityYieldMetrics: {description: "Equivalent to the `PROGRAM=CollectQualityYieldMetrics` argument.", - category: "advanced"} + collectSequencingArtifactMetrics: {description: "Equivalent to the `PROGRAM=CollectSequencingArtifactMetrics` argument.", category: 
"advanced"} + collectQualityYieldMetrics: {description: "Equivalent to the `PROGRAM=CollectQualityYieldMetrics` argument.", category: "advanced"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} - javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -266,9 +251,9 @@ task CollectRnaSeqMetrics { String basename String strandSpecificity = "NONE" - String memory = "9G" String javaXmx = "8G" - # With 6 minutes per G there were several timeouts. + String memory = "9G" + # With 6 minutes per G there were several timeouts. Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -286,14 +271,14 @@ task CollectRnaSeqMetrics { } output { - File? chart = basename + ".RNA_Metrics.pdf" File metrics = basename + ".RNA_Metrics" + File? chart = basename + ".RNA_Metrics.pdf" } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -302,15 +287,11 @@ task CollectRnaSeqMetrics { inputBamIndex: {description: "The index of the input BAM file.", category: "required"} refRefflat: {description: "A refflat file containing gene annotations.", catehory: "required"} basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} - strandSpecificity: {description: "Equivalent to the `STRAND_SPECIFICITY` option of picard's CollectRnaSeqMetrics.", - category: "common"} - + strandSpecificity: {description: "Equivalent to the `STRAND_SPECIFICITY` option of picard's CollectRnaSeqMetrics.", category: "common"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -325,8 +306,8 @@ task CollectTargetedPcrMetrics { Array[File]+ targetIntervals String basename - String memory = "4G" String javaXmx = "3G" + String javaXmx = "3G" String memory = "4G" Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -352,9 +333,9 @@ } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -362,21 +343,15 @@ # inputs inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"} inputBamIndex: {description: "The index of the input BAM file.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - ampliconIntervals: {description: "An interval list describinig the coordinates of the amplicons sequenced.", - category: "required"} - targetIntervals: {description: "An interval list describing the coordinates of the targets sequenced.", - category: "required"} + ampliconIntervals: {description: "An interval list describing the coordinates of the amplicons sequenced.", category: "required"} + targetIntervals: {description: "An interval list describing the coordinates of the targets sequenced.", category: "required"} basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} - + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -388,8 +363,8 @@ task CollectVariantCallingMetrics { File inputVCFIndex String basename - String memory = "9G" String javaXmx = "8G" + String memory = "9G" Int timeMinutes = 1440 String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -410,24 +385,22 @@ task CollectVariantCallingMetrics { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { # inputs dbsnp: {description: "DBSNP vcf file to use with CollectVariantCallingMetrics.", category: "required"} dbsnpIndex: {description: "Index file for the DBSNP VCF.", category: "required"} - inputVCF: {description: "Input VCF file", category: "required"} + inputVCF: {description: "Input VCF file.", category: "required"} inputVCFIndex: {description: "Index file for the input VCF.", category: "required"} basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -436,8 +409,8 @@ task CreateSequenceDictionary { File inputFile String outputDir - String memory = "3G" String javaXmx = "2G" + String memory = "3G" String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -464,8 +437,8 @@ task CreateSequenceDictionary { # inputs inputFile: {description: "The input fasta file.", category: "required"} outputDir: {description: "Output directory path.", category: "required"} - memory: {description: "The amount of memory available to the job.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs @@ -473,17 +446,19 @@ task CreateSequenceDictionary { } } -# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs +# Combine multiple recalibrated BAM files from scattered +# ApplyRecalibration runs. task GatherBamFiles { input { Array[File]+ inputBams Array[File]+ inputBamsIndex String outputBamPath + Boolean createMd5File = false - Int memoryMb = javaXmxMb + 512 - Int javaXmxMb = 1024 Int? compressionLevel - Boolean createMd5File = false + + Int javaXmxMb = 1024 + Int memoryMb = javaXmxMb + 512 # One minute per input gigabyte. 
Int timeMinutes = 1 + ceil(size(inputBams, "G") * 1) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" @@ -508,9 +483,9 @@ task GatherBamFiles { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: "~{memoryMb}M" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -518,14 +493,12 @@ task GatherBamFiles { inputBams: {description: "The BAM files to be merged together.", category: "required"} inputBamsIndex: {description: "The indexes of the input BAM files.", category: "required"} outputBamPath: {description: "The path where the merged BAM file will be written.", caregory: "required"} - compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} createMd5File: {decription: "Whether to create an md5 file of the output BAM.", category: "advanced"} + compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} - javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -535,8 +508,8 @@ task GatherVcfs { Array[File]+ inputVcfIndexes String outputVcfPath = "out.vcf.gz" - String memory = "5G" String javaXmx = "4G" + String memory = "5G" Int timeMinutes = 1 + ceil(size(inputVcfs, "G") * 2) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -555,9 +528,9 @@ task GatherVcfs { } runtime { - docker: dockerImage memory: memory time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -565,17 +538,14 @@ task GatherVcfs { inputVcfs: {description: "The VCF files to be merged together.", category: "required"} inputVcfIndexes: {description: "The indexes of the input VCF files.", category: "required"} outputVcfPath: {description: "The path where the merged VCF file will be written.", caregory: "required"} - + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } -# Mark duplicate reads to avoid counting non-independent observations +# Mark duplicate reads to avoid counting non-independent observations. task MarkDuplicates { input { Array[File]+ inputBams @@ -583,31 +553,32 @@ task MarkDuplicates { String metricsPath Int compressionLevel = 1 Boolean createMd5File = false - Boolean useJdkInflater = true # Slightly faster than the intel one. - # Better results for compression level 1 (much smaller). Higher compression levels similar to intel deflater. - # NOTE: this might change in the future when the intel deflater is updated! + Boolean useJdkInflater = true # Slightly faster than the intel one. + # Better results for compression level 1 (much smaller). + # Higher compression levels similar to intel deflater. + # NOTE: this might change in the future when the intel + # deflater is updated! Boolean useJdkDeflater = true - # In GATK Best practices pipeline MarkDuplicates is given a 7G VM. + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection. + # This can be desirable if you don't mind the estimated library size + # being wrong and optical duplicate detection is taking >7 days and failing. + String? read_name_regex + + # In GATK Best practices pipeline MarkDuplicates is given a 7G VM. # https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L1040 Int javaXmxMb = 6656 # 6.5G String memoryMb = javaXmxMb + 512 Int timeMinutes = 1 + ceil(size(inputBams, "G") * 8) String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" - - # The program default for READ_NAME_REGEX is appropriate in nearly every case. - # Sometimes we wish to supply "null" in order to turn off optical duplicate detection - # This can be desirable if you don't mind the estimated library size being wrong and - # optical duplicate detection is taking >7 days and failing - String? read_name_regex } # Task is assuming query-sorted input so that the Secondary and Supplementary reads get # marked correctly. This works because the output of BWA is query-grouped and therefore, # so is the output of MergeBamAlignment. While query-grouped isn't actually query-sorted, - # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" - + # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname". 
command { set -e mkdir -p "$(dirname ~{outputBamPath})" @@ -625,7 +596,7 @@ task MarkDuplicates { ADD_PG_TAG_TO_READS=false \ CREATE_MD5_FILE=~{true="true" false="false" createMd5File} \ USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ - USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} } output { @@ -636,9 +607,9 @@ task MarkDuplicates { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: "~{memoryMb}M" + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -646,42 +617,39 @@ task MarkDuplicates { inputBams: {description: "The BAM files for which the duplicate reads should be marked.", category: "required"} outputBamPath: {description: "The location where the ouptut BAM file should be written.", category: "required"} metricsPath: {description: "The location where the output metrics file should be written.", category: "required"} - read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"} useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} - compressionLevel: {description: "The compression level at which the BAM files are written", category: "advanced"} + read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} - javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } -# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs. task MergeVCFs { input { Array[File]+ inputVCFs Array[File]+ inputVCFsIndexes String outputVcfPath - - String memory = "5G" - String javaXmx = "4G" - Int timeMinutes = 1 + ceil(size(inputVCFs, "G")) * 2 - String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" Int compressionLevel = 1 - Boolean useJdkInflater = true # Slightly faster than the intel one. - # Better results for compression level 1 (much smaller). Higher compression levels similar to intel deflater. + Boolean useJdkInflater = true # Slightly faster than the intel one. + # Better results for compression level 1 (much smaller). + # Higher compression levels similar to intel deflater. 
# NOTE: this might change in the future when the intel deflater is updated! Boolean useJdkDeflater = true + String javaXmx = "4G" + String memory = "5G" + Int timeMinutes = 1 + ceil(size(inputVCFs, "G")) * 2 + String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } - # Using MergeVcfs instead of GatherVcfs so we can create indices - # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket - + # Using MergeVcfs instead of GatherVcfs so we can create indices. + # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket. command { set -e mkdir -p "$(dirname ~{outputVcfPath})" @@ -691,7 +659,7 @@ task MergeVCFs { OUTPUT=~{outputVcfPath} \ COMPRESSION_LEVEL=~{compressionLevel} \ USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ - USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} } output { @@ -700,9 +668,9 @@ task MergeVCFs { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -710,16 +678,13 @@ # inputs inputVCFs: {description: "The VCF files to be merged.", category: "required"} inputVCFsIndexes: {description: "The indexes of the VCF files.", category: "required"} outputVcfPath: {description: "The location the output VCF file should be written to.", category: "required"} - - memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + compressionLevel: {description: "The compression level at which the VCF files are written.", category: "advanced"} useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} - compressionLevel: {description: "The compression level at which the BAM files are written", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -729,10 +694,12 @@ task SamToFastq { input { File inputBam File inputBamIndex Boolean paired = true - String memory = "17G" String javaXmx = "16G" # High memory default to avoid crashes. + String javaXmx = "16G" # High memory default to avoid crashes. + String memory = "17G" + Int timeMinutes = 30 String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" - File? NONE + + File? noneFile } String outputRead1 = basename(inputBam, "\.[bs]am") + "_R1.fastq.gz" @@ -751,13 +718,20 @@ output { File read1 = outputRead1 - File? read2 = if paired then outputRead2 else NONE - File? 
unpairedRead = if paired then outputUnpaired else NONE + File? read2 = if paired then outputRead2 else noneFile + File? unpairedRead = if paired then outputUnpaired else noneFile } runtime { - docker: dockerImage memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + meta { + WDL_AID: { + exclude: ["noneFile"] + } } } @@ -766,8 +740,8 @@ task ScatterIntervalList { File interval_list Int scatter_count - String memory = "4G" String javaXmx = "3G" + String memory = "4G" String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" } @@ -790,8 +764,8 @@ task ScatterIntervalList { } runtime { - docker: dockerImage memory: memory + docker: dockerImage } } @@ -804,7 +778,7 @@ task SortSam { Int maxRecordsInRam = 500000 Int compressionLevel = 1 - # Default ram of 4 GB. Using 125001.0 to prevent an answer of + # Default ram of 4 GB. Using 125001.0 to prevent an answer of # 4.000000001 which gets rounded to 5. # GATK Best practices uses 75000 here: https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L778 Int XmxGb = ceil(maxRecordsInRam / 125001.0) @@ -840,13 +814,16 @@ task SortSam { } parameter_meta { - inputBam: {description: "The unsorted input BAM file", category: "required"} + # inputs + inputBam: {description: "The unsorted input BAM file.", category: "required"} outputPath: {description: "The location the output BAM file should be written to.", category: "required"} - XmxGb: {description: "The maximum memory available to picard SortSam. Should be lower than `memory` to accommodate JVM overhead and BWA mem's memory usage.", - category: "advanced"} + sortByName: {description: "Sort the output file by name, default is position.", category: "advanced"} + createMd5File: {description: "Whether to create an MD5 digest for any BAM or FASTQ files created.", category: "advanced"} + maxRecordsInRam: {description: "This will specify the number of records stored in RAM before spilling to disk.", category: "advanced"} + compressionLevel: {description: "Compression level for all compressed files created.", category: "advanced"} + XmxGb: {description: "The maximum memory available to picard SortSam. Should be lower than `memory` to accommodate JVM overhead and BWA mem's memory usage.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -854,10 +831,11 @@ task SortVcf { input { Array[File]+ vcfFiles String outputVcfPath + File? 
dict - String memory = "9G" String javaXmx = "8G" + String memory = "9G" Int timeMinutes = 1 + ceil(size(vcfFiles, "G") * 5) String dockerImage = "quay.io/biocontainers/picard:2.20.5--0" } @@ -879,9 +857,9 @@ task SortVcf { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -889,13 +867,10 @@ task SortVcf { vcfFiles: {description: "The VCF files to merge and sort.", category: "required"} outputVcfPath: {description: "The location the sorted VCF files should be written to.", category: "required"} dict: {description: "A sequence dictionary matching the VCF files.", category: "advanced"} - + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -904,8 +879,9 @@ task RenameSample { File inputVcf String outputPath = "./picard/renamed.vcf" String newSampleName - String memory = "9G" + String javaXmx = "8G" + String memory = "9G" Int timeMinutes = 1 + ceil(size(inputVcf, "G") * 2) String dockerImage = "quay.io/biocontainers/picard:2.19.0--0" } @@ -925,9 +901,9 @@ task RenameSample { } runtime { - docker: dockerImage - time_minutes: timeMinutes memory: memory + time_minutes: timeMinutes + docker: dockerImage } parameter_meta { @@ -935,8 +911,8 @@ task RenameSample { inputVcf: {description: "The VCF file to process.", category: "required"} outputPath: {description: "The location the output VCF file should be written.", category: "common"} newSampleName: {description: "A string to replace the old sample name.", category: "required"} - memory: {description: "The memory required to run the programs", category: "advanced"} - javaXmx: {description: "The max. memory allocated for JAVA", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The memory required to run the programs.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } diff --git a/rtg.wdl b/rtg.wdl index 104a5ef..bfd3295 100644 --- a/rtg.wdl +++ b/rtg.wdl @@ -22,13 +22,14 @@ version 1.0 task Format { input { + Array[File]+ inputFiles String format = "fasta" String outputPath = "seq_data.sdf" - Array[File]+ inputFiles - String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" + String rtgMem = "8G" String memory = "9G" Int timeMinutes = 1 + ceil(size(inputFiles) * 2) + String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" } command { @@ -44,21 +45,20 @@ task Format { } runtime { - docker: dockerImage memory: memory time_minutes: timeMinutes + docker: dockerImage } parameter_meta { - format: {description: "Format of input. Allowed values are [fasta, fastq, fastq-interleaved, sam-se, sam-pe].", - category: "advanced"} - outputPath: {description: "Where the output should be placed.", category: "advanced"} + # inputs inputFiles: {description: "input sequence files. May be specified 1 or more times.", category: "required"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + format: {description: "Format of input. Allowed values are [fasta, fastq, fastq-interleaved, sam-se, sam-pe].", category: "advanced"} + outputPath: {description: "Where the output should be placed.", category: "advanced"} + rtgMem: {description: "The amount of memory rtg will allocate to the JVM.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} - rtgMem: {description: "The amount of memory rtg will allocate to the JVM", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } @@ -68,18 +68,20 @@ task VcfEval { File baselineIndex File calls File callsIndex - File? evaluationRegions - File? bedRegions + Boolean squashPloidy = false + String outputMode = "split" String outputDir = "output/" File template Boolean allRecords = false Boolean decompose = false Boolean refOverlap = false + + File? evaluationRegions + File? bedRegions String? sample - Boolean squashPloidy = false - String outputMode = "split" - Int threads = 1 # tool default is number of cores in the system 😱 + String rtgMem = "8G" + Int threads = 1 # Tool default is number of cores in the system 😱. 
String memory = "9G" Int timeMinutes = 1 + ceil(size([baseline, calls], "G") * 5) String dockerImage = "quay.io/biocontainers/rtg-tools:3.10.1--0" @@ -132,39 +134,32 @@ task VcfEval { } runtime { - docker: dockerImage cpu: threads memory: memory time_minutes: timeMinutes + docker: dockerImage } parameter_meta { - baseline: {description: "VCF file containing baseline variants", category: "required"} - baselineIndex: {description: "The baseline's VCF index", category: "required"} - calls: {description: "VCF file containing called variants", category: "required"} - callsIndex: {description: "The call's VCF index", category: "required"} - outputDir: {description: "Directory for output", category: "advanced"} - bedRegions: {description: "if set, only read VCF records that overlap the ranges contained in the specified BED file", category: "advanced"} - evaluationRegions: {description: "if set, evaluate within regions contained in the supplied BED file, allowing transborder matches. To be used for truth-set high-confidence regions or other regions of interest where region boundary effects should be minimized", - category: "advanced"} - template: {description: "SDF of the reference genome the variants are called against", category: "required"} - allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\")", - category: "common"} - decompose: {description: "decompose complex variants into smaller constituents to allow partial credit", category: "common"} - refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap)", - category: "common"} - sample: {description: "the name of the sample to select. Use <baseline_sample>,<calls_sample> to select different sample names for baseline and calls. (Required when using multi-sample VCF files)", - category: "common"} - squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences", - category: "common"} - outputMode: {description: "output reporting mode. Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split)", - category: "advanced"} - threads: {description: "Number of threads. Default is 1", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} - rtgMem: {description: "The amount of memory rtg will allocate to the JVM", category: "advanced"} + # inputs + baseline: {description: "VCF file containing baseline variants.", category: "required"} + baselineIndex: {description: "The baseline's VCF index.", category: "required"} + calls: {description: "VCF file containing called variants.", category: "required"} + callsIndex: {description: "The call's VCF index.", category: "required"} + squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences.", category: "common"} + outputMode: {description: "output reporting mode. 
Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split).", category: "advanced"} + outputDir: {description: "Directory for output.", category: "advanced"} + template: {description: "SDF of the reference genome the variants are called against.", category: "required"} + allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\").", category: "common"} + decompose: {description: "decompose complex variants into smaller constituents to allow partial credit.", category: "common"} + refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap).", category: "common"} + sample: {description: "the name of the sample to select. Use <baseline_sample>,<calls_sample> to select different sample names for baseline and calls. (Required when using multi-sample VCF files).", category: "common"} + bedRegions: {description: "if set, only read VCF records that overlap the ranges contained in the specified BED file.", category: "advanced"} + evaluationRegions: {description: "if set, evaluate within regions contained in the supplied BED file, allowing transborder matches. To be used for truth-set high-confidence regions or other regions of interest where region boundary effects should be minimized.", category: "advanced"} + rtgMem: {description: "The amount of memory rtg will allocate to the JVM.", category: "advanced"} + threads: {description: "Number of threads. Default is 1.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} } } - diff --git a/sambamba.wdl b/sambamba.wdl index cd8da21..df5ab4d 100644 --- a/sambamba.wdl +++ b/sambamba.wdl @@ -20,29 +20,31 @@ version 1.0 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. - task Markdup { input { Array[File] inputBams String outputPath - # Sambamba scales like this: 1 thread is fully utilized (1). 2 threads 1.8 utilized. 3 -> 2.4, 4-> 2.7. - # 2 threads reduces wall clock time by more than 40%. - Int threads = 2 Int compressionLevel = 1 - Int? hashTableSize - Int? overFlowListSize - # sortBufferSize and ioBufferSize taken from markdup defaults as of sambamba 0.7.1 + # sortBufferSize and ioBufferSize taken from markdup defaults as of sambamba 0.7.1. Int sortBufferSize = 2048 Int ioBufferSize = 128 - Boolean removeDuplicates = false + Boolean removeDuplicates = false + Int? hashTableSize + Int? overFlowListSize + + # Sambamba scales like this: 1 thread is fully utilized (1). + # 2 threads 1.8 utilized. 3 -> 2.4, 4-> 2.7. + # 2 threads reduces wall clock time by more than 40%. + Int threads = 2 # According to the manual sambamba markdup uses the sortbufferSize + 2 times the ioBuffer size. # Added 1024 mb as a margin of safety. Real life use with this setting uses 2.7 GiB. Int memoryMb = 1024 + sortBufferSize + 2 * ioBufferSize - String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" # Time minute calculation does not work well for higher number of threads. 
Int timeMinutes = 1 + ceil(size(inputBams, "G") * 8) / threads + String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" } + String bamIndexPath = sub(outputPath, "\.bam$", ".bai") command { @@ -57,7 +59,7 @@ task Markdup { ~{"--sort-buffer-size " + sortBufferSize} \ ~{"--io-buffer-size " + ioBufferSize} \ ~{sep=' ' inputBams} ~{outputPath} - # sambamba creates an index for us + # sambamba creates an index for us. mv ~{outputPath}.bai ~{bamIndexPath} } @@ -67,8 +69,8 @@ task Markdup { } runtime { - memory: "~{memoryMb}M" cpu: threads + memory: "~{memoryMb}M" time_minutes: timeMinutes docker: dockerImage } @@ -78,17 +80,19 @@ task Markdup { inputBams: {description: "The input BAM files.", category: "required"} outputPath: {description: "Output directory path + output file.", category: "required"} compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} - memoryMb: {description: "The amount of memory available to the job in megabytes.", category: "advanced"} - removeDuplicates: {description: "Whether to remove the duplicates (instead of only marking them).", category: "advanced"} - hashTableSize: {description: "Sets sambamba's hash table size", category: "advanced"} - overFlowListSize: {description: "Sets sambamba's overflow list size", category: "advanced"} - sortBufferSize: {description: "The amount of mb allocated to the sort buffer", category: "advanced"} + sortBufferSize: {description: "The amount of mb allocated to the sort buffer.", category: "advanced"} ioBufferSize: {description: "The amount of mb allocated to each IO buffer. Sambamba uses two IO buffers.", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + removeDuplicates: {description: "Whether to remove the duplicates (instead of only marking them).", category: "advanced"} + hashTableSize: {description: "Sets sambamba's hash table size.", category: "advanced"} + overFlowListSize: {description: "Sets sambamba's overflow list size.", category: "advanced"} threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memoryMb: {description: "The amount of memory available to the job in megabytes.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + # outputs outputBam: {description: "Sorted BAM file."} + outputBamIndex: {description: "Sorted BAM file index."} } } @@ -98,14 +102,15 @@ task Sort { String outputPath = basename(inputBam, "\.bam") + ".sorted.bam" Boolean sortByName = false Int compressionLevel = 1 - Int threads = 1 + Int memoryPerThreadGb = 4 + Int threads = 1 Int memoryGb = 1 + threads * memoryPerThreadGb - String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" Int timeMinutes = 1 + ceil(size(inputBam, "G") * 3) + String dockerImage = "quay.io/biocontainers/sambamba:0.7.1--h148d290_2" } - # Select first needed as outputPath is optional input. (bug in cromwell) + # Select first needed as outputPath is optional input (bug in cromwell). 
String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai") command { @@ -118,7 +123,7 @@ task Sort { -m ~{memoryPerThreadGb}G \ -o ~{outputPath} \ ~{inputBam} - # sambamba creates an index for us + # sambamba creates an index for us. mv ~{outputPath}.bai ~{bamIndexPath} } @@ -140,12 +145,14 @@ task Sort { outputPath: {description: "Output directory path + output file.", category: "required"} sortByName: {description: "Sort the inputBam by read name instead of position.", category: "advanced"} compressionLevel: {description: "Compression level from 0 (uncompressed) to 9 (best).", category: "advanced"} - memoryGb: {description: "The amount of memory available to the job in gigabytes.", category: "advanced"} - memoryPerThreadGb: {description: "The amount of memory used per sort thread in gigabytes", category: "advanced"} - dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + memoryPerThreadGb: {description: "The amount of memory used per sort thread in gigabytes.", category: "advanced"} threads: {description: "The number of threads that will be used for this task.", category: "advanced"} + memoryGb: {description: "The amount of memory available to the job in gigabytes.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + # outputs outputBam: {description: "Sorted BAM file."} + outputBamIndex: {description: "Sorted BAM file index."} } -} \ No newline at end of file +} -- GitLab