diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3b4ec9ac40da235c2b2e4e6a682442a6dd5d4832..372071eedba04dd24461a81b4806096474a348ad 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,3 +2,4 @@ - [ ] Pull request details were added to CHANGELOG.md. - [ ] Documentation was updated (if required). - [ ] `parameter_meta` was added/updated (if required). +- [ ] Submodule branches are on develop or a tagged commit. diff --git a/.github/lint-environment.yml b/.github/lint-environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..63b538fce9790456e089e63ddda41859a8ba9f06 --- /dev/null +++ b/.github/lint-environment.yml @@ -0,0 +1,9 @@ +name: biowdl-lint +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - cromwell + - wdl-aid + - miniwdl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 785661119a9f3d1ca173a9c92a64a52bbf930328..0000000000000000000000000000000000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Continuous integration - -on: - pull_request: - paths_ignore: - - "docs/**" - -defaults: - run: - # This is needed for miniconda, see: - # https://github.com/marketplace/actions/setup-miniconda#important - shell: bash -l {0} - -jobs: - lint: - runs-on: ubuntu-latest - name: Womtool validate and submodule up to date. - steps: - - uses: actions/checkout@v2.3.4 - with: - submodules: recursive - - name: install miniconda - uses: conda-incubator/setup-miniconda@v2.0.1 - with: - channels: conda-forge,bioconda,defaults - # Conda-incubator uses 'test' environment by default. - - name: install requirements - run: conda install -n test cromwell miniwdl wdl-aid - - name: run linting - run: bash scripts/biowdl_lint.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000000000000000000000000000000..7ef19e58f4bc9c209c3cfffa3ea51359c9afe069 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,93 @@ +name: Linting + +on: + pull_request: + paths_ignore: + - "docs/**" + +defaults: + run: + # This is needed for miniconda, see: + # https://github.com/marketplace/actions/setup-miniconda#important + shell: bash -l {0} + +jobs: + lint: + runs-on: ubuntu-latest + name: Linting checks + steps: + - uses: actions/checkout@v2.3.4 + with: + submodules: recursive + + - name: Set cache date + run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV + + - name: Cache conda environment + # Use an always upload cache to prevent solving conda environment again and again on failing linting. + uses: pat-s/always-upload-cache@v2.1.5 + env: + # Increase this value to manually invalidate the cache + CACHE_NUMBER: 0 + with: + path: /usr/share/miniconda/envs/biowdl-lint + key: + ${{runner.os}}-biowdl-lint-${{ env.CACHE_NUMBER }}-${{env.DATE}}-${{ hashFiles('.github/lint-environment.yml') }} + id: env_cache + + # Use the builtin conda. This is the fastest installation. It may not be + # the fastest for resolving, but the package cache mitigates that problem. + # Since this installs fastest, it is fastest for all runs where a cache + # hit occurs. + - name: install miniconda + uses: conda-incubator/setup-miniconda@v2.1.1 + with: + channels: conda-forge,bioconda,defaults + channel-priority: strict + auto-activate-base: false + use-only-tar-bz2: true # Needed for proper caching according to the documentation. + # activate-environment is broken! This always seems to create a new environment. 
+ # Activation is therefore done separately. + + - name: Create test environment if no cache is present + run: conda env create -n biowdl-lint -f .github/lint-environment.yml + if: steps.env_cache.outputs.cache-hit != 'true' + + - name: Activate test environment + # The new PATH should be passed to the environment, otherwise it won't register. + run: | + conda activate biowdl-lint + echo "PATH=$PATH" >> $GITHUB_ENV + + - name: Fetch develop branch for comparisons + run: git fetch --depth=1 origin develop + + - name: run womtool validate + # Only check files that have changed from the base reference. + # Womtool validate is very slow, so this saves a lot of time. + run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + womtool validate $WDL_FILE + done - name: run miniwdl check + run: | + set -x + bash -c 'miniwdl check $(git ls-files *.wdl)' + - name: Check copyright headers + run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + grep Copyright $WDL_FILE || bash -c "echo No copyright header in $WDL_FILE && exit 1" + done - name: Check parameter_meta for inputs + run: | + set -x + for WDL_FILE in $(git diff --name-only origin/${{github.base_ref}} | grep -E '\.wdl$'); do + wdl-aid --strict $WDL_FILE > /dev/null 2> wdl-aid_stderr || + if grep -z 'ValueError: Missing parameter_meta for inputs:' wdl-aid_stderr + then + exit 1 + fi + done diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c0db94722449604660b6d39e98957fd6380da93..986582ddcd7fef050f497a352b5af3f581f8a13e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,14 @@ that users understand how the changes affect the new version. --> version 5.1.0-dev --------------------------- ++ Added a task to add SVTYPE annotations to GRIDSS results + (`AnnotateSvTypes`). ++ The GRIDSS task will now run tabix separately if GRIDSS doesn't + produce a vcf index. ++ Added a script to strip the UMI from the read name and add it as + a BAM tag on each BAM record. The script is in umi.BamReadNameToUmiTag. ++ Added fgbio.AnnotateBamWithUmis. ++ Added picard.UmiAwareMarkDuplicatesWithMateCigar. + Added a task for SnpEff. + Adjusted runtime settings for sambamba Markdup. + Added a task for sambamba Flagstat. @@ -28,7 +36,7 @@ version 5.1.0-dev + Sage + VirusInterpreter + Added a task for VirusBreakend. -+ Added a task for GridssAnnotateVcfRepeatmasker. ++ Added a task for GridssAnnotateVcfRepeatmasker. + Bumped GRIDSS version to 2.12.2. + Adjusted GRIDSS runtime settings. + Added optional inputs to GRIDSS: @@ -147,7 +155,7 @@ version 4.0.0 + Picard MergeVcf now uses compression level 1 by default. + bwa mem, bwa mem+kit and hisat2 have their samtools sort threads tweaked. The number of threads is now related to the number of threads on the aligner. - Using more threads reduces the chance of the samtools sort pipe getting + Using more threads reduces the chance of the samtools sort pipe getting blocked if it's full. + Renamed a few inputs in centrifuge.wdl, isoseq3.wdl, talon.wdl, transcriptclean.wdl to be more descriptive.
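For context on the first two new changelog entries: the AnnotateSvTypes task added to gridss.wdl below is meant to run on the VCF produced by the GRIDSS task and writes an SVTYPE-annotated copy. A minimal sketch of how a calling workflow might use it is shown here; the workflow name and the idea of passing the GRIDSS VCF in as plain workflow inputs are illustrative only, while the task name, its inputs and its outputs come from this change set.

version 1.0

import "gridss.wdl" as gridss

# Hypothetical example workflow; not part of this change set.
workflow AnnotateGridssSvTypesExample {
    input {
        File gridssVcf       # VCF produced by the GRIDSS task
        File gridssVcfIndex  # matching tabix index
    }

    call gridss.AnnotateSvTypes as annotateSvTypes {
        input:
            gridssVcf = gridssVcf,
            gridssVcfIndex = gridssVcfIndex,
            outputPath = "./gridss.svtyped.vcf.bgz"
    }

    output {
        File svTypedVcf = annotateSvTypes.vcf
        File? svTypedVcfIndex = annotateSvTypes.vcfIndex
    }
}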
diff --git a/bcftools.wdl b/bcftools.wdl index 88d97cd09465fee1384edc6b498504bbd2adf917..2bf1c7323dad2b7c45ba1b91380e4f2396d80730 100644 --- a/bcftools.wdl +++ b/bcftools.wdl @@ -186,8 +186,8 @@ task Sort { String outputPath = "output.vcf.gz" String tmpDir = "./sorting-tmp" - String memory = "256M" - Int timeMinutes = 1 + ceil(size(inputFile, "G")) + String memory = "5G" + Int timeMinutes = 1 + ceil(size(inputFile, "G")) * 5 String dockerImage = "quay.io/biocontainers/bcftools:1.10.2--h4f4756c_2" } diff --git a/bwa.wdl b/bwa.wdl index 1cb170b70b5dc19321ea6094a31aa53418e6aab8..373de6280552d55006d1468af5c67915fd9ca787 100644 --- a/bwa.wdl +++ b/bwa.wdl @@ -94,6 +94,7 @@ task Mem { outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"} sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"} usePostalt: {description: "Whether to use the postalt script from bwa kit."} + useSoftclippingForSupplementary: {description: "Use soft-clipping for supplementary alignments instead of hard-clipping", category: "common"} sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"} compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} readgroup: {description: "A readgroup identifier.", category: "common"} diff --git a/fgbio.wdl b/fgbio.wdl new file mode 100644 index 0000000000000000000000000000000000000000..d50906d33ef5cfd62ad004561039379c074d15ab --- /dev/null +++ b/fgbio.wdl @@ -0,0 +1,68 @@ +version 1.0 + +# Copyright (c) 2017 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +task AnnotateBamWithUmis { + input { + File inputBam + File inputUmi + String outputPath + + String memory = "120G" + Int timeMinutes = 360 + String javaXmx="100G" + String dockerImage = "quay.io/biocontainers/fgbio:1.4.0--hdfd78af_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" + fgbio -Xmx~{javaXmx} \ + AnnotateBamWithUmis \ + -i ~{inputBam} \ + -f ~{inputUmi} \ + -o ~{outputPath} + } + + output { + File outputBam = outputPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "The input BAM file.", category: "required"} + inputUmi: {description: "The input fastq file with UMIs.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "required"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "UMI-annotated output BAM file."} + } +} diff --git a/gridss.wdl b/gridss.wdl index d3d251a5b2a811d38db2776784e2a41a60f1578c..35e41d21d6cc067384c5f73e99544ffb982d1574 100644 --- a/gridss.wdl +++ b/gridss.wdl @@ -79,6 +79,74 @@ task AnnotateInsertedSequence { } } +task AnnotateSvTypes { + input { + File gridssVcf + File gridssVcfIndex + String outputPath = "./gridss.svtyped.vcf.bgz" + + String memory = "32G" + String dockerImage = "quay.io/biocontainers/bioconductor-structuralvariantannotation:1.10.0--r41hdfd78af_0" + Int timeMinutes = 240 + } + + String effectiveOutputPath = sub(outputPath, "\\.bgz", "") + String index = if effectiveOutputPath != outputPath then "T" else "F" + + + # Based on https://github.com/PapenfussLab/gridss/issues/74 + command <<< + set -e + mkdir -p "$(dirname ~{outputPath})" + R --vanilla << "EOF" + library(VariantAnnotation) + library(StructuralVariantAnnotation) + + vcf_path <- "~{gridssVcf}" + out_path <- "~{effectiveOutputPath}" + + # Simple SV type classifier + simpleEventType <- function(gr) { + return(ifelse(seqnames(gr) != seqnames(partner(gr)), "BND", # inter-chromosomosal + ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS", + ifelse(strand(gr) == strand(partner(gr)), "INV", + ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"), "DEL", + "DUP"))))) + } + + header <- scanVcfHeader(vcf_path) + vcf <- readVcf(vcf_path, seqinfo(header)) + gr <- breakpointRanges(vcf) + svtype <- simpleEventType(gr) + info(vcf[gr$sourceId])$SVTYPE <- svtype + # GRIDSS doesn't supply a GT, so we estimate GT based on AF (assuming CN of 2, might be inaccurate) + geno(vcf)$GT <- ifelse(geno(vcf)$AF > 0.75, "1/1", ifelse(geno(vcf)$AF < 0.25, "0/0", "0/1")) + writeVcf(vcf, out_path, index=~{index}) + EOF + >>> + + output { + File vcf = outputPath + File? 
vcfIndex = outputPath + ".tbi" + } + + runtime { + memory: memory + time_minutes: timeMinutes # !UnknownRuntimeKey + docker: dockerImage + } + + parameter_meta { + gridssVcf: {description: "The VCF produced by GRIDSS.", category: "required"} + gridssVcfIndex: {description: "The index for the VCF produced by GRIDSS.", category: "required"} + outputPath: {description: "The path the output should be written to.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + task GRIDSS { input { File tumorBam @@ -116,6 +184,12 @@ task GRIDSS { ~{normalBam} \ ~{tumorBam} samtools index ~{outputPrefix}_assembly.bam ~{outputPrefix}_assembly.bai + + # For some reason the VCF index is sometimes missing + if [ ! -e ~{outputPrefix}.vcf.gz.tbi ] + then + tabix ~{outputPrefix}.vcf.gz + fi } output { diff --git a/macs2.wdl b/macs2.wdl index eb71ac1dd8ba5fe7e165068d97ecfb870d1befa6..2afe3bbee058473155d9b472145bd1fb7306f6a5 100644 --- a/macs2.wdl +++ b/macs2.wdl @@ -28,6 +28,7 @@ task PeakCalling { Array[File] controlBamsIndex String outDir = "macs2" String sampleName + String format = "AUTO" Boolean nomodel = false Int timeMinutes = 600 # Default to 10 hours String memory = "8G" @@ -41,6 +42,7 @@ task PeakCalling { ~{true="--control" false="" length(controlBams) > 0} ~{sep = ' ' controlBams} \ --outdir ~{outDir} \ --name ~{sampleName} \ + -f ~{format} \ ~{true='--nomodel' false='' nomodel} } @@ -65,6 +67,6 @@ task PeakCalling { memory: {description: "The amount of memory this job will use.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} - + format: {description: "Which format to use. Use BAMPE for paired-end reads.", category: "common"} } } diff --git a/picard.wdl b/picard.wdl index 9a935045bd9804ce37f7773b5f8db480fa3563b2..3d835829f4c54c3a20ee162cf8d115cea3d0b3ac 100644 --- a/picard.wdl +++ b/picard.wdl @@ -29,7 +29,7 @@ task BedToIntervalList { String javaXmx = "3G" String memory = "4G" Int timeMinutes = 5 - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -89,7 +89,7 @@ task CollectHsMetrics { # Additional * 2 because picard multiple metrics reads the # reference fasta twice. Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -158,7 +158,7 @@ task CollectMultipleMetrics { Int memoryMb = javaXmxMb + 512 # Additional * 2 because picard multiple metrics reads the reference fasta twice. 
Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -284,7 +284,7 @@ task CollectRnaSeqMetrics { String memory = "9G" # With 6 minutes per G there were several timeouts. Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -342,7 +342,7 @@ task CollectTargetedPcrMetrics { String javaXmx = "3G" String memory = "4G" Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -404,7 +404,7 @@ task CollectVariantCallingMetrics { String javaXmx = "8G" String memory = "9G" Int timeMinutes = 1440 - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -517,7 +517,7 @@ task CreateSequenceDictionary { String javaXmx = "2G" String memory = "3G" - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -561,13 +561,15 @@ task GatherBamFiles { String outputBamPath Boolean createMd5File = false - Int? compressionLevel + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater Int javaXmxMb = 1024 Int memoryMb = javaXmxMb + 512 # One minute per input gigabyte. Int timeMinutes = 1 + ceil(size(inputBams, "G") * 1) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -577,7 +579,9 @@ task GatherBamFiles { GatherBamFiles \ INPUT=~{sep=' INPUT=' inputBams} \ OUTPUT=~{outputBamPath} \ - ~{"COMPRESSION_LEVEL=" + compressionLevel} \ + COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ CREATE_INDEX=true \ CREATE_MD5_FILE=~{true="true" false="false" createMd5File} } @@ -600,7 +604,9 @@ task GatherBamFiles { inputBamsIndex: {description: "The indexes of the input BAM files.", category: "required"} outputBamPath: {description: "The path where the merged BAM file will be written.", caregory: "required"} createMd5File: {decription: "Whether to create an md5 file of the output BAM.", category: "advanced"} - compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} + useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} javaXmxMb: {description: "The maximum memory available to the program in megabytes. 
Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} @@ -619,10 +625,14 @@ task GatherVcfs { Array[File]+ inputVcfIndexes String outputVcfPath = "out.vcf.gz" + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater + String javaXmx = "4G" String memory = "5G" Int timeMinutes = 1 + ceil(size(inputVcfs, "G") * 2) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -630,6 +640,10 @@ task GatherVcfs { mkdir -p "$(dirname ~{outputVcfPath})" picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ GatherVcfs \ + COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ + CREATE_INDEX=true \ INPUT=~{sep=' INPUT=' inputVcfs} \ OUTPUT=~{outputVcfPath} } @@ -654,6 +668,10 @@ task GatherVcfs { timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} + useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} + # outputs outputVcf: {description: "Multiple VCF files gathered into one file."} } @@ -665,14 +683,11 @@ task MarkDuplicates { Array[File]+ inputBams String outputBamPath String metricsPath - Int compressionLevel = 1 Boolean createMd5File = false - Boolean useJdkInflater = true # Slightly faster than the intel one. - # Better results for compression level 1 (much smaller). - # Higher compression levels similar to intel deflater. - # NOTE: this might change in the future when the intel - # deflater is updated! - Boolean useJdkDeflater = true + + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater # The program default for READ_NAME_REGEX is appropriate in nearly every case. # Sometimes we wish to supply "null" in order to turn off optical duplicate detection. 
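For illustration, turning optical duplicate detection off from a calling workflow would look roughly like the sketch below. The workflow name, call alias and file paths are hypothetical; read_name_regex is the task input that is forwarded to READ_NAME_REGEX, and passing the literal string "null" disables optical duplicate detection as described above.

version 1.0

import "picard.wdl" as picard

# Hypothetical wrapper; workflow outputs omitted in this sketch.
workflow MarkDuplicatesNoOpticalExample {
    input {
        File sortedBam
    }

    call picard.MarkDuplicates as markdup {
        input:
            inputBams = [sortedBam],
            outputBamPath = "markdup.bam",
            metricsPath = "markdup.metrics",
            read_name_regex = "null"
    }
}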
@@ -686,7 +701,7 @@ task MarkDuplicates { String memoryMb = javaXmxMb + 512 Int timeMinutes = 1 + ceil(size(inputBams, "G") * 8) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } # Task is assuming query-sorted input so that the Secondary and Supplementary reads get @@ -702,6 +717,8 @@ task MarkDuplicates { OUTPUT=~{outputBamPath} \ METRICS_FILE=~{metricsPath} \ COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ VALIDATION_STRINGENCY=SILENT \ ~{"READ_NAME_REGEX=" + read_name_regex} \ OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ @@ -709,8 +726,6 @@ CREATE_INDEX=true \ ADD_PG_TAG_TO_READS=false \ - CREATE_MD5_FILE=~{true="true" false="false" createMd5File} \ - USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ - USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} + CREATE_MD5_FILE=~{true="true" false="false" createMd5File} } output { @@ -732,9 +747,9 @@ outputBamPath: {description: "The location where the ouptut BAM file should be written.", category: "required"} metricsPath: {description: "The location where the output metrics file should be written.", category: "required"} compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} - createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"} useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} + createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"} read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"} javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"} memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} @@ -756,16 +771,20 @@ Array[File]+ inputVCFsIndexes String outputVcfPath Int compressionLevel = 1 - Boolean useJdkInflater = true # Slightly faster than the intel one. + Boolean useJdkInflater = false # Better results for compression level 1 (much smaller). # Higher compression levels similar to intel deflater. # NOTE: this might change in the future when the intel deflater is updated! - Boolean useJdkDeflater = true + # Second NOTE: No it did not change. Only the fastest algorithm with + # worse compression is wrapped in the intel GKL. Instead of using + # one of the slightly slower but better compressing alternatives from ISA-L. + # (Which are also faster than zlib.) + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater String javaXmx = "4G" String memory = "5G" Int timeMinutes = 1 + ceil(size(inputVCFs, "G")) * 2 - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } # Using MergeVcfs instead of GatherVcfs so we can create indices. @@ -821,7 +840,7 @@ task SamToFastq { String javaXmx = "16G" # High memory default to avoid crashes.
String memory = "17G" Int timeMinutes = 30 - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" File? noneFile } @@ -882,7 +901,7 @@ task ScatterIntervalList { String javaXmx = "3G" String memory = "4G" - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -917,13 +936,15 @@ task SortSam { Boolean createMd5File = false Int maxRecordsInRam = 500000 Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater # Default ram of 4 GB. Using 125001.0 to prevent an answer of # 4.000000001 which gets rounded to 5. # GATK Best practices uses 75000 here: https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L778 Int XmxGb = ceil(maxRecordsInRam / 125001.0) Int timeMinutes = 1 + ceil(size(inputBam, "G") * 3) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -936,6 +957,8 @@ task SortSam { SORT_ORDER=~{true="queryname" false="coordinate" sortByName} \ CREATE_INDEX=true \ COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ VALIDATION_STRINGENCY=SILENT \ CREATE_MD5_FILE=~{true="true" false="false" createMd5File} @@ -960,7 +983,9 @@ task SortSam { sortByName: {description: "Sort the output file by name, default is position.", category: "advanced"} createMd5File: {description: "Whether to create an MD5 digest for any BAM or FASTQ files created.", category: "advanced"} maxRecordsInRam: {description: "This will specify the number of records stored in RAM before spilling to disk.", category: "advanced"} - compressionLevel: {description: "Compression level for all compressed files created.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} + useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} XmxGb: {description: "The maximum memory available to picard SortSam. Should be lower than `memory` to accommodate JVM overhead and BWA mem's memory usage.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} @@ -981,7 +1006,7 @@ task SortVcf { String javaXmx = "8G" String memory = "9G" Int timeMinutes = 1 + ceil(size(vcfFiles, "G") * 5) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } @@ -1031,7 +1056,7 @@ task RenameSample { String javaXmx = "8G" String memory = "9G" Int timeMinutes = 1 + ceil(size(inputVcf, "G") * 2) - String dockerImage = "quay.io/biocontainers/picard:2.23.8--0" + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" } command { @@ -1068,3 +1093,78 @@ task RenameSample { renamedVcf: {description: "New VCF with renamed sample."} } } + +task UmiAwareMarkDuplicatesWithMateCigar { + input { + Array[File] inputBams + String outputPath + String outputPathMetrics = outputPath + ".metrics" + String outputPathUmiMetrics = outputPath + ".umi-metrics" + Int maxRecordsInRam = 1500000 # Default is 500_000 but that will lead to very small files on disk. + String? assumeSortOrder + String tempdir = "temp" + Boolean removeDuplicates = true + String umiTagName = "RX" + Int compressionLevel = 1 + Boolean useJdkInflater = false + Boolean useJdkDeflater = true # Achieves much better compression rates than the intel deflater + String javaXmx = "8G" + String memory = "9G" + Int timeMinutes = 360 + String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPath})" ~{tempdir} + picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \ + UmiAwareMarkDuplicatesWithMateCigar \ + INPUT=~{sep=' INPUT=' inputBams} \ + O=~{outputPath} \ + M=~{outputPathMetrics} \ + UMI_TAG_NAME=~{umiTagName} \ + UMI_METRICS_FILE=~{outputPathUmiMetrics} \ + TMP_DIR=~{tempdir} \ + REMOVE_DUPLICATES=~{removeDuplicates} \ + MAX_RECORDS_IN_RAM=~{maxRecordsInRam} \ + CREATE_INDEX=true \ + COMPRESSION_LEVEL=~{compressionLevel} \ + USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \ + USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \ + ~{"ASSUME_SORT_ORDER=" + assumeSortOrder} + } + + output { + File outputBam = outputPath + File outputBamIndex = sub(outputPath, "\.bam$", ".bai") + File outputMetrics = outputPathMetrics + File outputUmiMetrics = outputPathUmiMetrics + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBams: {description: "The BAM files for which the duplicate reads should be marked.", category: "required"} + outputPath: {description: "The location the output BAM file should be written to.", category: "required"} + outputPathMetrics: {description: "The location the output metrics file should be written to.", category: "required"} + outputPathUmiMetrics: {description: "The location the output UMI metrics file should be written to.", category: "required"} + removeDuplicates: {description: "Whether the duplicate reads should be removed instead of marked.", category: "common"} + umiTagName: {description: "Which tag in the BAM file holds the UMI.", category: "common"} + assumeSortOrder: {description: "Assume a certain sort order even though the header might say otherwise.", category: "common"} + tempdir: {description: "Temporary directory.", category: "advanced"} + compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"} + maxRecordsInRam: {description: "This will specify the number of records 
stored in RAM before spilling to disk.", category: "advanced"} + useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"} + useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + } +} \ No newline at end of file diff --git a/star.wdl b/star.wdl index aa1fd6084096a631758561df393498fa87a485a1..6a123c86dcad02c36a4891884b5674598e006db6 100644 --- a/star.wdl +++ b/star.wdl @@ -78,7 +78,7 @@ task GenomeGenerate { parameter_meta { # inputs - genomeDir: {description:"The directory the STAR index should be written to.", categroy: "common"} + genomeDir: {description:"The directory the STAR index should be written to.", category: "common"} referenceFasta: {description: "The reference Fasta file.", category: "required"} referenceGtf: {description: "The reference GTF file.", category: "common"} sjdbOverhang: {description: "Equivalent to STAR's `--sjdbOverhang` option.", category: "advanced"} diff --git a/umi.wdl b/umi.wdl new file mode 100644 index 0000000000000000000000000000000000000000..0dc5c55e786014e7a59af4d1bfc91fdd225ef807 --- /dev/null +++ b/umi.wdl @@ -0,0 +1,105 @@ +version 1.0 + +# Copyright (c) 2022 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task BamReadNameToUmiTag { + + # This task processes a bam file with reads that have been extracted with + # umi-tools extract. 
The UMI is extracted from the read name again and put + # in the bam file again with umiTag (default RX) + input { + File inputBam + String outputPath = "output.bam" + String umiTag = "RX" + + String memory = "2G" + Int timeMinutes = 1 + ceil(size([inputBam], "G") * 10) + String dockerImage = "quay.io/biocontainers/pysam:0.17.0--py39h051187c_0" + } + + String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai") + + command <<< + python <<CODE + import pysam + import sys + import os + + from typing import Tuple + + def split_umi_from_name(name) -> Tuple[str, str]: + id_and_rest = name.split(maxsplit=1) + id = id_and_rest[0] + # If there was no whitespace id_and_rest will have length 1 + other_parts = id_and_rest[1] if len(id_and_rest) == 2 else "" + underscore_index = id.rfind("_") + umi = id[underscore_index + 1:] + new_id = id[:underscore_index] + if other_parts: + return " ".join([new_id, other_parts]), umi + return new_id, umi + + def annotate_umis(in_file, out_file, bam_tag="RX"): + in_bam = pysam.AlignmentFile(in_file, "rb") + os.makedirs(os.path.dirname(out_file), exist_ok=True) + out_bam = pysam.AlignmentFile(out_file, "wb", template=in_bam) + # Encode bam_tag as bytes. Otherwise pysam converts it to bytes anyway. + encoded_bam_tag = bam_tag.encode('ascii') + for segment in in_bam: # type: pysam.AlignedSegment + new_name, umi = split_umi_from_name(segment.query_name) + segment.query_name = new_name + # Encode umi as ascii. Otherwise pysam encodes it to bytes anyway. + # Value type has to be a string though, otherwise pysam crashes. + segment.set_tag(encoded_bam_tag, umi.encode('ascii'), value_type="Z") + out_bam.write(segment) + + if __name__ == "__main__": + annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}") + pysam.index("~{outputPath}", "~{bamIndexPath}", b=True) + CODE + >>> + + output { + File outputBam = outputPath + File outputBamIndex = bamIndexPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + inputBam: {description: "The input SAM file.", category: "required"} + outputPath: {description: "Output directory path + output file.", category: "common"} + umiTag: {description: "The tag used for UMIs in the output BAM file.", category: "common"} + + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "Sorted BAM file."} + outputBamIndex: {description: "Sorted BAM file index."} + } +}
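To tie the UMI-related additions together: the task above moves the UMI from the read name into a BAM tag (RX by default), which the new picard.UmiAwareMarkDuplicatesWithMateCigar task can then use for deduplication. A minimal sketch of a calling workflow is given below; the workflow name, call aliases and output paths are illustrative only, while the task names, inputs and outputs are taken from this change set.

version 1.0

import "picard.wdl" as picard
import "umi.wdl" as umi

# Hypothetical example workflow; not part of this change set.
workflow UmiDeduplicationExample {
    input {
        File bam  # coordinate-sorted BAM whose read names carry the UMI (umi-tools extract style)
    }

    # Move the UMI from the read name into the RX tag.
    call umi.BamReadNameToUmiTag as tagUmis {
        input:
            inputBam = bam,
            outputPath = "umi-tagged.bam"
    }

    # Mark duplicates using the RX tag written by the previous task.
    call picard.UmiAwareMarkDuplicatesWithMateCigar as dedup {
        input:
            inputBams = [tagUmis.outputBam],
            outputPath = "deduplicated.bam"
    }

    output {
        File deduplicatedBam = dedup.outputBam
        File deduplicatedBamIndex = dedup.outputBamIndex
        File duplicationMetrics = dedup.outputMetrics
        File umiMetrics = dedup.outputUmiMetrics
    }
}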