Merge pull request #234 from biowdl/BIOWDL-488

Add bwa-mem2 task.

Merge pull request #234 from biowdl/BIOWDL-488
cce6b32f · Ruben Vorderman · GitHub · 2fbdf6aa · 48351b26 · cce6b32f
Unverified Commit cce6b32f authored 4 years ago by Ruben Vorderman Committed by GitHub 4 years ago
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,8 +9,12 @@ This document is user facing. Please word the changes in such a way
 that users understand how the changes affect the new version.
 -->

-version 4.1.0-dev
+version 5.0.0-dev
 ---------------------------
+ A bwa-mem2 task was created with the same interface (including usePostalt) 
+  as the bwa mem task.
+ bwa mem and bwa kit are now one task. The usePostalt boolean can be used to
+  switch the postalt script on and off.
 + Added a task for GRIDSS.
 + Add wdl file for pacbio's bam2fastx tool.


--- a/bwa-mem2.wdl
+++ b/bwa-mem2.wdl
+version 1.0
+
+# Copyright (c) 2017 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+task Mem {
+    # Aligns reads with bwa-mem2, optionally pipes the alignments through
+    # bwa-postalt.js (usePostalt), and sorts them into a BAM file with samtools sort.
+    input {
+        File read1
+        File? read2
+        BwaIndex bwaIndex
+        String outputPrefix
+        String? readgroup
+        Boolean sixtyFour = false
+        Boolean usePostalt = false
+        Int threads = 4
+        Int? sortThreads
+        Int sortMemoryPerThreadGb = 2
+        Int compressionLevel = 1
+        Int? memoryGb
+        # The default time limit grows with the size of the reads and shrinks with the thread count.
+        Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 220 / threads)
+        # Contains bwa-mem2 2.0 bwakit 0.7.17.dev1 and samtools 1.10
+        String dockerImage = "quay.io/biocontainers/mulled-v2-6a15c99309c82b345497d24489bee67bbb76c2f6:1c9c3227b9bf825a8dc9726a25701aa23c0b1f12-0"
+    }
+
+    # Samtools sort may block the pipe while it is writing data to disk.
+    # This can lead to CPU underutilization.
+    # 1 sort thread if threads is 1, 2 sort threads for 2-4 threads, 3 sort threads for 5-8 threads.
+    Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0)
+    # An explicitly supplied sortThreads always wins over the estimate.
+    Int totalSortThreads = select_first([sortThreads, estimatedSortThreads])
+    # bwa-mem2's index files contain 2 BWT indexes of which only one is used. .2bit64 is used by default and
+    # .8bit32 is used for avx2.
+    # The larger one of these is the 8bit32 index. Since we do not know beforehand which one is used,
+    # we need to accommodate the larger one.
+    # Using only the 8bit32 index uses 57.5% of the index files. Since bwa-mem2 uses slightly more memory than
+    # the index, we put it at 62% as a safety factor. That means the memory usage for bwa-mem2 will be 53G for a
+    # human genome, resulting in 60G total on 8 cores with samtools using 3 sort threads.
+    Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 0.62) + sortMemoryPerThreadGb * totalSortThreads
+
+    # The bwa postalt script is commented out when usePostalt = false: the placeholder in front of
+    # bwa-postalt.js renders as "#", turning that pipeline stage into a shell comment.
+    # This hack was tested with bash, dash and ash. It seems that comments in between pipes work for all of them.
+    # NOTE(review): only `set -e` is used, so a failure of bwa-mem2 itself does not fail the pipeline;
+    # `-o pipefail` may not be available in every shell mentioned above - confirm before adding it.
+    command {
+        set -e
+        mkdir -p "$(dirname ~{outputPrefix})"
+        bwa-mem2 mem \
+          -t ~{threads} \
+          ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \
+          ~{bwaIndex.fastaFile} \
+          ~{read1} \
+          ~{read2} \
+          2> ~{outputPrefix}.log.bwamem | \
+          ~{true="" false="#" usePostalt} bwa-postalt.js -p ~{outputPrefix}.hla ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \
+          samtools sort \
+          ~{"-@ " + totalSortThreads} \
+          -m ~{sortMemoryPerThreadGb}G \
+          -l ~{compressionLevel} \
+          - \
+          -o ~{outputPrefix}.aln.bam
+    }
+
+    output {
+        File outputBam = outputPrefix + ".aln.bam"
+        # Declared optional because this path is only handed to bwa-postalt.js (-p),
+        # and that stage is skipped when usePostalt is false.
+        # NOTE(review): verify that bwa-postalt.js writes a file at exactly this path.
+        File? outputHla = outputPrefix + ".hla"
+    }
+
+    runtime {
+        # One extra thread for bwa-postalt + samtools is not needed.
+        # These only use 5-10% of compute power and not always simultaneously.
+        cpu: threads
+        # A user-supplied memoryGb overrides the estimate computed above.
+        memory: "~{select_first([memoryGb, estimatedMemoryGb])}G"
+        time_minutes: timeMinutes
+        docker: dockerImage
+    }
+
+    parameter_meta {
+        # inputs
+        read1: {description: "The first-end fastq file.", category: "required"}
+        read2: {description: "The second-end fastq file.", category: "common"}
+        bwaIndex: {description: "The BWA index, including (optionally) a .alt file.", category: "required"}
+        usePostalt: {description: "Whether to use the postalt script from bwa kit.", category: "common"}
+        outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"}
+        readgroup: {description: "A readgroup identifier.", category: "common"}
+        sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"}
+        threads: {description: "The number of threads to use for alignment.", category: "advanced"}
+        memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"}
+        sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"}
+        sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"}
+        compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"}
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+
+        # outputs
+        outputBam: "The produced BAM file."
+        outputHla: "The file at the '.hla' path passed to the postalt script, if it was produced."
+    }
+}
+
+struct BwaIndex {
+    # Reference FASTA. The Mem task passes this path to the aligner and appends
+    # ".alt" / ".64.alt" to it to locate the ALT file for the postalt script.
+    File fastaFile
+    # The index files belonging to fastaFile. The Mem task uses their total size
+    # for its memory estimate.
+    # NOTE(review): presumably these must be localized next to fastaFile so the
+    # aligner can find them - confirm.
+    Array[File] indexFiles
+}
--- a/bwa.wdl
+++ b/bwa.wdl
@@ -21,78 +21,6 @@ version 1.0
 # SOFTWARE.

 task Mem {
-    input {
-        File read1
-        File? read2
-        BwaIndex bwaIndex
-        String outputPath
-        String? readgroup
-
-        Int threads = 4
-        Int? sortThreads
-        Int sortMemoryPerThreadGb = 2
-        Int compressionLevel = 1
-        Int? memoryGb 
-        Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 200 / threads)
-        # This container contains: samtools (1.10), bwa (0.7.17-r1188)
-        String dockerImage = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0"
-    }
-
-    # Samtools sort may block the pipe while it is writing data to disk. 
-    # This can lead to cpu underutilization.
-    # 1 thread if threads is 1. For 2-4 threads 2 sort threads. 3 sort threads for 5-8 threads. 
-    Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0)
-    Int totalSortThreads = select_first([sortThreads, estimatedSortThreads])
-    # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here.  
-    Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads
-
-    command {
-        set -e -o pipefail
-        mkdir -p "$(dirname ~{outputPath})"
-        bwa mem \
-        ~{"-t " + threads} \
-        ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \
-        ~{bwaIndex.fastaFile} \
-        ~{read1} \
-        ~{read2} \
-        | samtools sort \
-        ~{"-@ " + totalSortThreads} \
-        -m ~{sortMemoryPerThreadGb}G \
-        -l ~{compressionLevel} \
-        - \
-        -o ~{outputPath}
-    }
-
-    output {
-        File outputBam = outputPath
-    }
-
-    runtime {
-        cpu: threads
-        memory: "~{select_first([memoryGb, estimatedMemoryGb])}G"
-        time_minutes: timeMinutes
-        docker: dockerImage
-    }
-
-    parameter_meta {
-        read1: {description: "The first or single end fastq file.", category: "required"}
-        read2: {description: "The second end fastq file.", category: "common"}
-        bwaIndex: {description: "The BWA index files.", category: "required"}
-        outputPath: {description: "The location the output BAM file should be written to.", category: "required"}
-        readgroup: {description: "The readgroup to be assigned to the reads. See BWA mem's `-R` option.", category: "common"}
-
-        threads: {description: "The number of threads to use.", category: "advanced"}
-        memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"}
-        sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"}
-        sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"}
-        compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"}
-        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
-        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
-                      category: "advanced"}
-    }
-}
-
-task Kit {
    input {
        File read1
        File? read2
@@ -100,7 +28,7 @@ task Kit {
        String outputPrefix
        String? readgroup
        Boolean sixtyFour = false
-
+        Boolean usePostalt = false
        Int threads = 4
        Int? sortThreads
        Int sortMemoryPerThreadGb = 2
@@ -119,6 +47,8 @@ task Kit {
    # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here.  
    Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads
    
+    # The bwa postalt script is commented out when usePostalt = false.
+    # This hack was tested with bash, dash and ash. It seems that comments in between pipes work for all of them.
    command {
        set -e
        mkdir -p "$(dirname ~{outputPrefix})"
@@ -129,10 +59,8 @@ task Kit {
          ~{read1} \
          ~{read2} \
          2> ~{outputPrefix}.log.bwamem | \
-        bwa-postalt.js \
-          -p ~{outputPrefix}.hla \
-          ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \
-        samtools sort \
+          ~{true="" false="#" usePostalt} bwa-postalt.js -p ~{outputPrefix}.hla ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \
+          samtools sort \
          ~{"-@ " + totalSortThreads} \
          -m ~{sortMemoryPerThreadGb}G \
          -l ~{compressionLevel} \
@@ -142,6 +70,7 @@ task Kit {

    output {
        File outputBam = outputPrefix + ".aln.bam"
+        File? outputHla = outputPrefix + ".hla"
    }

    runtime {
@@ -157,7 +86,8 @@ task Kit {
        # inputs
        read1: {description: "The first-end fastq file.", category: "required"}
        read2: {description: "The second-end fastq file.", category: "common"}
-        bwaIndex: {description: "The BWA index, including a .alt file.", category: "required"}
+        bwaIndex: {description: "The BWA index, including (optionally) a .alt file.", category: "required"}
+        usePostalt: {description: "Whether to use the postalt script from bwa kit."}
        outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"}
        readgroup: {description: "A readgroup identifier.", category: "common"}
        sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"}