diff --git a/CHANGELOG.md b/CHANGELOG.md index 95241551a95ad5400cd7b642bfafc1e72074b92f..55fb1e8ab8385737b340dd91c7cb925a6abb6089 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ that users understand how the changes affect the new version. version 4.0.0-develop --------------------------- ++ bwa mem, bwa mem+kit and hisat2 have their samtools sort threads tweaked. The + number of threads is now related to the number of threads on the aligner. + Using more threads reduces the chance of the samtools sort pipe getting + blocked if it's full. + Renamed a few inputs in centrifuge.wdl, isoseq3.wdl, talon.wdl, transcriptclean.wdl to be more descriptive. + Renamed outputs of tasks used in the TALON-WDL, PacBio-subreads-processing & diff --git a/bwa.wdl b/bwa.wdl index 78881ad295f94d53eb84760b366534b40f7868e4..58e1dc8031dfb04221cb1711b63e6fb4cabd6a87 100644 --- a/bwa.wdl +++ b/bwa.wdl @@ -29,16 +29,23 @@ task Mem { String? readgroup Int threads = 4 - Int sortThreads = 1 + Int? sortThreads Int sortMemoryPerThreadGb = 2 Int compressionLevel = 1 - # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here. - Int memoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * sortThreads + Int? memoryGb Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 200 / threads) # This container contains: samtools (1.10), bwa (0.7.17-r1188) String dockerImage = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" } + # Samtools sort may block the pipe from the aligner while it is writing data to disk. + # This can lead to CPU underutilization, so the sort thread count scales with the aligner threads. + # 1 sort thread if threads is 1, otherwise one more than ceil(threads / 4): 2 for 2-4 threads, 3 for 5-8 threads. 
+ Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + # BWA needs slightly more memory than the size of the index files (~10%); the 1.2 factor adds a margin for safety. The memory for the samtools sort threads is added on top. + Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads + command { set -e -o pipefail mkdir -p "$(dirname ~{outputPath})" @@ -49,7 +56,7 @@ task Mem { ~{read1} \ ~{read2} \ | samtools sort \ - ~{"-@ " + sortThreads} \ + ~{"-@ " + totalSortThreads} \ -m ~{sortMemoryPerThreadGb}G \ -l ~{compressionLevel} \ - \ @@ -62,7 +69,7 @@ task Mem { runtime { cpu: threads - memory: "~{memoryGb}G" + memory: "~{select_first([memoryGb, estimatedMemoryGb])}G" time_minutes: timeMinutes docker: dockerImage } @@ -95,16 +102,23 @@ task Kit { Boolean sixtyFour = false Int threads = 4 - Int sortThreads = 1 + Int? sortThreads Int sortMemoryPerThreadGb = 2 Int compressionLevel = 1 - # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here. - Int memoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * sortThreads + Int? memoryGb Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 220 / threads) # Contains bwa 0.7.17 bwakit 0.7.17.dev1 and samtools 1.10 String dockerImage = "quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0" } + # Samtools sort may block the pipe from the aligner while it is writing data to disk. + # This can lead to CPU underutilization, so the sort thread count scales with the aligner threads. + # 1 sort thread if threads is 1, otherwise one more than ceil(threads / 4): 2 for 2-4 threads, 3 for 5-8 threads. + Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + # BWA needs slightly more memory than the size of the index files (~10%); the 1.2 factor adds a margin for safety. The memory for the samtools sort threads is added on top. 
+ Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads + command { set -e mkdir -p "$(dirname ~{outputPrefix})" @@ -119,7 +133,7 @@ task Kit { -p ~{outputPrefix}.hla \ ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \ samtools sort \ - ~{"-@ " + sortThreads} \ + ~{"-@ " + totalSortThreads} \ -m ~{sortMemoryPerThreadGb}G \ -l ~{compressionLevel} \ - \ @@ -134,7 +148,7 @@ task Kit { # One extra thread for bwa-postalt + samtools is not needed. # These only use 5-10% of compute power and not always simultaneously. cpu: threads - memory: "~{memoryGb}G" + memory: "~{select_first([memoryGb, estimatedMemoryGb])}G" time_minutes: timeMinutes docker: dockerImage } diff --git a/hisat2.wdl b/hisat2.wdl index c24610edc6fdc732a70040756b62483d7dc509d4..f9a4bc599f0fc43ce759807e4c5f138f02e3aa20 100644 --- a/hisat2.wdl +++ b/hisat2.wdl @@ -34,10 +34,10 @@ task Hisat2 { String summaryFilePath = basename(outputBam, ".bam") + ".summary.txt" Int threads = 4 - Int sortThreads = 1 + Int? sortThreads Int sortMemoryPerThreadGb = 2 Int compressionLevel = 1 - Int memoryGb = 1 + threads + ceil(size(indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * sortThreads + Int? memoryGb Int timeMinutes = 1 + ceil(size([inputR1, inputR2], "G") * 180 / threads) # quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1 # is a combination of hisat2 and samtools @@ -45,7 +45,12 @@ task Hisat2 { String dockerImage = "quay.io/biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2880dd9d8ad0a7b221d4eacda9a818e92983128d-0" } - String bamIndexPath = sub(outputBam, "\.bam$", ".bai") + # Samtools sort may block the pipe from the aligner while it is writing data to disk, which can lead to CPU underutilization. + # 1 sort thread if threads is 1, otherwise one more than ceil(threads / 4): 2 for 2-4 threads, 3 for 5-8 threads. + # The memory estimate is 1G, the index size with a 20% margin and the samtools sort memory. NOTE(review): the former extra 1G per aligner thread is no longer included, confirm this is intended. 
+ Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + Int estimatedMemoryGb = 1 + ceil(size(indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads command { set -e -o pipefail @@ -63,7 +68,7 @@ task Hisat2 { --new-summary \ --summary-file ~{summaryFilePath} \ | samtools sort \ - ~{"-@ " + sortThreads} \ + ~{"-@ " + totalSortThreads} \ -m ~{sortMemoryPerThreadGb}G \ -l ~{compressionLevel} \ - \ @@ -76,8 +81,8 @@ task Hisat2 { } runtime { - memory: "~{memoryGb}G" - cpu: threads + 1 + memory: "~{select_first([memoryGb, estimatedMemoryGb])}G" + cpu: threads time_minutes: timeMinutes docker: dockerImage }