diff --git a/CHANGELOG.md b/CHANGELOG.md
index 77189f3d67489b674068599128fd327f0d194f9e..7889194141e364c34d635ee653c96fee958c8f3c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,11 +11,18 @@ that users understand how the changes affect the new version.
 
 version 4.0.0-develop
 ---------------------------
++ Tuned resource requirements for GATK VariantEval, MultiQC, Picard metrics and
+  STAR.
++ Added a new task for [scatter-regions](https://github.com/biowdl/chunked-scatter)
+  that replaces biopet-scatterregions.
++ The FastQC task now invokes the Java code directly instead of using the
+  included Perl wrapper, which allows memory and threads to be set
+  independently. A rather high maximum heap size of 1750MB (Xmx1750M) was
+  set, as OOM errors occurred frequently on some fastqs.
 + STAR: Add options regarding alignment score (regarding read length as well)
   for tweaking when processing rRNA depleted samples.
 + TALON: Update `minimumIdentity` to correct type (float, was integer) & set
   new default according to developers (0.8, was 0).
-+ Added bcftools stats task.
 + Added GATK VariantEval task.
 + Added a log output for STAR.
 + Added report output to Hisat2.
diff --git a/biopet/biopet.wdl b/biopet/biopet.wdl
index cc8e1bc6382bb452c947403c0815804a1fcb3ede..8931940946ac56d8ae64a8be011c1dec8318e320 100644
--- a/biopet/biopet.wdl
+++ b/biopet/biopet.wdl
@@ -214,6 +214,7 @@ task ScatterRegions {
     input {
         File referenceFasta
         File referenceFastaDict
+        Int scatterSizeMillions = 1000
         Int? scatterSize
         File? regions
         Boolean notSplitContigs = false
@@ -230,6 +231,7 @@
     # linking. This path must be in the containers filesystem, otherwise the
     # linking does not work.
     String outputDirPath = "scatters"
+    String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
 
     command <<<
         set -e -o pipefail
@@ -237,7 +239,7 @@
         biopet-scatterregions -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
          -R ~{referenceFasta} \
          -o ~{outputDirPath} \
-         ~{"-s " + scatterSize} \
+         ~{"-s " + finalSize} \
          ~{"-L " + regions} \
          ~{"--bamFile " + bamFile} \
          ~{true="--notSplitContigs" false="" notSplitContigs}
@@ -271,7 +273,8 @@
        referenceFasta: {description: "The reference fasta file.", category: "required"}
        referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"}
-       scatterSize: {description: "Equivalent to biopet scatterregions' `-s` option.", category: "common"}
+       scatterSizeMillions: {description: "The size of each scattered region in million base pairs.", category: "common"}
+       scatterSize: {description: "The scatter size in base pairs. If set, overrides scatterSizeMillions.", category: "advanced"}
        regions: {description: "The regions to be scattered.", category: "advanced"}
        notSplitContigs: {description: "Equivalent to biopet scatterregions' `--notSplitContigs` flag.", category: "advanced"}
diff --git a/chunked-scatter.wdl b/chunked-scatter.wdl
index 111d8fa44f338bd56d13c5752ea0d44571f4b526..96dbf1eb6c9703ad79a3ca6b10d7bfdbf2d4e6f9 100644
--- a/chunked-scatter.wdl
+++ b/chunked-scatter.wdl
@@ -64,3 +64,52 @@
                      category: "advanced"}
     }
 }
+
+
+task ScatterRegions {
+    input {
+        File inputFile
+        String prefix = "scatters/scatter-"
+        Boolean splitContigs = false
+        Int scatterSizeMillions = 1000
+        Int? scatterSize
+        Int timeMinutes = 2
+        String memory = "256M"
+        String dockerImage = "biowdl/chunked-scatter:latest"
+    }
+
+    String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
+
+    command {
+        scatter-regions \
+        --print-paths \
+        --scatter-size ~{finalSize} \
+        ~{true="--split-contigs" false="" splitContigs} \
+        ~{"--prefix " + prefix} \
+        ~{inputFile}
+    }
+
+    output {
+        Array[File] scatters = read_lines(stdout())
+    }
+
+    runtime {
+        cpu: 1
+        memory: memory
+        docker: dockerImage
+        time_minutes: timeMinutes
+    }
+
+    parameter_meta {
+        inputFile: {description: "The input file, either a bed file or a sequence dict. The format is detected by the extension: '.bed', '.fai' or '.dict'.", category: "required"}
+        prefix: {description: "The prefix of the output files. Output will be named like: <PREFIX><N>.bed, in which N is an incrementing number. Default 'scatters/scatter-'.", category: "advanced"}
+        splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"}
+        scatterSizeMillions: {description: "The size of each scattered region in million base pairs.", category: "common"}
+        scatterSize: {description: "The scatter size in base pairs. If set, overrides scatterSizeMillions.", category: "advanced"}
+
+        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
+        memory: {description: "The amount of memory this job will use.", category: "advanced"}
+        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
+                      category: "advanced"}
+    }
+}
diff --git a/fastqc.wdl b/fastqc.wdl
index e24b6ce44afcc62d95e752bddf88401c48e4bb6b..04b6813f3369c02debab54464e2627fea3a3ba42 100644
--- a/fastqc.wdl
+++ b/fastqc.wdl
@@ -38,8 +38,13 @@ task Fastqc {
         String? dir
 
         Int threads = 1
-        # Fastqc uses 250MB per thread in its wrapper.
-        String memory = "~{250 + 250 * threads}M"
+        # Set javaXmx a little high: equal to the FastQC default with 7 threads.
+        # This is because some fastq files need more memory. 2G per core is a
+        # nice cluster default, so we give all the remaining memory to FastQC.
+        # That way OOM crashes should be as rare as possible, even with weird
+        # edge-case fastqs.
+        String javaXmx = "1750M"
+        String memory = "2G"
         Int timeMinutes = 1 + ceil(size(seqFile, "G")) * 4
         String dockerImage = "quay.io/biocontainers/fastqc:0.11.9--0"
         Array[File]? NoneArray
@@ -53,26 +58,32 @@ task Fastqc {
     # Just as fastqc does it.
     String reportDir = outdirPath + "/" + sub(name, "\.[^\.]*$", "_fastqc")
 
-    command {
+    # We reimplement the Perl wrapper here. This has the advantage that it
+    # gives us more control over the amount of memory used.
+    command <<<
         set -e
         mkdir -p ~{outdirPath}
-        fastqc \
-        ~{"--outdir " + outdirPath} \
-        ~{true="--casava" false="" casava} \
-        ~{true="--nano" false="" nano} \
-        ~{true="--nofilter" false="" noFilter} \
-        ~{true="--extract" false="" extract} \
-        ~{true="--nogroup" false="" nogroup} \
-        ~{"--min_length " + minLength } \
-        ~{"--format " + format} \
-        ~{"--threads " + threads} \
-        ~{"--contaminants " + contaminants} \
-        ~{"--adapters " + adapters} \
-        ~{"--limits " + limits} \
-        ~{"--kmers " + kmers} \
-        ~{"--dir " + dir} \
+        FASTQC_DIR="/usr/local/opt/fastqc-0.11.9"
+        export CLASSPATH="$FASTQC_DIR:$FASTQC_DIR/sam-1.103.jar:$FASTQC_DIR/jbzip2-0.9.jar:$FASTQC_DIR/cisd-jhdf5.jar"
+        java -Djava.awt.headless=true -XX:ParallelGCThreads=1 \
+        -Xms200M -Xmx~{javaXmx} \
+        ~{"-Dfastqc.output_dir=" + outdirPath} \
+        ~{true="-Dfastqc.casava=true" false="" casava} \
+        ~{true="-Dfastqc.nano=true" false="" nano} \
+        ~{true="-Dfastqc.nofilter=true" false="" noFilter} \
+        ~{true="-Dfastqc.unzip=true" false="" extract} \
+        ~{true="-Dfastqc.nogroup=true" false="" nogroup} \
+        ~{"-Dfastqc.min_length=" + minLength} \
+        ~{"-Dfastqc.sequence_format=" + format} \
+        ~{"-Dfastqc.threads=" + threads} \
+        ~{"-Dfastqc.contaminant_file=" + contaminants} \
+        ~{"-Dfastqc.adapter_file=" + adapters} \
+        ~{"-Dfastqc.limits_file=" + limits} \
+        ~{"-Dfastqc.kmer_size=" + kmers} \
+        ~{"-Djava.io.tmpdir=" + dir} \
+        uk.ac.babraham.FastQC.FastQCApplication \
         ~{seqFile}
-    }
+    >>>
 
     output {
         File? rawReport = if extract then reportDir + "/fastqc_data.txt" else NoneFile
@@ -105,6 +116,8 @@
        kmers: {description: "Equivalent to fastqc's --kmers option.", category: "advanced"}
        dir: {description: "Equivalent to fastqc's --dir option.", category: "advanced"}
        threads: {description: "The number of cores to use.", category: "advanced"}
+       javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
+                 category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
diff --git a/gatk.wdl b/gatk.wdl
index 09de0488fd44542fc91a3546387d0a207cf90858..2089eabb5302dff5f69c0f2ac49281b27ee93f0a 100644
--- a/gatk.wdl
+++ b/gatk.wdl
@@ -1574,7 +1574,7 @@
         String memory = "5G"
         String javaXmx = "4G"
         # TODO: Refine estimate. For now 4 minutes per GB of input.
-        Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs]), "G") * 4)
+        Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs, select_all([referenceFasta, dbsnpVCF])]), "G") * 20)
         String dockerImage = "quay.io/biocontainers/gatk4:4.1.7.0--py38_0"
     }
diff --git a/multiqc.wdl b/multiqc.wdl
index 6a967b3f18c0fcc49a81168b97c047cc1395de89..7dcf333eb0ba40a52d69d33e62352e22ee5ee4ec 100644
--- a/multiqc.wdl
+++ b/multiqc.wdl
@@ -51,11 +51,11 @@
         Boolean megaQCUpload = false # This must be actively enabled in my opinion. The tools default is to upload.
         File? config # A directory
         String? clConfig
-
-        String memory = "4G"
-        Int timeMinutes = 120
+        String? memory
+        Int timeMinutes = 2 + ceil(size(reports, "G") * 8)
         String dockerImage = "quay.io/biocontainers/multiqc:1.7--py_1"
     }
+    Int memoryGb = 2 + ceil(size(reports, "G"))
 
     # This is where the reports end up. It does not need to be changed by the
    # user. It is full of symbolic links, so it is not of any use to the user
@@ -132,7 +132,7 @@
     }
 
     runtime {
-        memory: memory
+        memory: select_first([memory, "~{memoryGb}G"])
         time_minutes: timeMinutes
         docker: dockerImage
     }
diff --git a/picard.wdl b/picard.wdl
index 3103ad9baa7287a211a652e892dfa45a1071bebc..0ee5da3612b46af0a097d803958418ccb5b1bc77 100644
--- a/picard.wdl
+++ b/picard.wdl
@@ -87,7 +87,8 @@ task CollectMultipleMetrics {
 
         String memory = "9G"
         String javaXmx = "8G"
-        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
+        # Additional * 2 because picard multiple metrics reads the reference fasta twice.
+        Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
         String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
     }
@@ -203,7 +204,8 @@ task CollectRnaSeqMetrics {
 
         String memory = "9G"
         String javaXmx = "8G"
-        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
+        # With 6 minutes per GB there were several timeouts.
+        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12)
         String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
     }
diff --git a/star.wdl b/star.wdl
index 6f95a637ad7e286ad7c2398d888ea0183e7104fa..4da67f72609c3bd27d03301c3cd0ed39c9da4a99 100644
--- a/star.wdl
+++ b/star.wdl
@@ -35,7 +35,7 @@
 
     command {
         set -e
-        mkdir -p "$(dirname ~{genomeDir})"
+        mkdir -p ~{genomeDir}
         STAR \
         --runMode genomeGenerate \
         --runThreadN ~{threads} \
@@ -50,7 +50,7 @@
         File chrNameLength = "~{genomeDir}/chrNameLength.txt"
         File chrName = "~{genomeDir}/chrName.txt"
         File chrStart = "~{genomeDir}/chrStart.txt"
-        File genome = "~{genomeDir}/genome.txt"
+        File genome = "~{genomeDir}/Genome"
         File genomeParameters = "~{genomeDir}/genomeParameters.txt"
         File sa = "~{genomeDir}/SA"
         File saIndex = "~{genomeDir}/SAindex"
@@ -106,11 +106,18 @@
         Int? limitBAMsortRAM
 
         Int runThreadN = 4
-        String memory = "~{5 + ceil(size(indexFiles, "G"))}G"
-        Int timeMinutes = 1 + ceil(size(flatten([inputR1, inputR2]), "G") * 180 / runThreadN)
+        String? memory
+        # 1 minute initialization + time reading in index (1 minute per GB) + time aligning data.
+        Int timeMinutes = 1 + ceil(size(indexFiles, "G")) + ceil(size(flatten([inputR1, inputR2]), "G") * 300 / runThreadN)
         String dockerImage = "quay.io/biocontainers/star:2.7.3a--0"
     }
 
+    # Use a margin of 30% on top of the index size. Real memory usage is ~30 GiB for a 27 GiB index.
+    Int memoryGb = 1 + ceil(size(indexFiles, "G") * 1.3)
+    # For some reason doing the above calculation inside a string does not
+    # work, so we solve it with an optional memory string and select_first
+    # in the runtime section.
+
     #TODO Could be extended for all possible output extensions
     Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"}
@@ -142,7 +149,7 @@
 
     runtime {
         cpu: runThreadN
-        memory: memory
+        memory: select_first([memory, "~{memoryGb}G"])
         time_minutes: timeMinutes
         docker: dockerImage
     }
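
For reference, below is a minimal sketch of how the new `ScatterRegions` task from `chunked-scatter.wdl` could be wired into a workflow. The `ScatterExample` workflow and the `CountRegions` task are hypothetical illustrations only, not part of this change set, and the sketch assumes `chunked-scatter.wdl` sits next to the workflow file.

version 1.0

import "chunked-scatter.wdl" as chunkedScatter

# Hypothetical task used only to demonstrate the scatter: it counts the
# number of regions in a single scatter bed file.
task CountRegions {
    input {
        File bed
    }

    command {
        wc -l < ~{bed}
    }

    output {
        Int regionCount = read_int(stdout())
    }

    runtime {
        memory: "256M"
    }
}

workflow ScatterExample {
    input {
        File referenceFastaDict
    }

    # Scatter the reference into chunks of scatterSizeMillions * 1,000,000
    # base pairs (default 1000 million). Setting scatterSize (in base pairs)
    # instead would override scatterSizeMillions.
    call chunkedScatter.ScatterRegions as scatterRegions {
        input:
            inputFile = referenceFastaDict
    }

    # Run the per-chunk work in parallel over the produced bed files.
    scatter (bed in scatterRegions.scatters) {
        call CountRegions {
            input:
                bed = bed
        }
    }

    output {
        Array[Int] regionCounts = CountRegions.regionCount
    }
}

Because `scatters` is an `Array[File]`, the engine runs one shard per scatter file, which is the same scattering pattern previously driven by biopet-scatterregions.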