Unverified commit 3b583e41 authored by Ruben Vorderman, committed by GitHub

Merge pull request #223 from biowdl/BIOWDL-471

Updated resource requirements for the RNA-seq pipeline.
parents 4048bb82 5c9b7c83
@@ -11,11 +11,18 @@ that users understand how the changes affect the new version.
version 4.0.0-develop
---------------------------
+ Tuned resource requirements for GATK VariantEval, MultiQC, Picard metrics and
STAR.
+ Added a new task for [scatter-regions](https://github.com/biowdl/chunked-scatter)
that replaces biopet-scatterregions.
+ The FastQC task now calls Java directly instead of using the included
Perl wrapper for FastQC. This has the advantage that memory and threads can
be set independently. A rather high maximum heap size of 1750MB (Xmx1750M)
was set, as OOM errors occurred frequently on some fastqs.
+ STAR: Added options for the alignment score (also relative to read length)
for tweaking when processing rRNA-depleted samples.
+ TALON: Updated `minimumIdentity` to the correct type (float, was integer)
and set a new default according to the developers (0.8, was 0).
+ Added bcftools stats task.
+ Added GATK VariantEval task.
+ Added a log output for STAR.
+ Added report output to Hisat2.
@@ -214,6 +214,7 @@ task ScatterRegions {
input {
File referenceFasta
File referenceFastaDict
Int scatterSizeMillions = 1000
Int? scatterSize
File? regions
Boolean notSplitContigs = false
@@ -230,6 +231,7 @@ task ScatterRegions {
# linking. This path must be in the container's filesystem, otherwise the
# linking does not work.
String outputDirPath = "scatters"
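# If scatterSize is not set, appending "000000" converts scatterSizeMillions
# into a size in base pairs (e.g. the default of 1000 becomes "1000000000").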
String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
command <<<
set -e -o pipefail
@@ -237,7 +239,7 @@ task ScatterRegions {
biopet-scatterregions -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
-R ~{referenceFasta} \
-o ~{outputDirPath} \
~{"-s " + scatterSize} \
~{"-s " + finalSize} \
~{"-L " + regions} \
~{"--bamFile " + bamFile} \
~{true="--notSplitContigs" false="" notSplitContigs}
@@ -271,7 +273,8 @@ task ScatterRegions {
referenceFasta: {description: "The reference fasta file.", category: "required"}
referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
category: "required"}
scatterSize: {description: "Equivalent to biopet scatterregions' `-s` option.", category: "common"}
scatterSizeMillions: {description: "The size of the scattered regions, in millions of base pairs.", category: "common"}
scatterSize: {description: "The scatter size in base pairs. Overrides scatterSizeMillions if set.", category: "advanced"}
regions: {description: "The regions to be scattered.", category: "advanced"}
notSplitContigs: {description: "Equivalent to biopet scatterregions' `--notSplitContigs` flag.",
category: "advanced"}
@@ -64,3 +64,52 @@ task ChunkedScatter {
category: "advanced"}
}
}
task ScatterRegions {
input {
File inputFile
String prefix = "scatters/scatter-"
Boolean splitContigs = false
Int scatterSizeMillions = 1000
Int? scatterSize
Int timeMinutes = 2
String memory = "256M"
String dockerImage = "biowdl/chunked-scatter:latest"
}
String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
command {
scatter-regions \
--print-paths \
--scatter-size ~{finalSize} \
~{true="--split-contigs" false="" splitContigs} \
~{"--prefix " + prefix} \
~{inputFile}
}
output {
Array[File] scatters = read_lines(stdout())
}
runtime {
cpu: 1
memory: memory
docker: dockerImage
time_minutes: timeMinutes
}
parameter_meta {
inputFile: {description: "The input file: a bed file, a fasta index (.fai) or a sequence dict. The format is detected by the extension: '.bed', '.fai' or '.dict'.", category: "required"}
prefix: {description: "The prefix of the output files. Output will be named like: <PREFIX><N>.bed, in which N is an incrementing number. Default 'scatters/scatter-'.", category: "advanced"}
splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"}
scatterSizeMillions: {description: "The size of the scattered regions, in millions of base pairs.", category: "common"}
scatterSize: {description: "The scatter size in base pairs. Overrides scatterSizeMillions if set.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
memory: {description: "The amount of memory this job will use.", category: "advanced"}
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
category: "advanced"}
}
}
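A minimal, hypothetical usage sketch of the new task (the import path, workflow
name and call alias are illustrative and not part of this change):

    version 1.0

    import "chunked-scatter.wdl" as chunkedScatter

    workflow ScatterExample {
        input {
            File referenceFastaDict
        }
        # Scatter the genome into regions of (by default) 1000 million base pairs.
        call chunkedScatter.ScatterRegions as scatterRegions {
            input:
                inputFile = referenceFastaDict
        }
        output {
            # One bed file per scatter, usable to parallelize downstream tools.
            Array[File] scatters = scatterRegions.scatters
        }
    }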
@@ -38,8 +38,13 @@ task Fastqc {
String? dir
Int threads = 1
# Fastqc uses 250MB per thread in its wrapper.
String memory = "~{250 + 250 * threads}M"
# Set javaXmx a little high: equal to the fastqc default with 7 threads.
# This is because some fastq files need more memory. 2G per core
# is a nice cluster default, so we give all the remaining memory to
# fastqc so that we get as few OOM crashes as possible, even with
# weird edge-case fastqs.
String javaXmx="1750M"
String memory = "2G"
Int timeMinutes = 1 + ceil(size(seqFile, "G")) * 4
String dockerImage = "quay.io/biocontainers/fastqc:0.11.9--0"
Array[File]? NoneArray
@@ -53,26 +58,32 @@ task Fastqc {
# Just as fastqc does it.
String reportDir = outdirPath + "/" + sub(name, "\.[^\.]*$", "_fastqc")
command {
# We reimplement the Perl wrapper here. This has the advantage that it gives
# us more control over the amount of memory used.
command <<<
set -e
mkdir -p ~{outdirPath}
fastqc \
~{"--outdir " + outdirPath} \
~{true="--casava" false="" casava} \
~{true="--nano" false="" nano} \
~{true="--nofilter" false="" noFilter} \
~{true="--extract" false="" extract} \
~{true="--nogroup" false="" nogroup} \
~{"--min_length " + minLength } \
~{"--format " + format} \
~{"--threads " + threads} \
~{"--contaminants " + contaminants} \
~{"--adapters " + adapters} \
~{"--limits " + limits} \
~{"--kmers " + kmers} \
~{"--dir " + dir} \
FASTQC_DIR="/usr/local/opt/fastqc-0.11.9"
export CLASSPATH="$FASTQC_DIR:$FASTQC_DIR/sam-1.103.jar:$FASTQC_DIR/jbzip2-0.9.jar:$FASTQC_DIR/cisd-jhdf5.jar"
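# Invoke the FastQC main class directly so the heap size (-Xmx) and the
# number of threads can be set independently, instead of the wrapper's
# fixed 250MB per thread.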
java -Djava.awt.headless=true -XX:ParallelGCThreads=1 \
-Xms200M -Xmx~{javaXmx} \
~{"-Dfastqc.output_dir=" + outdirPath} \
~{true="-Dfastqc.casava=true" false="" casava} \
~{true="-Dfastqc.nano=true" false="" nano} \
~{true="-Dfastqc.nofilter=true" false="" noFilter} \
~{true="-Dfastqc.unzip=true" false="" extract} \
~{true="-Dfastqc.nogroup=true" false="" nogroup} \
~{"-Dfastqc.min_length=" + minLength} \
~{"-Dfastqc.sequence_format=" + format} \
~{"-Dfastqc.threads=" + threads} \
~{"-Dfastqc.contaminant_file=" + contaminants} \
~{"-Dfastqc.adapter_file=" + adapters} \
~{"-Dfastqc.limits_file=" + limits} \
~{"-Dfastqc.kmer_size=" + kmers} \
~{"-Djava.io.tmpdir=" + dir} \
uk.ac.babraham.FastQC.FastQCApplication \
~{seqFile}
}
>>>
output {
File? rawReport = if extract then reportDir + "/fastqc_data.txt" else NoneFile
@@ -105,6 +116,8 @@ task Fastqc {
kmers: {description: "Equivalent to fastqc's --kmers option.", category: "advanced"}
dir: {description: "Equivalent to fastqc's --dir option.", category: "advanced"}
threads: {description: "The number of cores to use.", category: "advanced"}
javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
category: "advanced"}
memory: {description: "The amount of memory this job will use.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
@@ -1574,7 +1574,7 @@ task VariantEval {
String memory = "5G"
String javaXmx = "4G"
# TODO: Refine estimate. For now 20 minutes per GB of input (VCFs, reference and dbSNP).
Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs]), "G") * 4)
Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs, select_all([referenceFasta, dbsnpVCF])]), "G") * 20)
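# Example: 1G of eval VCFs, 0.5G of comparison VCFs, a 3G reference and a
# 0.5G dbSNP VCF add up to 5G, giving ceil(5 * 20) = 100 minutes.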
String dockerImage = "quay.io/biocontainers/gatk4:4.1.7.0--py38_0"
}
@@ -51,11 +51,11 @@ task MultiQC {
Boolean megaQCUpload = false # In my opinion this must be actively enabled. The tool's default is to upload.
File? config # A directory
String? clConfig
String memory = "4G"
Int timeMinutes = 120
String? memory
Int timeMinutes = 2 + ceil(size(reports, "G") * 8)
String dockerImage = "quay.io/biocontainers/multiqc:1.7--py_1"
}
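# Default memory scales with the total size of the reports: a 2G base plus
# roughly 1G per G of input.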
Int memoryGb = 2 + ceil(size(reports, "G"))
# This is where the reports end up. It does not need to be changed by the
# user. It is full of symbolic links, so it is not of any use to the user
@@ -132,7 +132,7 @@ task MultiQC {
}
runtime {
memory: memory
memory: select_first([memory, "~{memoryGb}G"])
time_minutes: timeMinutes
docker: dockerImage
}
@@ -87,7 +87,8 @@ task CollectMultipleMetrics {
String memory = "9G"
String javaXmx = "8G"
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
# The additional factor 2 is there because Picard CollectMultipleMetrics reads the reference fasta twice.
Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
@@ -203,7 +204,8 @@ task CollectRnaSeqMetrics {
String memory = "9G"
String javaXmx = "8G"
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
# With 6 minutes per G there were several timeouts.
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12)
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
@@ -35,7 +35,7 @@ task GenomeGenerate {
command {
set -e
mkdir -p "$(dirname ~{genomeDir})"
mkdir -p ~{genomeDir}
STAR \
--runMode genomeGenerate \
--runThreadN ~{threads} \
@@ -50,7 +50,7 @@ task GenomeGenerate {
File chrNameLength = "~{genomeDir}/chrNameLength.txt"
File chrName = "~{genomeDir}/chrName.txt"
File chrStart = "~{genomeDir}/chrStart.txt"
File genome = "~{genomeDir}/genome.txt"
File genome = "~{genomeDir}/Genome"
File genomeParameters = "~{genomeDir}/genomeParameters.txt"
File sa = "~{genomeDir}/SA"
File saIndex = "~{genomeDir}/SAindex"
@@ -106,11 +106,18 @@ task Star {
Int? limitBAMsortRAM
Int runThreadN = 4
String memory = "~{5 + ceil(size(indexFiles, "G"))}G"
Int timeMinutes = 1 + ceil(size(flatten([inputR1, inputR2]), "G") * 180 / runThreadN)
String? memory
# 1 minute initialization + time reading in index (1 minute per G) + time aligning data.
Int timeMinutes = 1 + ceil(size(indexFiles, "G")) + ceil(size(flatten([inputR1, inputR2]), "G") * 300 / runThreadN)
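# Example: a 27G index and 10G of fastq data on 4 threads gives
# 1 + 27 + ceil(10 * 300 / 4) = 778 minutes.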
String dockerImage = "quay.io/biocontainers/star:2.7.3a--0"
}
# Use a margin of 30% index size. Real memory usage is ~30 GiB for a 27 GiB index.
Int memoryGb = 1 + ceil(size(indexFiles, "G") * 1.3)
# For some reason doing the above calculation inside a string does not
# work, so we solve it with an optional memory string and select_first
# in the runtime section.
#TODO Could be extended for all possible output extensions
Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"}
@@ -142,7 +149,7 @@ task Star {
runtime {
cpu: runThreadN
memory: memory
memory: select_first([memory, "~{memoryGb}G"])
time_minutes: timeMinutes
docker: dockerImage
}
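The same optional-memory pattern is used in the MultiQC task above. A minimal
sketch of the pattern, with assumed task and input names:

    version 1.0

    task Example {
        input {
            File data
            String? memory  # optional user override
        }
        # Computed default; per the comment above, doing this calculation
        # inside the input's default string did not work, hence select_first.
        Int memoryGb = 1 + ceil(size(data, "G"))
        command {
            echo "placeholder"
        }
        runtime {
            memory: select_first([memory, "~{memoryGb}G"])
        }
    }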