Unverified commit 3b583e41 authored by Ruben Vorderman, committed by GitHub

Merge pull request #223 from biowdl/BIOWDL-471

Updated resource requirements for the RNA-seq pipeline.
parents 4048bb82 5c9b7c83
@@ -11,11 +11,18 @@ that users understand how the changes affect the new version.
version 4.0.0-develop
---------------------------
+ Tuned resource requirements for GATK VariantEval, MultiQC, Picard metrics and
STAR.
+ Added a new task for [scatter-regions](https://github.com/biowdl/chunked-scatter)
that replaces biopet-scatterregions.
+ The FastQC task now calls Java directly instead of using the included
Perl wrapper for FastQC. This has the advantage that memory and threads can
be set independently. A rather high maximum heap size of 1750MB (Xmx1750M)
was set, as OOM errors occurred frequently on some fastqs.
+ STAR: Added options for the alignment score (also relative to read length)
for tweaking when processing rRNA-depleted samples.
+ TALON: Updated `minimumIdentity` to the correct type (float, was integer)
and set a new default according to the developers (0.8, was 0).
+ Added bcftools stats task.
+ Added GATK VariantEval task.
+ Added a log output for STAR.
+ Added report output to Hisat2.
@@ -214,6 +214,7 @@ task ScatterRegions {
input {
File referenceFasta
File referenceFastaDict
Int scatterSizeMillions = 1000
Int? scatterSize
File? regions
Boolean notSplitContigs = false
@@ -230,6 +231,7 @@ task ScatterRegions {
# linking. This path must be in the container's filesystem, otherwise the
# linking does not work.
String outputDirPath = "scatters"
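# If scatterSize is not set, appending "000000" converts scatterSizeMillions
# into a size in base pairs (e.g. the default of 1000 becomes "1000000000").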
String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
command <<<
set -e -o pipefail
@@ -237,7 +239,7 @@ task ScatterRegions {
biopet-scatterregions -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
-R ~{referenceFasta} \
-o ~{outputDirPath} \
~{"-s " + scatterSize} \
~{"-s " + finalSize} \
~{"-L " + regions} \
~{"--bamFile " + bamFile} \
~{true="--notSplitContigs" false="" notSplitContigs}
@@ -271,7 +273,8 @@ task ScatterRegions {
referenceFasta: {description: "The reference fasta file.", category: "required"}
referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.",
category: "required"}
scatterSize: {description: "Equivalent to biopet scatterregions' `-s` option.", category: "common"}
scatterSizeMillions: {description: "The size of the scattered regions, in millions of base pairs.", category: "common"}
scatterSize: {description: "The scatter size in base pairs. Overrides scatterSizeMillions if set.", category: "advanced"}
regions: {description: "The regions to be scattered.", category: "advanced"}
notSplitContigs: {description: "Equivalent to biopet scatterregions' `--notSplitContigs` flag.",
category: "advanced"}
@@ -64,3 +64,52 @@ task ChunkedScatter {
category: "advanced"}
}
}
task ScatterRegions {
input {
File inputFile
String prefix = "scatters/scatter-"
Boolean splitContigs = false
Int scatterSizeMillions = 1000
Int? scatterSize
Int timeMinutes = 2
String memory = "256M"
String dockerImage = "biowdl/chunked-scatter:latest"
}
String finalSize = if defined(scatterSize) then "~{scatterSize}" else "~{scatterSizeMillions}000000"
command {
scatter-regions \
--print-paths \
--scatter-size ~{finalSize} \
~{true="--split-contigs" false="" splitContigs} \
~{"--prefix " + prefix} \
~{inputFile}
}
output {
Array[File] scatters = read_lines(stdout())
}
runtime {
cpu: 1
memory: memory
docker: dockerImage
time_minutes: timeMinutes
}
parameter_meta {
inputFile: {description: "The input file: a bed file, a fasta index (.fai) or a sequence dict. The format is detected by the extension: '.bed', '.fai' or '.dict'.", category: "required"}
prefix: {description: "The prefix of the output files. Output will be named like: <PREFIX><N>.bed, in which N is an incrementing number. Default 'scatters/scatter-'.", category: "advanced"}
splitContigs: {description: "If set, contigs are allowed to be split up over multiple files.", category: "advanced"}
scatterSizeMillions: {description: "The size of the scattered regions, in millions of base pairs.", category: "common"}
scatterSize: {description: "The scatter size in base pairs. Overrides scatterSizeMillions if set.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
memory: {description: "The amount of memory this job will use.", category: "advanced"}
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
category: "advanced"}
}
}
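A minimal, hypothetical usage sketch of the new task (the import path, workflow
name and call alias are illustrative and not part of this change):

    version 1.0

    import "chunked-scatter.wdl" as chunkedScatter

    workflow ScatterExample {
        input {
            File referenceFastaDict
        }
        # Scatter the genome into regions of (by default) 1000 million base pairs.
        call chunkedScatter.ScatterRegions as scatterRegions {
            input:
                inputFile = referenceFastaDict
        }
        output {
            # One bed file per scatter, usable to parallelize downstream tools.
            Array[File] scatters = scatterRegions.scatters
        }
    }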
@@ -38,8 +38,13 @@ task Fastqc {
String? dir
Int threads = 1
# Fastqc uses 250MB per thread in its wrapper.
String memory = "~{250 + 250 * threads}M"
# Set javaXmx a little high: equal to the fastqc default with 7 threads.
# This is because some fastq files need more memory. 2G per core
# is a nice cluster default, so we give all the remaining memory to
# fastqc so that we get as few OOM crashes as possible, even with
# weird edge-case fastqs.
String javaXmx="1750M"
String memory = "2G"
Int timeMinutes = 1 + ceil(size(seqFile, "G")) * 4
String dockerImage = "quay.io/biocontainers/fastqc:0.11.9--0"
Array[File]? NoneArray
@@ -53,26 +58,32 @@ task Fastqc {
# Just as fastqc does it.
String reportDir = outdirPath + "/" + sub(name, "\.[^\.]*$", "_fastqc")
command {
# We reimplement the Perl wrapper here. This has the advantage that it gives
# us more control over the amount of memory used.
command <<<
set -e
mkdir -p ~{outdirPath}
fastqc \
~{"--outdir " + outdirPath} \
~{true="--casava" false="" casava} \
~{true="--nano" false="" nano} \
~{true="--nofilter" false="" noFilter} \
~{true="--extract" false="" extract} \
~{true="--nogroup" false="" nogroup} \
~{"--min_length " + minLength } \
~{"--format " + format} \
~{"--threads " + threads} \
~{"--contaminants " + contaminants} \
~{"--adapters " + adapters} \
~{"--limits " + limits} \
~{"--kmers " + kmers} \
~{"--dir " + dir} \
FASTQC_DIR="/usr/local/opt/fastqc-0.11.9"
export CLASSPATH="$FASTQC_DIR:$FASTQC_DIR/sam-1.103.jar:$FASTQC_DIR/jbzip2-0.9.jar:$FASTQC_DIR/cisd-jhdf5.jar"
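# Invoke the FastQC main class directly so the heap size (-Xmx) and the
# number of threads can be set independently, instead of the wrapper's
# fixed 250MB per thread.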
java -Djava.awt.headless=true -XX:ParallelGCThreads=1 \
-Xms200M -Xmx~{javaXmx} \
~{"-Dfastqc.output_dir=" + outdirPath} \
~{true="-Dfastqc.casava=true" false="" casava} \
~{true="-Dfastqc.nano=true" false="" nano} \
~{true="-Dfastqc.nofilter=true" false="" noFilter} \
~{true="-Dfastqc.unzip=true" false="" extract} \
~{true="-Dfastqc.nogroup=true" false="" nogroup} \
~{"-Dfastqc.min_length=" + minLength} \
~{"-Dfastqc.sequence_format=" + format} \
~{"-Dfastqc.threads=" + threads} \
~{"-Dfastqc.contaminant_file=" + contaminants} \
~{"-Dfastqc.adapter_file=" + adapters} \
~{"-Dfastqc.limits_file=" + limits} \
~{"-Dfastqc.kmer_size=" + kmers} \
~{"-Djava.io.tmpdir=" + dir} \
uk.ac.babraham.FastQC.FastQCApplication \
~{seqFile}
}
>>>
output {
File? rawReport = if extract then reportDir + "/fastqc_data.txt" else NoneFile
@@ -105,6 +116,8 @@ task Fastqc {
kmers: {description: "Equivalent to fastqc's --kmers option.", category: "advanced"}
dir: {description: "Equivalent to fastqc's --dir option.", category: "advanced"}
threads: {description: "The number of cores to use.", category: "advanced"}
javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
category: "advanced"}
memory: {description: "The amount of memory this job will use.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
@@ -1574,7 +1574,7 @@ task VariantEval {
String memory = "5G"
String javaXmx = "4G"
# TODO: Refine estimate. For now 20 minutes per GB of input (VCFs, reference and dbSNP).
Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs]), "G") * 4)
Int timeMinutes = ceil(size(flatten([evalVcfs, comparisonVcfs, select_all([referenceFasta, dbsnpVCF])]), "G") * 20)
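# Example: 1G of eval VCFs, 0.5G of comparison VCFs, a 3G reference and a
# 0.5G dbSNP VCF add up to 5G, giving ceil(5 * 20) = 100 minutes.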
String dockerImage = "quay.io/biocontainers/gatk4:4.1.7.0--py38_0"
}
@@ -51,11 +51,11 @@ task MultiQC {
Boolean megaQCUpload = false # In my opinion this must be actively enabled. The tool's default is to upload.
File? config # A directory
String? clConfig
String memory = "4G"
Int timeMinutes = 120
String? memory
Int timeMinutes = 2 + ceil(size(reports, "G") * 8)
String dockerImage = "quay.io/biocontainers/multiqc:1.7--py_1"
}
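# Default memory scales with the total size of the reports: a 2G base plus
# roughly 1G per G of input.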
Int memoryGb = 2 + ceil(size(reports, "G"))
# This is where the reports end up. It does not need to be changed by the
# user. It is full of symbolic links, so it is not of any use to the user
@@ -132,7 +132,7 @@ task MultiQC {
}
runtime {
memory: memory
memory: select_first([memory, "~{memoryGb}G"])
time_minutes: timeMinutes
docker: dockerImage
}
@@ -87,7 +87,8 @@ task CollectMultipleMetrics {
String memory = "9G"
String javaXmx = "8G"
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
# The additional factor 2 is there because Picard CollectMultipleMetrics reads the reference fasta twice.
Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
@@ -203,7 +204,8 @@ task CollectRnaSeqMetrics {
String memory = "9G"
String javaXmx = "8G"
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
# With 6 minutes per G there were several timeouts.
Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12)
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
@@ -35,7 +35,7 @@ task GenomeGenerate {
command {
set -e
mkdir -p "$(dirname ~{genomeDir})"
mkdir -p ~{genomeDir}
STAR \
--runMode genomeGenerate \
--runThreadN ~{threads} \
@@ -50,7 +50,7 @@ task GenomeGenerate {
File chrNameLength = "~{genomeDir}/chrNameLength.txt"
File chrName = "~{genomeDir}/chrName.txt"
File chrStart = "~{genomeDir}/chrStart.txt"
File genome = "~{genomeDir}/genome.txt"
File genome = "~{genomeDir}/Genome"
File genomeParameters = "~{genomeDir}/genomeParameters.txt"
File sa = "~{genomeDir}/SA"
File saIndex = "~{genomeDir}/SAindex"
@@ -106,11 +106,18 @@ task Star {
Int? limitBAMsortRAM
Int runThreadN = 4
String memory = "~{5 + ceil(size(indexFiles, "G"))}G"
Int timeMinutes = 1 + ceil(size(flatten([inputR1, inputR2]), "G") * 180 / runThreadN)
String? memory
# 1 minute initialization + time reading in index (1 minute per G) + time aligning data.
Int timeMinutes = 1 + ceil(size(indexFiles, "G")) + ceil(size(flatten([inputR1, inputR2]), "G") * 300 / runThreadN)
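# Example: a 27G index and 10G of fastq data on 4 threads gives
# 1 + 27 + ceil(10 * 300 / 4) = 778 minutes.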
String dockerImage = "quay.io/biocontainers/star:2.7.3a--0"
}
# Use a margin of 30% index size. Real memory usage is ~30 GiB for a 27 GiB index.
Int memoryGb = 1 + ceil(size(indexFiles, "G") * 1.3)
# For some reason doing the above calculation inside a string does not
# work, so we solve it with an optional memory string and select_first
# in the runtime section.
#TODO Could be extended for all possible output extensions
Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"}
@@ -142,7 +149,7 @@ task Star {
runtime {
cpu: runThreadN
memory: memory
memory: select_first([memory, "~{memoryGb}G"])
time_minutes: timeMinutes
docker: dockerImage
}
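The same optional-memory pattern is used in the MultiQC task above. A minimal
sketch of the pattern, with assumed task and input names:

    version 1.0

    task Example {
        input {
            File data
            String? memory  # optional user override
        }
        # Computed default; per the comment above, doing this calculation
        # inside the input's default string did not work, hence select_first.
        Int memoryGb = 1 + ceil(size(data, "G"))
        command {
            echo "placeholder"
        }
        runtime {
            memory: select_first([memory, "~{memoryGb}G"])
        }
    }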