Skip to content
Snippets Groups Projects
Unverified Commit 0ee95d1b authored by Peter van 't Hof's avatar Peter van 't Hof Committed by GitHub
Browse files

Merge pull request #8 from biowdl/run_time

Run time settings and additional adjustments
parents 592164c0 689a2f2c
No related branches found
No related tags found
No related merge requests found
......@@ -32,11 +32,15 @@ task ScatterRegions {
Int? scatterSize
File? regions
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
mkdir -p ${outputDirPath}
java -Xmx2G -jar ${tool_jar} \
java -Xmx${mem}G -jar ${tool_jar} \
-R ${ref_fasta} \
-o ${outputDirPath} \
${"-s " + scatterSize} \
......@@ -46,6 +50,10 @@ task ScatterRegions {
output {
Array[File] scatters = glob(outputDirPath + "/scatter-*.bed")
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 2.0]))
}
}
task SampleConfig {
......@@ -58,11 +66,15 @@ task SampleConfig {
String? jsonOutputPath
String? tsvOutputPath
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
mkdir -p . $(dirname ${jsonOutputPath}) $(dirname ${tsvOutputPath})
java -jar ${tool_jar} \
mkdir -p . ${"$(dirname " + jsonOutputPath + ")"} ${"$(dirname " + tsvOutputPath + ")"}
java -Xmx${mem}G -jar ${tool_jar} \
-i ${sep="-i " inputFiles} \
${"--sample " + sample} \
${"--library " + library} \
......@@ -77,6 +89,10 @@ task SampleConfig {
File? tsvOutput = tsvOutputPath
Object values = if (defined(tsvOutput) && size(tsvOutput) > 0) then read_map(tsvOutput) else { "": "" }
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 2.0]))
}
}
task BaseCounter {
......@@ -87,11 +103,15 @@ task BaseCounter {
String outputDir
String prefix
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 12.0]))
command {
set -e -o pipefail
${preCommand}
mkdir -p ${outputDir}
java -jar ${tool_jar} \
java -Xmx${mem}G -jar ${tool_jar} \
-b ${bam} \
-r ${refFlat} \
-o ${outputDir} \
......@@ -134,4 +154,8 @@ task BaseCounter {
File transcriptIntronicSense = outputDir + "/" + prefix + ".base.transcript.intronic.sense.counts"
File transcriptSense = outputDir + "/" + prefix + ".base.transcript.sense.counts"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
......@@ -6,15 +6,23 @@ task BwaMem {
String outputPath
String? readgroup
Int? threads
Int? memory
command {
set -e -o pipefail
mkdir -p $(dirname ${outputPath})
${preCommand}
bwa mem ${"-R '" + readgroup + "'"} \
bwa mem ${"-t " + threads} \
${"-R '" + readgroup + "'"} \
${referenceFasta} ${inputR1} ${inputR2} | samtools sort --output-fmt BAM - > ${outputPath}
}
output {
File bamFile = outputPath
}
runtime{
cpu: if defined(threads) then threads else 1
memory: if defined(memory) then memory else 8
}
}
task objectMd5 {
    # Object to hash. It is serialised to a file with write_object() and the
    # checksum is taken over that file's contents.
    Object the_object

    command {
        cat ${write_object(the_object)} | md5sum - | sed -e 's/ -//'
    }

    output {
        # stdout of the pipeline above: md5sum's output after sed has removed
        # the first " -" (the stdin file-name marker).
        String md5sum = read_string(stdout())
    }

    runtime {
        memory: 1
    }
}
task mapMd5 {
    # Map to hash. It is serialised to a file with write_map() and the
    # checksum is taken over that file's contents.
    Map[String,String] map

    command {
        cat ${write_map(map)} | md5sum - | sed -e 's/ -//'
    }

    output {
        # The command must print exactly one checksum line: read_string()
        # returns the whole of stdout, so a second invocation of the pipeline
        # would leak a second line into this value.
        String md5sum = read_string(stdout())
    }

    runtime {
        memory: 1
    }
}
task stringArrayMd5 {
    # Strings to hash. They are joined with ',' and passed through echo, so
    # the checksum covers the joined text as echo prints it.
    # NOTE(review): the interpolation is unquoted, so the shell will
    # word-split/glob the joined text before echo sees it. Quoting it would
    # change existing checksums, so it is left as is - confirm with callers.
    Array[String] stringArray

    command {
        set -eu -o pipefail
        echo ${sep=',' stringArray} | md5sum - | sed -e 's/ -//'
    }

    output {
        # Declared once: md5sum's output after sed removed the first " -".
        String md5sum = read_string(stdout())
    }

    runtime {
        memory: 1
    }
}
......@@ -33,37 +51,68 @@ task concatenateTextFiles {
Array[File] fileList
String combinedFilePath
Boolean? unzip=false
command {
mkdir -p ${combinedFilePath}
rm -d ${combinedFilePath}
${true='zcat' false= 'cat' unzip} ${sep=' ' fileList} \
> ${combinedFilePath}
}
output {
File combinedFile = combinedFilePath
}
runtime {
memory: 1
}
}
# inspired by https://gatkforums.broadinstitute.org/wdl/discussion/9616/is-there-a-way-to-flatten-arrays
task flattenStringArray {
    # Nested array to flatten into a single Array[String].
    Array[Array[String]] arrayList

    command {
        for line in $(echo ${sep=', ' arrayList}) ; \
        do echo $line | tr -d '"[],' ; done
    }

    output {
        # One element per line printed by the loop. The loop iterates over
        # shell words and tr deletes every '"', '[', ']' and ',' character.
        # NOTE(review): elements containing whitespace or any of those
        # characters will therefore be split or altered - confirm inputs
        # never contain them.
        Array[String] flattenedArray = read_lines(stdout())
    }

    runtime {
        memory: 1
    }
}
task appendToStringArray {
# Builds a new string array consisting of `array` followed by `string`.
# Existing elements; printed first, joined with the '\n' separator.
Array[String] array
# Element printed after all elements of `array`.
String string
command {
echo "${sep='\n' array}
${string}"
}
output {
# Every line of stdout becomes one element of the result.
# NOTE(review): an element that itself contains a newline would presumably
# come back as several elements - confirm against callers.
Array[String] out_array = read_lines(stdout())
}
runtime {
memory: 1
}
}
task createLink {
    # File the symbolic link will point at.
    File inputFile
    # Path at which the link is created; -f replaces an existing entry there.
    String outputPath

    command {
        ln -sf ${inputFile} ${outputPath}
    }

    output {
        # The link itself, exposed as a File at outputPath.
        File link = outputPath
    }
}
\ No newline at end of file
......@@ -62,10 +62,15 @@ task extractAdapters {
File? knownAdapterFile
Float? adapterCutoff
Boolean? outputAsFasta
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e
mkdir -p ${outputDir}
java -jar ${extractAdaptersFastqcJar} \
java -Xmx${mem}G -jar ${extractAdaptersFastqcJar} \
--inputFile ${inputFile} \
${"--adapterOutputFile " + adapterOutputFilePath } \
${"--contamsOutputFile " + contamsOutputFilePath } \
......@@ -82,20 +87,30 @@ task extractAdapters {
Array[String] adapterList = read_lines(select_first([adapterOutputFilePath]))
Array[String] contamsList = read_lines(select_first([contamsOutputFilePath]))
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 2.5]))
}
}
task getConfiguration {
    # Optional shell snippet run before the lookup.
    String? preCommand
    # File the resolved FastQC directory is written to.
    String? fastqcDirFile = "fastqcDir.txt"

    command {
        set -e -o pipefail
        ${preCommand}
        echo $(dirname $(readlink -f $(which fastqc))) > ${fastqcDirFile}
    }

    output {
        # Directory of the fastqc executable found on PATH, after symlinks
        # were resolved with readlink -f.
        String fastqcDir = read_string(fastqcDirFile)
        # Files located under the "Configuration" sub-directory of fastqcDir.
        File adapterList = fastqcDir + "/Configuration/adapter_list.txt"
        File contaminantList = fastqcDir + "/Configuration/contaminant_list.txt"
        File limits = fastqcDir + "/Configuration/limits.txt"
    }

    runtime {
        memory: 1
    }
}
\ No newline at end of file
......@@ -12,10 +12,14 @@ task BaseRecalibrator {
File ref_fasta
File ref_fasta_index
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java -Xms4G -jar ${gatk_jar} \
java -Xms${mem}G -jar ${gatk_jar} \
BaseRecalibrator \
-R ${ref_fasta} \
-I ${input_bam} \
......@@ -24,9 +28,14 @@ task BaseRecalibrator {
--known-sites ${sep=" --known-sites " known_indels_sites_VCFs} \
-L ${sep=" -L " sequence_group_interval}
}
output {
File recalibration_report = "${recalibration_report_filename}"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Apply Base Quality Score Recalibration (BQSR) model
......@@ -42,10 +51,15 @@ task ApplyBQSR {
File ref_fasta_index
Int? compression_level
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java ${"-Dsamjdk.compression_level=" + compression_level} -Xms4G -jar ${gatk_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xms${mem}G -jar ${gatk_jar} \
ApplyBQSR \
--create-output-bam-md5 \
--add-output-sam-program-record \
......@@ -57,10 +71,15 @@ task ApplyBQSR {
--static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \
-L ${sep=" -L " sequence_group_interval}
}
output {
File recalibrated_bam = "${output_bam_path}"
File recalibrated_bam_checksum = "${output_bam_path}.md5"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Combine multiple recalibration tables from scattered BaseRecalibrator runs
......@@ -70,17 +89,26 @@ task GatherBqsrReports {
Array[File] input_bqsr_reports
String output_report_filepath
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java -Xms3G -jar ${gatk_jar} \
java -Xms${mem}G -jar ${gatk_jar} \
GatherBQSRReports \
-I ${sep=' -I ' input_bqsr_reports} \
-O ${output_report_filepath}
}
output {
File output_bqsr_report = "${output_report_filepath}"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
......@@ -97,10 +125,15 @@ task HaplotypeCallerGvcf {
Int? compression_level
String gatk_jar
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${gatk_jar} \
HaplotypeCaller \
-R ${ref_fasta} \
-O ${gvcf_basename}.vcf.gz \
......@@ -109,10 +142,15 @@ task HaplotypeCallerGvcf {
-contamination ${default=0 contamination} \
-ERC GVCF
}
output {
File output_gvcf = "${gvcf_basename}.vcf.gz"
File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
task GenotypeGVCFs {
......@@ -133,12 +171,16 @@ task GenotypeGVCFs {
File dbsnp_vcf_index
Int? compression_level
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${gatk_jar} \
GenotypeGVCFs \
-R ${ref_fasta} \
-O ${output_basename + ".vcf.gz"} \
......@@ -154,6 +196,10 @@ task GenotypeGVCFs {
File output_vcf = output_basename + ".vcf.gz"
File output_vcf_index = output_basename + ".vcf.gz.tbi"
}
runtime{
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
task CombineGVCFs {
......@@ -171,13 +217,17 @@ task CombineGVCFs {
File ref_dict
Int? compression_level
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
if [ ${length(gvcf_files)} -gt 1 ]; then
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${gatk_jar} \
CombineGVCFs \
-R ${ref_fasta} \
-O ${output_basename + ".vcf.gz"} \
......@@ -193,6 +243,10 @@ task CombineGVCFs {
File output_gvcf = output_basename + ".vcf.gz"
File output_gvcf_index = output_basename + ".vcf.gz.tbi"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
task SplitNCigarReads {
......@@ -206,11 +260,15 @@ task SplitNCigarReads {
String gatk_jar
Array[File]+ intervals
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java -Xms4G -jar ${gatk_jar} \
java -Xms${mem}G -jar ${gatk_jar} \
SplitNCigarReads \
-I ${input_bam} \
-R ${ref_fasta} \
-O ${output_bam} # might have to be -o depending on GATK version \
......@@ -221,4 +279,8 @@ task SplitNCigarReads {
File bam = output_bam
File bam_index = output_bam + ".bai"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
task HTSeqCount {
String? preCommand
Array[File] alignmentFiles
File gffFile
File gtfFile
String outputTable
String? format
String? order
String? stranded
Int? memory
command {
set -e -o pipefail
${preCommand}
......@@ -15,11 +17,15 @@ task HTSeqCount {
-r ${default="pos" order} \
-s ${default="no" stranded} \
${sep=" " alignmentFiles} \
${gffFile} \
${gtfFile} \
> ${outputTable}
}
output {
File counts = outputTable
}
runtime {
memory: select_first([memory, 3])
}
}
\ No newline at end of file
......@@ -32,4 +32,8 @@ task MergeCounts {
output {
File mergedCounts = outputFile
}
runtime {
memory: 4 + (2*length(inputFiles))
}
}
\ No newline at end of file
......@@ -4,11 +4,15 @@ task ScatterIntervalList {
Int scatter_count
String picard_jar
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
mkdir scatter_list
java -Xmx4G -jar ${picard_jar} \
java -Xmx${mem}G -jar ${picard_jar} \
IntervalListTools \
SCATTER_COUNT=${scatter_count} \
SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
......@@ -17,10 +21,15 @@ task ScatterIntervalList {
INPUT=${interval_list} \
OUTPUT=scatter_list
}
output {
Array[File] out = glob("scatter_list/*/*.interval_list")
Int interval_count = read_int(stdout())
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
......@@ -31,21 +40,31 @@ task GatherBamFiles {
Int? compression_level
String picard_jar
Float? memory
Float? memoryMultiplier
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${picard_jar} \
GatherBamFiles \
INPUT=${sep=' INPUT=' input_bams} \
OUTPUT=${output_bam_path} \
CREATE_INDEX=true \
CREATE_MD5_FILE=true
}
output {
File output_bam = "${output_bam_path}"
File output_bam_index = sub(output_bam_path, ".bam$", ".bai")
File output_bam_md5 = "${output_bam_path}.md5"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Mark duplicate reads to avoid counting non-independent observations
......@@ -57,6 +76,9 @@ task MarkDuplicates {
Int? compression_level
String picard_jar
Float? memory
Float? memoryMultiplier
# The program default for READ_NAME_REGEX is appropriate in nearly every case.
# Sometimes we wish to supply "null" in order to turn off optical duplicate detection
# This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing
......@@ -65,11 +87,13 @@ task MarkDuplicates {
# Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly
# This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment.
# While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
mkdir -p $(dirname ${output_bam_path})
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${picard_jar} \
MarkDuplicates \
INPUT=${sep=' INPUT=' input_bams} \
OUTPUT=${output_bam_path} \
......@@ -81,11 +105,16 @@ task MarkDuplicates {
CREATE_INDEX=true \
ADD_PG_TAG_TO_READS=false
}
output {
File output_bam = output_bam_path
File output_bam_index = sub(output_bam_path, ".bam$", ".bai")
File duplicate_metrics = metrics_path
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
......@@ -97,18 +126,28 @@ task MergeVCFs {
Int? compression_level
String picard_jar
Float? memory
Float? memoryMultiplier
# Using MergeVcfs instead of GatherVcfs so we can create indices
# See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket
Int mem = ceil(select_first([memory, 4.0]))
command {
set -e -o pipefail
${preCommand}
java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
java ${"-Dsamjdk.compression_level=" + compression_level} \
-Xmx${mem}G -jar ${picard_jar} \
MergeVcfs \
INPUT=${sep=' INPUT=' input_vcfs} \
OUTPUT=${output_vcf_path}
}
output {
File output_vcf = output_vcf_path
File output_vcf_index = output_vcf_path + ".tbi"
}
runtime {
memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
}
}
\ No newline at end of file
......@@ -21,7 +21,12 @@ task Merge {
command {
set -e -o pipefail
${preCommand}
samtools merge ${outputBamPath} ${sep=' ' bamFiles}
if [ ${length(bamFiles)} -gt 1 ]
then
samtools merge ${outputBamPath} ${sep=' ' bamFiles}
else
ln -sf ${bamFiles} ${outputBamPath}
fi
}
output {
......
......@@ -2,7 +2,7 @@ task Star {
String? preCommand
Array[File] inputR1
Array[File]? inputR2
Array[File?] inputR2
String genomeDir
String outFileNamePrefix
......@@ -13,9 +13,14 @@ task Star {
String? twopassMode
Array[String]? outSAMattrRGline
Int? memory
#TODO needs to be extended for all possible output extensions
Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"}
# converts String? to String for use as key (for the Map above) in output
String key = select_first([outSAMtype, "BAM SortedByCoordinate"])
command {
set -e -o pipefail
mkdir -p ${sub(outFileNamePrefix, basename(outFileNamePrefix) + "$", "")}
......@@ -33,10 +38,11 @@ task Star {
}
output {
File bamFile = outFileNamePrefix + "Aligned." + samOutputNames["${outSAMtype}"]
File bamFile = outFileNamePrefix + "Aligned." + samOutputNames[key]
}
runtime {
threads: runThreadN
cpu: select_first([runThreadN, 1])
memory: select_first([memory, 10])
}
}
\ No newline at end of file
task Stringtie {
String? preCommand
File alignedReads
File? referenceGFF
File? referenceGtf
Int? threads
String assembledTranscriptsFile
Boolean? firstStranded
......@@ -13,7 +13,7 @@ task Stringtie {
${preCommand}
stringtie \
${"-p " + threads} \
${"-G " + referenceGFF} \
${"-G " + referenceGtf} \
${true="--rf" false="" firstStranded} \
${true="fr" false="" secondStranded} \
-o ${assembledTranscriptsFile} \
......@@ -28,6 +28,6 @@ task Stringtie {
}
runtime {
threads: threads
cpu: select_first([threads, 1])
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment