Merge pull request #8 from biowdl/run_time

Run time settings and additional adjustments

Merge pull request #8 from biowdl/run_time
0ee95d1b · Peter van 't Hof · GitHub · 592164c0 · 689a2f2c · 0ee95d1b
Unverified commit 0ee95d1b, authored by Peter van 't Hof 6 years ago; committed by GitHub 6 years ago.
--- a/biopet.wdl
+++ b/biopet.wdl
@@ -32,11 +32,15 @@ task ScatterRegions {
    Int? scatterSize
    File? regions

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
        mkdir -p ${outputDirPath}
-        java -Xmx2G -jar ${tool_jar} \
+        java -Xmx${mem}G -jar ${tool_jar} \
          -R ${ref_fasta} \
          -o ${outputDirPath} \
          ${"-s " + scatterSize} \
@@ -46,6 +50,10 @@ task ScatterRegions {
    output {
        Array[File] scatters = glob(outputDirPath + "/scatter-*.bed")
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 2.0]))
+    }
 }

 task SampleConfig {
@@ -58,11 +66,15 @@ task SampleConfig {
    String? jsonOutputPath
    String? tsvOutputPath

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        mkdir -p . $(dirname ${jsonOutputPath}) $(dirname ${tsvOutputPath})
-        java -jar ${tool_jar} \
+        mkdir -p . ${"$(dirname " + jsonOutputPath + ")"} ${"$(dirname " + tsvOutputPath + ")"}
+        java -Xmx${mem}G -jar ${tool_jar} \
        -i ${sep="-i " inputFiles} \
        ${"--sample " + sample} \
        ${"--library " + library} \
@@ -77,6 +89,10 @@ task SampleConfig {
        File? tsvOutput = tsvOutputPath
        Object values = if (defined(tsvOutput) && size(tsvOutput) > 0) then read_map(tsvOutput) else { "": "" }
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 2.0]))
+    }
 }

 task BaseCounter {
@@ -87,11 +103,15 @@ task BaseCounter {
    String outputDir
    String prefix

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 12.0]))
    command {
        set -e -o pipefail
        ${preCommand}
        mkdir -p ${outputDir}
-        java -jar ${tool_jar} \
+        java -Xmx${mem}G -jar ${tool_jar} \
        -b ${bam} \
        -r ${refFlat} \
        -o ${outputDir} \
@@ -134,4 +154,8 @@ task BaseCounter {
        File transcriptIntronicSense = outputDir + "/" + prefix + ".base.transcript.intronic.sense.counts"
        File transcriptSense = outputDir + "/" + prefix + ".base.transcript.sense.counts"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }
--- a/bwa.wdl
+++ b/bwa.wdl
@@ -6,15 +6,23 @@ task BwaMem {
    String outputPath
    String? readgroup

+    Int? threads
+    Int? memory
+
    command {
        set -e -o pipefail
        mkdir -p $(dirname ${outputPath})
        ${preCommand}
-        bwa mem ${"-R '" + readgroup + "'"} \
+        bwa mem ${"-t " + threads} \
+        ${"-R '" + readgroup + "'"} \
        ${referenceFasta} ${inputR1} ${inputR2} | samtools sort --output-fmt BAM - > ${outputPath}
    }

    output {
        File bamFile = outputPath
    }
+    runtime {
+        cpu: select_first([threads, 1])  # falls back to 1 when threads is not supplied
+        memory: select_first([memory, 8])  # falls back to 8 when memory is not supplied
+    }
 }
--- a/common.wdl
+++ b/common.wdl
 task objectMd5 {
    Object the_object
+
    command {
        cat ${write_object(the_object)} |  md5sum - | sed -e 's/  -//'
    }
+
    output {
        String md5sum = read_string(stdout())
    }
+
+    runtime {
+        memory: 1
+    }
 }

 task mapMd5 {
    Map[String,String] map
+
    command {
-    cat ${write_map(map)} | md5sum - | sed -e 's/  -//'
+        cat ${write_map(map)} | md5sum - | sed -e 's/  -//'
    }
+
    output {
        String md5sum = read_string(stdout())
    }
+
+    runtime {
+        memory: 1
+    }
 }

 task stringArrayMd5 {
    Array[String] stringArray
+
    command {
-    set -eu -o pipefail
-    echo ${sep=',' stringArray} | md5sum - | sed -e 's/  -//'
+        set -eu -o pipefail
+        echo ${sep=',' stringArray} | md5sum - | sed -e 's/  -//'
    }
+
    output {
-    String md5sum = read_string(stdout())
+        String md5sum = read_string(stdout())
+    }
+
+    runtime {
+        memory: 1
    }
 }

@@ -33,37 +51,68 @@ task concatenateTextFiles {
    Array[File] fileList
    String combinedFilePath
    Boolean? unzip=false
+
    command {
-        mkdir -p ${combinedFilePath}
-        rm -d ${combinedFilePath}
+        set -e -o pipefail  # stop on the first failing step, as the other tasks do
+        mkdir -p $(dirname ${combinedFilePath})  # create only the parent directory of the output file
        ${true='zcat' false= 'cat' unzip} ${sep=' ' fileList} \
        > ${combinedFilePath}
    }
+
    output {
        File combinedFile = combinedFilePath
    }
+
+    runtime {
+        memory: 1
+    }
 }

 # inspired by https://gatkforums.broadinstitute.org/wdl/discussion/9616/is-there-a-way-to-flatten-arrays
 task flattenStringArray {
    Array[Array[String]] arrayList
+
    command {
-    for line in $(echo ${sep=', ' arrayList}) ; \
-    do echo $line | tr -d '"[],' ; done
+        for line in $(echo ${sep=', ' arrayList}) ; \
+        do echo $line | tr -d '"[],' ; done
    }
+
    output {
        Array[String] flattenedArray = read_lines(stdout())
    }
+
+    runtime {
+        memory: 1
+    }
 }

 task appendToStringArray {
    Array[String] array
    String string
+
    command {
        echo "${sep='\n' array}
        ${string}"
    }
+
    output {
        Array[String] out_array = read_lines(stdout())
    }
+
+    runtime {
+        memory: 1
+    }
+}
+
+task createLink {
+    File inputFile
+    String outputPath
+
+    command {
+        ln -sf ${inputFile} ${outputPath}
+    }
+
+    output {
+        File link = outputPath
+    }
 }
\ No newline at end of file
--- a/fastqc.wdl
+++ b/fastqc.wdl
@@ -62,10 +62,15 @@ task extractAdapters {
    File? knownAdapterFile
    Float? adapterCutoff
    Boolean? outputAsFasta
+
+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
    set -e
    mkdir -p ${outputDir}
-    java -jar ${extractAdaptersFastqcJar} \
+    java -Xmx${mem}G -jar ${extractAdaptersFastqcJar} \
    --inputFile ${inputFile} \
    ${"--adapterOutputFile " + adapterOutputFilePath } \
    ${"--contamsOutputFile " + contamsOutputFilePath } \
@@ -82,20 +87,30 @@ task extractAdapters {
        Array[String] adapterList = read_lines(select_first([adapterOutputFilePath]))
        Array[String] contamsList = read_lines(select_first([contamsOutputFilePath]))
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 2.5]))
+    }
 }

 task getConfiguration {
    String? preCommand
    String? fastqcDirFile = "fastqcDir.txt"
+
    command {
        set -e -o pipefail
        ${preCommand}
        echo $(dirname $(readlink -f $(which fastqc))) > ${fastqcDirFile}
    }
+
    output {
        String fastqcDir = read_string(fastqcDirFile)
        File adapterList = fastqcDir + "/Configuration/adapter_list.txt"
        File contaminantList = fastqcDir + "/Configuration/contaminant_list.txt"
        File limits = fastqcDir + "/Configuration/limits.txt"
    }
+
+    runtime {
+        memory: 1
+    }
 }
\ No newline at end of file
--- a/gatk.wdl
+++ b/gatk.wdl
@@ -12,10 +12,14 @@ task BaseRecalibrator {
    File ref_fasta
    File ref_fasta_index

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms4G -jar ${gatk_jar} \
+        java -Xms${mem}G -jar ${gatk_jar} \
          BaseRecalibrator \
          -R ${ref_fasta} \
          -I ${input_bam} \
@@ -24,9 +28,14 @@ task BaseRecalibrator {
          --known-sites ${sep=" --known-sites " known_indels_sites_VCFs} \
          -L ${sep=" -L " sequence_group_interval}
    }
+
    output {
        File recalibration_report = "${recalibration_report_filename}"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Apply Base Quality Score Recalibration (BQSR) model
@@ -42,10 +51,15 @@ task ApplyBQSR {
    File ref_fasta_index
    Int? compression_level

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xms4G -jar ${gatk_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xms${mem}G -jar ${gatk_jar} \
          ApplyBQSR \
          --create-output-bam-md5 \
          --add-output-sam-program-record \
@@ -57,10 +71,15 @@ task ApplyBQSR {
          --static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \
          -L ${sep=" -L " sequence_group_interval}
    }
+
    output {
        File recalibrated_bam = "${output_bam_path}"
        File recalibrated_bam_checksum = "${output_bam_path}.md5"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Combine multiple recalibration tables from scattered BaseRecalibrator runs
@@ -70,17 +89,26 @@ task GatherBqsrReports {
    Array[File] input_bqsr_reports
    String output_report_filepath

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms3G -jar ${gatk_jar} \
+        java -Xms${mem}G -jar ${gatk_jar} \
        GatherBQSRReports \
        -I ${sep=' -I ' input_bqsr_reports} \
        -O ${output_report_filepath}
    }
+
    output {
        File output_bqsr_report = "${output_report_filepath}"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Call variants on a single sample with HaplotypeCaller to produce a GVCF
@@ -97,10 +125,15 @@ task HaplotypeCallerGvcf {
    Int? compression_level
    String gatk_jar

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xmx${mem}G -jar ${gatk_jar} \
          HaplotypeCaller \
          -R ${ref_fasta} \
          -O ${gvcf_basename}.vcf.gz \
@@ -109,10 +142,15 @@ task HaplotypeCallerGvcf {
          -contamination ${default=0 contamination} \
          -ERC GVCF
    }
+
    output {
        File output_gvcf = "${gvcf_basename}.vcf.gz"
        File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 task GenotypeGVCFs {
@@ -133,12 +171,16 @@ task GenotypeGVCFs {
    File dbsnp_vcf_index

    Int? compression_level
+    Float? memory
+    Float? memoryMultiplier

+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}

-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xmx${mem}G -jar ${gatk_jar} \
         GenotypeGVCFs \
         -R ${ref_fasta} \
         -O ${output_basename + ".vcf.gz"} \
@@ -154,6 +196,10 @@ task GenotypeGVCFs {
        File output_vcf = output_basename + ".vcf.gz"
        File output_vcf_index = output_basename + ".vcf.gz.tbi"
    }
+
+    runtime{
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 task CombineGVCFs {
@@ -171,13 +217,17 @@ task CombineGVCFs {
    File ref_dict

    Int? compression_level
+    Float? memory
+    Float? memoryMultiplier

+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}

        if [ ${length(gvcf_files)} -gt 1 ]; then
-            java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${gatk_jar} \
+            java ${"-Dsamjdk.compression_level=" + compression_level} \
+            -Xmx${mem}G -jar ${gatk_jar} \
             CombineGVCFs \
             -R ${ref_fasta} \
             -O ${output_basename + ".vcf.gz"} \
@@ -193,6 +243,10 @@ task CombineGVCFs {
        File output_gvcf = output_basename + ".vcf.gz"
        File output_gvcf_index = output_basename + ".vcf.gz.tbi"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 task SplitNCigarReads {
@@ -206,11 +260,15 @@ task SplitNCigarReads {
    String gatk_jar
    Array[File]+ intervals

+    Float? memory
+    Float? memoryMultiplier

+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms4G -jar ${gatk_jar} \
+        java -Xms${mem}G -jar ${gatk_jar} \
+        SplitNCigarReads \
        -I ${input_bam} \
        -R ${ref_fasta} \
-        -O ${output_bam} # might have to be -o depending on GATK version \
+        -O ${output_bam} \
@@ -221,4 +279,8 @@ task SplitNCigarReads {
        File bam = output_bam
        File bam_index = output_bam + ".bai"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }
--- a/htseq.wdl
+++ b/htseq.wdl
 task HTSeqCount {
    String? preCommand
    Array[File] alignmentFiles
-    File gffFile
+    File gtfFile
    String outputTable
    String? format
    String? order
    String? stranded

+    Int? memory
+
    command {
        set -e -o pipefail
        ${preCommand}
@@ -15,11 +17,15 @@ task HTSeqCount {
        -r ${default="pos" order} \
        -s ${default="no" stranded} \
        ${sep=" " alignmentFiles} \
-        ${gffFile} \
+        ${gtfFile} \
        > ${outputTable}
    }

    output {
        File counts = outputTable
    }
+
+    runtime {
+        memory: select_first([memory, 3])
+    }
 }
\ No newline at end of file
--- a/mergecounts.wdl
+++ b/mergecounts.wdl
@@ -32,4 +32,8 @@ task MergeCounts {
    output {
        File mergedCounts = outputFile
    }
+
+    runtime {
+        memory: 4 + (2*length(inputFiles))
+    }
 }
\ No newline at end of file
--- a/picard.wdl
+++ b/picard.wdl
@@ -4,11 +4,15 @@ task ScatterIntervalList {
    Int scatter_count
    String picard_jar

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
        mkdir scatter_list
-        java -Xmx4G -jar ${picard_jar} \
+        java -Xmx${mem}G -jar ${picard_jar} \
          IntervalListTools \
          SCATTER_COUNT=${scatter_count} \
          SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
@@ -17,10 +21,15 @@ task ScatterIntervalList {
          INPUT=${interval_list} \
          OUTPUT=scatter_list
    }
+
    output {
        Array[File] out = glob("scatter_list/*/*.interval_list")
        Int interval_count = read_int(stdout())
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
@@ -31,21 +40,31 @@ task GatherBamFiles {
    Int? compression_level
    String picard_jar

+    Float? memory
+    Float? memoryMultiplier
+
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xmx${mem}G -jar ${picard_jar} \
          GatherBamFiles \
          INPUT=${sep=' INPUT=' input_bams} \
          OUTPUT=${output_bam_path} \
          CREATE_INDEX=true \
          CREATE_MD5_FILE=true
    }
+
    output {
        File output_bam = "${output_bam_path}"
        File output_bam_index = sub(output_bam_path, ".bam$", ".bai")
        File output_bam_md5 = "${output_bam_path}.md5"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Mark duplicate reads to avoid counting non-independent observations
@@ -57,6 +76,9 @@ task MarkDuplicates {
    Int? compression_level
    String picard_jar

+    Float? memory
+    Float? memoryMultiplier
+
    # The program default for READ_NAME_REGEX is appropriate in nearly every case.
    # Sometimes we wish to supply "null" in order to turn off optical duplicate detection
    # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing
@@ -65,11 +87,13 @@ task MarkDuplicates {
    # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly
    # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment.
    # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
        mkdir -p $(dirname ${output_bam_path})
-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xmx${mem}G -jar ${picard_jar} \
          MarkDuplicates \
          INPUT=${sep=' INPUT=' input_bams} \
          OUTPUT=${output_bam_path} \
@@ -81,11 +105,16 @@ task MarkDuplicates {
          CREATE_INDEX=true \
          ADD_PG_TAG_TO_READS=false
    }
+
    output {
        File output_bam = output_bam_path
        File output_bam_index = sub(output_bam_path, ".bam$", ".bai")
        File duplicate_metrics = metrics_path
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }

 # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
@@ -97,18 +126,28 @@ task MergeVCFs {
    Int? compression_level
    String picard_jar

+    Float? memory
+    Float? memoryMultiplier
+
    # Using MergeVcfs instead of GatherVcfs so we can create indices
    # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket
+    Int mem = ceil(select_first([memory, 4.0]))
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} -Xmx4G -jar ${picard_jar} \
+        java ${"-Dsamjdk.compression_level=" + compression_level} \
+        -Xmx${mem}G -jar ${picard_jar} \
          MergeVcfs \
          INPUT=${sep=' INPUT=' input_vcfs} \
          OUTPUT=${output_vcf_path}
    }
+
    output {
        File output_vcf = output_vcf_path
        File output_vcf_index = output_vcf_path + ".tbi"
    }
+
+    runtime {
+        memory: ceil(mem * select_first([memoryMultiplier, 1.5]))
+    }
 }
\ No newline at end of file
--- a/samtools.wdl
+++ b/samtools.wdl
@@ -21,7 +21,12 @@ task Merge {
    command {
        set -e -o pipefail
        ${preCommand}
-        samtools merge ${outputBamPath} ${sep=' ' bamFiles}
+        if [ ${length(bamFiles)} -gt 1 ]
+          then
+            samtools merge ${outputBamPath} ${sep=' ' bamFiles}
+          else
+            ln -sf ${sep=' ' bamFiles} ${outputBamPath}  # single input: link it instead of merging; arrays need sep to be interpolated
+        fi
    }

    output {

--- a/star.wdl
+++ b/star.wdl
@@ -2,7 +2,7 @@ task Star {
    String? preCommand

    Array[File] inputR1
-    Array[File]? inputR2
+    Array[File?] inputR2
    String genomeDir
    String outFileNamePrefix

@@ -13,9 +13,14 @@ task Star {
    String? twopassMode
    Array[String]? outSAMattrRGline

+    Int? memory
+
    #TODO needs to be extended for all possible output extensions
    Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"}

+    # converts String? to String for use as key (for the Map above) in output
+    String key = select_first([outSAMtype, "BAM SortedByCoordinate"])
+
    command {
        set -e -o pipefail
        mkdir -p ${sub(outFileNamePrefix, basename(outFileNamePrefix) + "$", "")}
@@ -33,10 +38,11 @@ task Star {
    }

    output {
-        File bamFile = outFileNamePrefix + "Aligned." +  samOutputNames["${outSAMtype}"]
+        File bamFile = outFileNamePrefix + "Aligned." +  samOutputNames[key]
    }

    runtime {
-        threads: runThreadN
+        cpu: select_first([runThreadN, 1])
+        memory: select_first([memory, 10])
    }
 }
\ No newline at end of file
--- a/stringtie.wdl
+++ b/stringtie.wdl
 task Stringtie {
    String? preCommand
    File alignedReads
-    File? referenceGFF
+    File? referenceGtf
    Int? threads
    String assembledTranscriptsFile
    Boolean? firstStranded
@@ -13,7 +13,7 @@ task Stringtie {
        ${preCommand}
        stringtie \
        ${"-p " + threads} \
-        ${"-G " + referenceGFF} \
+        ${"-G " + referenceGtf} \
        ${true="--rf" false="" firstStranded} \
-        ${true="fr" false="" secondStranded} \
+        ${true="--fr" false="" secondStranded} \
        -o ${assembledTranscriptsFile} \
@@ -28,6 +28,6 @@ task Stringtie {
    }

    runtime {
-        threads: threads
+        cpu: select_first([threads, 1])
    }
 }
\ No newline at end of file