Merge pull request #17 from biowdl/BIOWDL-25

Caching related changes

Merge pull request #17 from biowdl/BIOWDL-25
c5eb0a76 · Peter van 't Hof · GitHub · e75a3008 · 4177a251 · c5eb0a76
Unverified commit c5eb0a76, authored 6 years ago by Peter van 't Hof; committed by GitHub 6 years ago.
--- a/biopet.wdl
+++ b/biopet.wdl
@@ -117,25 +117,24 @@ task extractAdaptersFastqc {
 task FastqSplitter {
    String? preCommand
    File inputFastq
-    String outputPath
-    Int numberChunks
-    File toolJar
-    Array[Int] chunks = range(numberChunks)
+    Array[String] outputPaths
+    String toolJar

    command {
        set -e -o pipefail
        ${preCommand}
-        mkdir -p ${sep=' ' prefix(outputPath + "/chunk_", chunks)}
-        if [ ${numberChunks} -gt 1 ]; then
-            SEP="/${basename(inputFastq)} -o "
-            java -jar ${toolJar} -I ${inputFastq} -o ${sep='$SEP' prefix(outputPath + "/chunk_", chunks)}/${basename(inputFastq)}
-        else
-            ln -sf ${inputFastq} ${outputPath}/chunk_0/${basename(inputFastq)}
-        fi
+        mkdir -p $(dirname ${sep=') $(dirname ' outputPaths})
+        if [ ${length(outputPaths)} -gt 1 ]; then
+            java -jar ${toolJar} \
+            -I ${inputFastq} \
+            -o ${sep=' -o ' outputPaths}
+          else
+            ln -sf ${inputFastq} ${outputPaths[0]}
+          fi
    }

    output {
-        Array[File] outputFastqFiles = glob(outputPath + "/chunk_*/" + basename(inputFastq))
+        Array[File] chunks = outputPaths
    }
 }


--- a/common.wdl
+++ b/common.wdl
@@ -106,7 +106,9 @@ task appendToStringArray {
 }

 task createLink {
-    File inputFile
+    # Making this of type File will create a link to the copy of the file in the execution
+    # folder, instead of the actual file.
+    String inputFile
    String outputPath

    command {

--- a/gatk.wdl
+++ b/gatk.wdl
-# Generate Base Quality Score Recalibration (BQSR) model
-task BaseRecalibrator {
+# Apply Base Quality Score Recalibration (BQSR) model
+task ApplyBQSR {
    String? preCommand
-    String gatk_jar
-    String input_bam
-    String input_bam_index
-    String recalibration_report_filename
-    Array[File]+ sequence_group_interval
-    Array[File]+ known_indels_sites_VCFs
-    Array[File]+ known_indels_sites_indices
-    File ref_dict
-    File ref_fasta
-    File ref_fasta_index
+    File gatkJar
+    File inputBam
+    File inputBamIndex
+    String outputBamPath
+    File recalibrationReport
+    Array[File]+ sequenceGroupInterval
+    File refDict
+    File refFasta
+    File refFastaIndex
+    Int? compressionLevel

    Float? memory
    Float? memoryMultiplier
@@ -19,18 +19,23 @@ task BaseRecalibrator {
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms${mem}G -jar ${gatk_jar} \
-          BaseRecalibrator \
-          -R ${ref_fasta} \
-          -I ${input_bam} \
+        java ${"-Dsamjdk.compression_level=" + compressionLevel} \
+        -Xms${mem}G -jar ${gatkJar} \
+          ApplyBQSR \
+          --create-output-bam-md5 \
+          --add-output-sam-program-record \
+          -R ${refFasta} \
+          -I ${inputBam} \
          --use-original-qualities \
-          -O ${recalibration_report_filename} \
-          --known-sites ${sep=" --known-sites " known_indels_sites_VCFs} \
-          -L ${sep=" -L " sequence_group_interval}
+          -O ${outputBamPath} \
+          -bqsr ${recalibrationReport} \
+          --static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \
+          -L ${sep=" -L " sequenceGroupInterval}
    }

    output {
-        File recalibration_report = "${recalibration_report_filename}"
+        File recalibrated_bam = outputBamPath
+        File recalibrated_bam_checksum = outputBamPath + ".md5"
    }

    runtime {
@@ -38,18 +43,19 @@ task BaseRecalibrator {
    }
 }

-# Apply Base Quality Score Recalibration (BQSR) model
-task ApplyBQSR {
+# Generate Base Quality Score Recalibration (BQSR) model
+task BaseRecalibrator {
    String? preCommand
-    String gatk_jar
-    String input_bam
-    String output_bam_path
-    File recalibration_report
-    Array[String] sequence_group_interval
-    File ref_dict
-    File ref_fasta
-    File ref_fasta_index
-    Int? compression_level
+    File gatkJar
+    File inputBam
+    File inputBamIndex
+    String recalibrationReportPath
+    Array[File]+ sequenceGroupInterval
+    Array[File]+ knownIndelsSitesVCFs
+    Array[File]+ knownIndelsSitesIndices
+    File refDict
+    File refFasta
+    File refFastaIndex

    Float? memory
    Float? memoryMultiplier
@@ -58,23 +64,18 @@ task ApplyBQSR {
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} \
-        -Xms${mem}G -jar ${gatk_jar} \
-          ApplyBQSR \
-          --create-output-bam-md5 \
-          --add-output-sam-program-record \
-          -R ${ref_fasta} \
-          -I ${input_bam} \
+        java -Xms${mem}G -jar ${gatkJar} \
+          BaseRecalibrator \
+          -R ${refFasta} \
+          -I ${inputBam} \
          --use-original-qualities \
-          -O ${output_bam_path} \
-          -bqsr ${recalibration_report} \
-          --static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \
-          -L ${sep=" -L " sequence_group_interval}
+          -O ${recalibrationReportPath} \
+          --known-sites ${sep=" --known-sites " knownIndelsSitesVCFs} \
+          -L ${sep=" -L " sequenceGroupInterval}
    }

    output {
-        File recalibrated_bam = "${output_bam_path}"
-        File recalibrated_bam_checksum = "${output_bam_path}.md5"
+        File recalibrationReport = recalibrationReportPath
    }

    runtime {
@@ -82,13 +83,21 @@ task ApplyBQSR {
    }
 }

-# Combine multiple recalibration tables from scattered BaseRecalibrator runs
-task GatherBqsrReports {
+task CombineGVCFs {
    String? preCommand
-    String gatk_jar
-    Array[File] input_bqsr_reports
-    String output_report_filepath
+    Array[File]+ gvcfFiles
+    Array[File]+ gvcfFileIndexes
+    Array[File]+ intervals
+
+    String outputPath
+
+    String gatkJar

+    File refFasta
+    File refFastaIndex
+    File refDict
+
+    Int? compressionLevel
    Float? memory
    Float? memoryMultiplier

@@ -96,14 +105,24 @@ task GatherBqsrReports {
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms${mem}G -jar ${gatk_jar} \
-        GatherBQSRReports \
-        -I ${sep=' -I ' input_bqsr_reports} \
-        -O ${output_report_filepath}
+
+        if [ ${length(gvcfFiles)} -gt 1 ]; then
+            java ${"-Dsamjdk.compression_level=" + compressionLevel} \
+            -Xmx${mem}G -jar ${gatkJar} \
+             CombineGVCFs \
+             -R ${refFasta} \
+             -O ${outputPath} \
+             -V ${sep=' -V ' gvcfFiles} \
+             -L ${sep=' -L ' intervals}
+        else # TODO this should be handled in WDL
+            ln -sf ${select_first(gvcfFiles)} ${outputPath}
+            ln -sf ${select_first(gvcfFileIndexes)} ${outputPath}.tbi
+        fi
    }

    output {
-        File output_bqsr_report = "${output_report_filepath}"
+        File outputGVCF = outputPath
+        File outputGVCFindex = outputPath + ".tbi"
    }

    runtime {
@@ -111,19 +130,12 @@ task GatherBqsrReports {
    }
 }

-# Call variants on a single sample with HaplotypeCaller to produce a GVCF
-task HaplotypeCallerGvcf {
+# Combine multiple recalibration tables from scattered BaseRecalibrator runs
+task GatherBqsrReports {
    String? preCommand
-    Array[File]+ input_bams
-    Array[File]+ input_bams_index
-    Array[File]+ interval_list
-    String gvcf_basename
-    File ref_dict
-    File ref_fasta
-    File ref_fasta_index
-    Float? contamination
-    Int? compression_level
-    String gatk_jar
+    String gatkJar
+    Array[File] inputBQSRreports
+    String outputReportPath

    Float? memory
    Float? memoryMultiplier
@@ -132,20 +144,14 @@ task HaplotypeCallerGvcf {
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} \
-        -Xmx${mem}G -jar ${gatk_jar} \
-          HaplotypeCaller \
-          -R ${ref_fasta} \
-          -O ${gvcf_basename}.vcf.gz \
-          -I ${sep=" -I " input_bams} \
-          -L ${sep=' -L ' interval_list} \
-          -contamination ${default=0 contamination} \
-          -ERC GVCF
+        java -Xms${mem}G -jar ${gatkJar} \
+        GatherBQSRReports \
+        -I ${sep=' -I ' inputBQSRreports} \
+        -O ${outputReportPath}
    }

    output {
-        File output_gvcf = "${gvcf_basename}.vcf.gz"
-        File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
+        File outputBQSRreport = outputReportPath
    }

    runtime {
@@ -155,22 +161,22 @@ task HaplotypeCallerGvcf {

 task GenotypeGVCFs {
    String? preCommand
-    File gvcf_files
-    File gvcf_file_indexes
+    File gvcfFiles
+    File gvcfFileIndexes
    Array[File]+ intervals

-    String output_basename
+    String outputPath

-    String gatk_jar
+    String gatkJar

-    File ref_fasta
-    File ref_fasta_index
-    File ref_dict
+    File refFasta
+    File refFastaIndex
+    File refDict

-    File dbsnp_vcf
-    File dbsnp_vcf_index
+    File dbsnpVCF
+    File dbsnpVCFindex

-    Int? compression_level
+    Int? compressionLevel
    Float? memory
    Float? memoryMultiplier

@@ -179,22 +185,22 @@ task GenotypeGVCFs {
        set -e -o pipefail
        ${preCommand}

-        java ${"-Dsamjdk.compression_level=" + compression_level} \
-        -Xmx${mem}G -jar ${gatk_jar} \
+        java ${"-Dsamjdk.compression_level=" + compressionLevel} \
+        -Xmx${mem}G -jar ${gatkJar} \
         GenotypeGVCFs \
-         -R ${ref_fasta} \
-         -O ${output_basename + ".vcf.gz"} \
-         -D ${dbsnp_vcf} \
+         -R ${refFasta} \
+         -O ${outputPath} \
+         -D ${dbsnpVCF} \
         -G StandardAnnotation \
         --only-output-calls-starting-in-intervals \
         -new-qual \
-         -V ${gvcf_files} \
+         -V ${gvcfFiles} \
         -L ${sep=' -L ' intervals}
    }

    output {
-        File output_vcf = output_basename + ".vcf.gz"
-        File output_vcf_index = output_basename + ".vcf.gz.tbi"
+        File outputVCF = outputPath
+        File outputVCFindex = outputPath + ".tbi"
    }

    runtime{
@@ -202,21 +208,20 @@ task GenotypeGVCFs {
    }
 }

-task CombineGVCFs {
+# Call variants on a single sample with HaplotypeCaller to produce a GVCF
+task HaplotypeCallerGvcf {
    String? preCommand
-    Array[File]+ gvcf_files
-    Array[File]+ gvcf_file_indexes
-    Array[File]+ intervals
-
-    String output_basename
-
-    String gatk_jar
-
-    File ref_fasta
-    File ref_fasta_index
-    File ref_dict
+    Array[File]+ inputBams
+    Array[File]+ inputBamsIndex
+    Array[File]+ intervalList
+    String gvcfPath
+    File refDict
+    File refFasta
+    File refFastaIndex
+    Float? contamination
+    Int? compressionLevel
+    String gatkJar

-    Int? compression_level
    Float? memory
    Float? memoryMultiplier

@@ -224,24 +229,20 @@ task CombineGVCFs {
    command {
        set -e -o pipefail
        ${preCommand}
-
-        if [ ${length(gvcf_files)} -gt 1 ]; then
-            java ${"-Dsamjdk.compression_level=" + compression_level} \
-            -Xmx${mem}G -jar ${gatk_jar} \
-             CombineGVCFs \
-             -R ${ref_fasta} \
-             -O ${output_basename + ".vcf.gz"} \
-             -V ${sep=' -V ' gvcf_files} \
-             -L ${sep=' -L ' intervals}
-        else
-            ln -sf ${select_first(gvcf_files)} ${output_basename + ".vcf.gz"}
-            ln -sf ${select_first(gvcf_files)}.tbi ${output_basename + ".vcf.gz.tbi"}
-        fi
+        java ${"-Dsamjdk.compression_level=" + compressionLevel} \
+        -Xmx${mem}G -jar ${gatkJar} \
+          HaplotypeCaller \
+          -R ${refFasta} \
+          -O ${gvcfPath} \
+          -I ${sep=" -I " inputBams} \
+          -L ${sep=' -L ' intervalList} \
+          -contamination ${default=0 contamination} \
+          -ERC GVCF
    }

    output {
-        File output_gvcf = output_basename + ".vcf.gz"
-        File output_gvcf_index = output_basename + ".vcf.gz.tbi"
+        File outputGVCF = gvcfPath
+        File outputGVCFindex = gvcfPath + ".tbi"
    }

    runtime {
@@ -252,13 +253,13 @@ task CombineGVCFs {
 task SplitNCigarReads {
    String? preCommand

-    File input_bam
-    File input_bam_index
-    File ref_fasta
-    File ref_fasta_index
-    File ref_dict
-    String output_bam
-    String gatk_jar
+    File inputBam
+    File inputBamIndex
+    File refFasta
+    File refFastaIndex
+    File refDict
+    String outputBam
+    String gatkJar
    Array[File]+ intervals

    Float? memory
@@ -268,17 +269,17 @@ task SplitNCigarReads {
    command {
        set -e -o pipefail
        ${preCommand}
-        java -Xms${mem}G -jar ${gatk_jar} \
+        java -Xms${mem}G -jar ${gatkJar} \
        SplitNCigarReads \
-        -I ${input_bam} \
-        -R ${ref_fasta} \
-        -O ${output_bam} \
+        -I ${inputBam} \
+        -R ${refFasta} \
+        -O ${outputBam} \
        -L ${sep=' -L ' intervals}
    }

    output {
-        File bam = output_bam
-        File bam_index = sub(output_bam, "\\.bam$", ".bai")
+        File bam = outputBam
+        File bamIndex = sub(outputBam, "\\.bam$", ".bai")
    }

    runtime {

--- a/picard.wdl
+++ b/picard.wdl
@@ -120,11 +120,11 @@ task MarkDuplicates {
 # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
 task MergeVCFs {
    String? preCommand
-    Array[File] input_vcfs
-    Array[File] input_vcfs_indexes
-    String output_vcf_path
-    Int? compression_level
-    String picard_jar
+    Array[File] inputVCFs
+    Array[File] inputVCFsIndexes
+    String outputVCFpath
+    Int? compressionLevel
+    String picardJar

    Float? memory
    Float? memoryMultiplier
@@ -135,16 +135,16 @@ task MergeVCFs {
    command {
        set -e -o pipefail
        ${preCommand}
-        java ${"-Dsamjdk.compression_level=" + compression_level} \
-        -Xmx${mem}G -jar ${picard_jar} \
+        java ${"-Dsamjdk.compression_level=" + compressionLevel} \
+        -Xmx${mem}G -jar ${picardJar} \
          MergeVcfs \
-          INPUT=${sep=' INPUT=' input_vcfs} \
-          OUTPUT=${output_vcf_path}
+          INPUT=${sep=' INPUT=' inputVCFs} \
+          OUTPUT=${outputVCFpath}
    }

    output {
-        File output_vcf = output_vcf_path
-        File output_vcf_index = output_vcf_path + ".tbi"
+        File outputVCF = outputVCFpath
+        File outputVCFindex = outputVCFpath + ".tbi"
    }

    runtime {