From 58c0da169b4b4f208e8c1f9a4298d60a4b6100bc Mon Sep 17 00:00:00 2001 From: Ruben Vorderman <r.h.p.vorderman@lumc.nl> Date: Mon, 23 Jul 2018 13:38:56 +0200 Subject: [PATCH] picard to 1.0 --- picard.wdl | 330 ++++++++++++++++++++++++++++------------------------- 1 file changed, 172 insertions(+), 158 deletions(-) diff --git a/picard.wdl b/picard.wdl index 444af3b..1095f4c 100644 --- a/picard.wdl +++ b/picard.wdl @@ -1,29 +1,32 @@ -task CollectMultipleMetrics { - String? preCommand - File bamFile - File bamIndex - File refFasta - File refDict - File refFastaIndex - String basename - - # These should proably be optional, but I'm not sure how to handle the ouput in that - # case (without a null literal). - Boolean collectAlignmentSummaryMetrics = true - Boolean collectInsertSizeMetrics = true - Boolean qualityScoreDistribution = true - Boolean meanQualityByCycle = true - Boolean collectBaseDistributionByCycle = true - Boolean collectGcBiasMetrics = true - #Boolean? rnaSeqMetrics = false # There is a bug in picard https://github.com/broadinstitute/picard/issues/999 - Boolean collectSequencingArtifactMetrics = true - Boolean collectQualityYieldMetrics = true - - String? picardJar - - Float? memory - Float? memoryMultiplier +version 1.0 +task CollectMultipleMetrics { + input { + String? preCommand + File bamFile + File bamIndex + File refFasta + File refDict + File refFastaIndex + String basename + + # These should proably be optional, but I'm not sure how to handle the ouput in that + # case (without a null literal). + Boolean collectAlignmentSummaryMetrics = true + Boolean collectInsertSizeMetrics = true + Boolean qualityScoreDistribution = true + Boolean meanQualityByCycle = true + Boolean collectBaseDistributionByCycle = true + Boolean collectGcBiasMetrics = true + #Boolean? rnaSeqMetrics = false # There is a bug in picard https://github.com/broadinstitute/picard/issues/999 + Boolean collectSequencingArtifactMetrics = true + Boolean collectQualityYieldMetrics = true + + String? picardJar + + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 4.0])) String toolCommand = if defined(picardJar) @@ -32,23 +35,23 @@ task CollectMultipleMetrics { command { set -e -o pipefail - mkdir -p $(dirname "${basename}") - ${preCommand} - ${toolCommand} \ + mkdir -p $(dirname "~{basename}") + ~{preCommand} + ~{toolCommand} \ CollectMultipleMetrics \ - I=${bamFile} \ - R=${refFasta} \ - O=${basename} \ + I=~{bamFile} \ + R=~{refFasta} \ + O=~{basename} \ PROGRAM=null \ - ${true="PROGRAM=CollectAlignmentSummaryMetrics" false="" collectAlignmentSummaryMetrics} \ - ${true="PROGRAM=CollectInsertSizeMetrics" false="" collectInsertSizeMetrics} \ - ${true="PROGRAM=QualityScoreDistribution" false="" qualityScoreDistribution} \ - ${true="PROGRAM=MeanQualityByCycle" false="" meanQualityByCycle} \ - ${true="PROGRAM=CollectBaseDistributionByCycle" false="" collectBaseDistributionByCycle} \ - ${true="PROGRAM=CollectGcBiasMetrics" false="" collectGcBiasMetrics} \ - ${true="PROGRAM=CollectSequencingArtifactMetrics" false="" + ~{true="PROGRAM=CollectAlignmentSummaryMetrics" false="" collectAlignmentSummaryMetrics} \ + ~{true="PROGRAM=CollectInsertSizeMetrics" false="" collectInsertSizeMetrics} \ + ~{true="PROGRAM=QualityScoreDistribution" false="" qualityScoreDistribution} \ + ~{true="PROGRAM=MeanQualityByCycle" false="" meanQualityByCycle} \ + ~{true="PROGRAM=CollectBaseDistributionByCycle" false="" collectBaseDistributionByCycle} \ + ~{true="PROGRAM=CollectGcBiasMetrics" false="" collectGcBiasMetrics} \ + ~{true="PROGRAM=CollectSequencingArtifactMetrics" false="" collectSequencingArtifactMetrics} \ - ${true="PROGRAM=CollectQualityYieldMetrics" false="" collectQualityYieldMetrics} + ~{true="PROGRAM=CollectQualityYieldMetrics" false="" collectQualityYieldMetrics} } output { @@ -78,18 +81,19 @@ task CollectMultipleMetrics { } task CollectRnaSeqMetrics { - String? preCommand - File bamFile - File bamIndex - File refRefflat - String basename - String? strandSpecificity = "NONE" - - String? picardJar - - Float? memory - Float? memoryMultiplier - + input { + String? preCommand + File bamFile + File bamIndex + File refRefflat + String basename + String? strandSpecificity = "NONE" + + String? picardJar + + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 4.0])) String toolCommand = if defined(picardJar) @@ -98,15 +102,15 @@ task CollectRnaSeqMetrics { command { set -e -o pipefail - mkdir -p $(dirname "${basename}") - ${preCommand} - ${toolCommand} \ + mkdir -p $(dirname "~{basename}") + ~{preCommand} + ~{toolCommand} \ CollectRnaSeqMetrics \ - I=${bamFile} \ - O=${basename}.RNA_Metrics \ - CHART_OUTPUT=${basename}.RNA_Metrics.pdf \ - ${"STRAND_SPECIFICITY=" + strandSpecificity} \ - REF_FLAT=${refRefflat} + I=~{bamFile} \ + O=~{basename}.RNA_Metrics \ + CHART_OUTPUT=~{basename}.RNA_Metrics.pdf \ + ~{"STRAND_SPECIFICITY=" + strandSpecificity} \ + REF_FLAT=~{refRefflat} } output { @@ -120,20 +124,22 @@ task CollectRnaSeqMetrics { } task CollectTargetedPcrMetrics { - String? preCommand - File bamFile - File bamIndex - File refFasta - File refDict - File refFastaIndex - File ampliconIntervals - Array[File]+ targetIntervals - String basename - - String? picardJar - - Float? memory - Float? memoryMultiplier + input { + String? preCommand + File bamFile + File bamIndex + File refFasta + File refDict + File refFastaIndex + File ampliconIntervals + Array[File]+ targetIntervals + String basename + + String? picardJar + + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 4.0])) @@ -143,17 +149,17 @@ task CollectTargetedPcrMetrics { command { set -e -o pipefail - mkdir -p $(dirname "${basename}") - ${preCommand} - ${toolCommand} \ + mkdir -p $(dirname "~{basename}") + ~{preCommand} + ~{toolCommand} \ CollectTargetedPcrMetrics \ - I=${bamFile} \ - R=${refFasta} \ - AMPLICON_INTERVALS=${ampliconIntervals} \ - TARGET_INTERVALS=${sep=" TARGET_INTERVALS=" targetIntervals} \ - O=${basename}.targetPcrMetrics \ - PER_BASE_COVERAGE=${basename}.targetPcrPerBaseCoverage \ - PER_TARGET_COVERAGE=${basename}.targetPcrPerTargetCoverage + I=~{bamFile} \ + R=~{refFasta} \ + AMPLICON_INTERVALS=~{ampliconIntervals} \ + TARGET_INTERVALS=~{sep=" TARGET_INTERVALS=" targetIntervals} \ + O=~{basename}.targetPcrMetrics \ + PER_BASE_COVERAGE=~{basename}.targetPcrPerBaseCoverage \ + PER_TARGET_COVERAGE=~{basename}.targetPcrPerTargetCoverage } output { @@ -169,14 +175,16 @@ task CollectTargetedPcrMetrics { # Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs task GatherBamFiles { - String? preCommand - Array[File]+ input_bams - String output_bam_path - Int? compression_level - String? picardJar - - Float? memory - Float? memoryMultiplier + input { + String? preCommand + Array[File]+ input_bams + String output_bam_path + Int? compression_level + String? picardJar + + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 4.0])) @@ -186,19 +194,19 @@ task GatherBamFiles { command { set -e -o pipefail - ${preCommand} - ${toolCommand} \ + ~{preCommand} + ~{toolCommand} \ GatherBamFiles \ - INPUT=${sep=' INPUT=' input_bams} \ - OUTPUT=${output_bam_path} \ + INPUT=~{sep=' INPUT=' input_bams} \ + OUTPUT=~{output_bam_path} \ CREATE_INDEX=true \ CREATE_MD5_FILE=true } output { - File output_bam = "${output_bam_path}" + File output_bam = "~{output_bam_path}" File output_bam_index = sub(output_bam_path, ".bam$", ".bai") - File output_bam_md5 = "${output_bam_path}.md5" + File output_bam_md5 = "~{output_bam_path}.md5" } runtime { @@ -208,21 +216,23 @@ task GatherBamFiles { # Mark duplicate reads to avoid counting non-independent observations task MarkDuplicates { - String? preCommand - Array[File] input_bams - String output_bam_path - String metrics_path - Int? compression_level - String? picardJar - - Float? memory - Float? memoryMultiplier - - # The program default for READ_NAME_REGEX is appropriate in nearly every case. - # Sometimes we wish to supply "null" in order to turn off optical duplicate detection - # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing - String? read_name_regex + input { + String? preCommand + Array[File] input_bams + String output_bam_path + String metrics_path + Int? compression_level + String? picardJar + + Float? memory + Float? memoryMultiplier + + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection + # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing + String? read_name_regex + } # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" @@ -234,15 +244,15 @@ task MarkDuplicates { command { set -e -o pipefail - ${preCommand} - mkdir -p $(dirname ${output_bam_path}) - ${toolCommand} \ + ~{preCommand} + mkdir -p $(dirname ~{output_bam_path}) + ~{toolCommand} \ MarkDuplicates \ - INPUT=${sep=' INPUT=' input_bams} \ - OUTPUT=${output_bam_path} \ - METRICS_FILE=${metrics_path} \ + INPUT=~{sep=' INPUT=' input_bams} \ + OUTPUT=~{output_bam_path} \ + METRICS_FILE=~{metrics_path} \ VALIDATION_STRINGENCY=SILENT \ - ${"READ_NAME_REGEX=" + read_name_regex} \ + ~{"READ_NAME_REGEX=" + read_name_regex} \ OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ CLEAR_DT="false" \ CREATE_INDEX=true \ @@ -262,16 +272,17 @@ task MarkDuplicates { # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs task MergeVCFs { - String? preCommand - Array[File] inputVCFs - Array[File] inputVCFsIndexes - String outputVCFpath - Int? compressionLevel - String? picardJar - - Float? memory - Float? memoryMultiplier - + input { + String? preCommand + Array[File] inputVCFs + Array[File] inputVCFsIndexes + String outputVCFpath + Int? compressionLevel + String? picardJar + + Float? memory + Float? memoryMultiplier + } # Using MergeVcfs instead of GatherVcfs so we can create indices # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket Int mem = ceil(select_first([memory, 4.0])) @@ -282,11 +293,11 @@ task MergeVCFs { command { set -e -o pipefail - ${preCommand} - ${toolCommand} \ + ~{preCommand} + ~{toolCommand} \ MergeVcfs \ - INPUT=${sep=' INPUT=' inputVCFs} \ - OUTPUT=${outputVCFpath} + INPUT=~{sep=' INPUT=' inputVCFs} \ + OUTPUT=~{outputVCFpath} } output { @@ -300,14 +311,16 @@ task MergeVCFs { } task SamToFastq { - String? preCommand - File inputBam - String outputRead1 - String? outputRead2 - String? outputUnpaired - String? picardJar - Float? memory - Float? memoryMultiplier + input { + String? preCommand + File inputBam + String outputRead1 + String? outputRead2 + String? outputUnpaired + String? picardJar + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 16.0])) # High memory default to avoid crashes. String toolCommand = if defined(picardJar) @@ -316,13 +329,13 @@ task SamToFastq { command { set -e -o pipefail - ${preCommand} - ${toolCommand} \ + ~{preCommand} + ~{toolCommand} \ SamToFastq \ - I=${inputBam} \ - ${"FASTQ=" + outputRead1} \ - ${"SECOND_END_FASTQ=" + outputRead2} \ - ${"UNPAIRED_FASTQ=" + outputUnpaired} + I=~{inputBam} \ + ~{"FASTQ=" + outputRead1} \ + ~{"SECOND_END_FASTQ=" + outputRead2} \ + ~{"UNPAIRED_FASTQ=" + outputUnpaired} } output { @@ -337,14 +350,15 @@ task SamToFastq { } task ScatterIntervalList { - String? preCommand - File interval_list - Int scatter_count - String? picardJar - - Float? memory - Float? memoryMultiplier - + input { + String? preCommand + File interval_list + Int scatter_count + String? picardJar + + Float? memory + Float? memoryMultiplier + } Int mem = ceil(select_first([memory, 4.0])) String toolCommand = if defined(picardJar) @@ -353,15 +367,15 @@ task ScatterIntervalList { command { set -e -o pipefail - ${preCommand} + ~{preCommand} mkdir scatter_list - ${toolCommand} \ + ~{toolCommand} \ IntervalListTools \ - SCATTER_COUNT=${scatter_count} \ + SCATTER_COUNT=~{scatter_count} \ SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ UNIQUE=true \ SORT=true \ - INPUT=${interval_list} \ + INPUT=~{interval_list} \ OUTPUT=scatter_list } -- GitLab