diff --git a/CHANGELOG.md b/CHANGELOG.md index 18a90306edaa4ff27d7da72480e6271c3dbf4eab..05e657222fe9588d0d55231e71f075e4cc0a8437 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,17 @@ that users understand how the changes affect the new version. version 2.2.0-dev --------------------------- + Add `memory` input to fastqc task. ++ Added GATK CNV calling tasks: + + AnnotateIntervals + + CallCopyRatioSegments + + CollectAllelicCounts + + CollectReadCounts + + CreateReadCountPanelOfNormals + + DenoiseReadCounts + + ModelSegments + + PlotDenoisedCopyRatios + + PlotModeledSegments + + PreprocessIntervals + Add common.TextToFile task. + Add bedtools.Intersect. + Add `-o pipefail` to bedtools.MergeBedFiles to prevent errors in BED files diff --git a/gatk.wdl b/gatk.wdl index 0b4c71c701b6d7a2ef1013b6ddeef8f77ff6c612..eff98bf8b1260f14d9691c2ebb92235916ad4c21 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -1,5 +1,83 @@ version 1.0 +# Copyright (c) 2018 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task AnnotateIntervals { + input { + File referenceFasta + File referenceFastaDict + File referenceFastaFai + String annotatedIntervalsPath = "intervals.annotated.tsv" + File intervals + String intervalMergingRule = "OVERLAPPING_ONLY" + File? mappabilityTrack + File? segmentalDuplicationTrack + Int featureQueryLookahead = 1000000 + + String memory = "10G" + String javaXmx = "2G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{annotatedIntervalsPath})" + gatk --java-options -Xmx~{javaXmx} \ + AnnotateIntervals \ + -R ~{referenceFasta} \ + -L ~{intervals} \ + ~{"--mappability-track " + mappabilityTrack} \ + ~{"--segmental-duplication-track " + segmentalDuplicationTrack} \ + --feature-query-lookahead ~{featureQueryLookahead} \ + --interval-merging-rule ~{intervalMergingRule} \ + -O ~{annotatedIntervalsPath} + } + + output { + File annotatedIntervals = annotatedIntervalsPath + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + referenceFasta: {description: "The reference fasta file.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + annotatedIntervalsPath: {description: "The location the output should be written to.", category: "advanced"} + intervals: {description: "An interval list describing the intervals to annotate.", category: "required"} + intervalMergingRule: {description: "Equivalent to gatk AnnotateIntervals' `--interval-merging-rule` option.", category: "advanced"} + mappabilityTrack: 
{description: "Equivalent to gatk AnnotateIntervals' `--mappability-track` option.", category: "common"} + segmentalDuplicationTrack: {description: "Equivalent to gatk AnnotateIntervals' `--segmental-duplication-track` option.", category: "common"} + featureQueryLookahead: {description: "Equivalent to gatk AnnotateIntervals' `--feature-query-lookahead` option", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + # Apply Base Quality Score Recalibration (BQSR) model task ApplyBQSR { input { @@ -132,35 +210,29 @@ task BaseRecalibrator { } } -task CombineGVCFs { +task CalculateContamination { input { - Array[File]+ gvcfFiles - Array[File]+ gvcfFilesIndex - Array[File] intervals = [] - String outputPath - File referenceFasta - File referenceFastaDict - File referenceFastaFai + File tumorPileups + File? 
normalPileups String memory = "24G" String javaXmx = "12G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" } command { set -e - mkdir -p "$(dirname ~{outputPath})" gatk --java-options -Xmx~{javaXmx} \ - CombineGVCFs \ - -R ~{referenceFasta} \ - -O ~{outputPath} \ - -V ~{sep=' -V ' gvcfFiles} \ - ~{true='-L' false='' length(intervals) > 0} ~{sep=' -L ' intervals} + CalculateContamination \ + -I ~{tumorPileups} \ + ~{"-matched " + normalPileups} \ + -O "contamination.table" \ + --tumor-segmentation "segments.table" } output { - File outputVcf = outputPath - File outputVcfIndex = outputPath + ".tbi" + File contaminationTable = "contamination.table" + File mafTumorSegments = "segments.table" } runtime { @@ -169,16 +241,8 @@ task CombineGVCFs { } parameter_meta { - gvcfFiles: {description: "The GVCF files to be combined.", category: "required"} - gvcfFilesIndex: {description: "The indexes for the GVCF files.", caregory: "required"} - intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} - outputPath: {description: "The location the combined GVCF should be written to.", category: "required"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} - referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - + tumorPileups: {description: "The pileup summary of a tumor/case sample.", category: "required"} + normalPileups: {description: "The pileup summary of the normal/control sample.", category: "common"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -187,28 +251,28 @@ task CombineGVCFs { } } -# Combine multiple recalibration tables from scattered BaseRecalibrator runs -task GatherBqsrReports { +task CallCopyRatioSegments { input { - Array[File] inputBQSRreports - String outputReportPath + String outputPrefix + File copyRatioSegments - String memory = "12G" - String javaXmx = "4G" + String memory = "21G" + String javaXmx = "6G" String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e - mkdir -p "$(dirname ~{outputReportPath})" + mkdir -p "$(dirname ~{outputPrefix})" gatk --java-options -Xmx~{javaXmx} \ - GatherBQSRReports \ - -I ~{sep=' -I ' inputBQSRreports} \ - -O ~{outputReportPath} + CallCopyRatioSegments \ + -I ~{copyRatioSegments} \ + -O ~{outputPrefix}.called.seg } output { - File outputBQSRreport = outputReportPath + File calledSegments = outputPrefix + ".called.seg" + File calledSegmentsIgv = outputPrefix + ".called.igv.seg" } runtime { @@ -217,9 +281,8 @@ task GatherBqsrReports { } parameter_meta { - inputBQSRreports: {description: "The BQSR reports to be merged.", category: "required"} - outputReportPath: {description: "The location of the combined BQSR report.", category: "required"} - + outputPrefix: {description: "The prefix for the output files.", category: "required"} + copyRatioSegments: {description: "The copy ratios file generated by gatk ModelSegments.", category: "required"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -228,42 +291,33 @@ task GatherBqsrReports { } } -task GenotypeGVCFs { +task CollectAllelicCounts { input { - Array[File]+ gvcfFiles - Array[File]+ gvcfFilesIndex - Array[File]+ intervals - String outputPath + String allelicCountsPath = "allelic_counts.tsv" + File commonVariantSites + File inputBam + File inputBamIndex File referenceFasta File referenceFastaDict File referenceFastaFai - File? dbsnpVCF - File? dbsnpVCFIndex - - String memory = "18G" - String javaXmx = "6G" + String memory = "90G" + String javaXmx = "30G" String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e - mkdir -p "$(dirname ~{outputPath})" + mkdir -p "$(dirname ~{allelicCountsPath})" gatk --java-options -Xmx~{javaXmx} \ - GenotypeGVCFs \ + CollectAllelicCounts \ + -L ~{commonVariantSites} \ + -I ~{inputBam} \ -R ~{referenceFasta} \ - -O ~{outputPath} \ - ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - -new-qual \ - -V ~{sep=' -V ' gvcfFiles} \ - -L ~{sep=' -L ' intervals} + -O ~{allelicCountsPath} } output { - File outputVCF = outputPath - File outputVCFIndex = outputPath + ".tbi" - + File allelicCounts = allelicCountsPath } runtime { @@ -272,18 +326,13 @@ task GenotypeGVCFs { } parameter_meta { - gvcfFiles: {description: "The GVCF files to be genotypes.", category: "required"} - gvcfFilesIndex: {description: "The index of the input GVCF files.", category: "required"} - intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"} - outputPath: {description: "The location to write the output VCF file to.", category: "required"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - 
category: "required"} + allelicCountsPath: {description: "The path the output should be written to.", category: "advanced"} + commonVariantSites: {description: "Interval list of common variant sites (to retrieve the allelic counts for).", category: "required"} + inputBam: {description: "The BAM file to generate counts for.", category: "required"} + inputBamIndex: {description: "The index of the input BAM file.", category: "required"} + referenceFasta: {description: "The reference fasta file.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} - dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -292,46 +341,37 @@ task GenotypeGVCFs { } } -# Call variants on a single sample with HaplotypeCaller to produce a GVCF -task HaplotypeCallerGvcf { +task CollectReadCounts { input { - Array[File]+ inputBams - Array[File]+ inputBamsIndex - Array[File]+? intervalList - Array[File]+? excludeIntervalList - String gvcfPath + String countsPath = "readcounts.hdf5" + File intervals + File inputBam + File inputBamIndex File referenceFasta - File referenceFastaIndex File referenceFastaDict - Float contamination = 0.0 - File? dbsnpVCF - File? dbsnpVCFIndex - Int? 
ploidy + File referenceFastaFai + String intervalMergingRule = "OVERLAPPING_ONLY" - String memory = "12G" - String javaXmx = "4G" + String memory = "35G" + String javaXmx = "7G" String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e - mkdir -p "$(dirname ~{gvcfPath})" + mkdir -p "$(dirname ~{countsPath})" gatk --java-options -Xmx~{javaXmx} \ - HaplotypeCaller \ + CollectReadCounts \ + -L ~{intervals} \ + -I ~{inputBam} \ -R ~{referenceFasta} \ - -O ~{gvcfPath} \ - -I ~{sep=" -I " inputBams} \ - ~{"--sample-ploidy " + ploidy} \ - ~{true="-L" false="" defined(intervalList)} ~{sep=' -L ' intervalList} \ - ~{true="-XL" false="" defined(excludeIntervalList)} ~{sep=' -XL ' excludeIntervalList} \ - ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \ - -contamination ~{contamination} \ - -ERC GVCF + --format HDF5 \ + --interval-merging-rule ~{intervalMergingRule} \ + -O ~{countsPath} } output { - File outputGVCF = gvcfPath - File outputGVCFIndex = gvcfPath + ".tbi" + File counts = countsPath } runtime { @@ -340,21 +380,14 @@ task HaplotypeCallerGvcf { } parameter_meta { - inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"} - inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"} - intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"} - excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"} - gvcfPath: {description: "The location to write the output GVCF to.", category: "required"} - ploidy: {description: "The ploidy with which the variants should be called.", category: "common"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", - category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", - category: "required"} - 
referenceFastaIndex: {description: "The index for the reference fasta file.", category: "required"} - contamination: {description: "Equivalent to HaplotypeCaller's `-contamination` option.", category: "advanced"} - dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} - dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} - + countsPath: {description: "The location the output should be written to.", category: "advanced"} + intervals: {description: "The intervals to collect counts for.", category: "required"} + inputBam: {description: "The BAM file to determine the coverage for.", category: "required"} + inputBamIndex: {description: "The input BAM file's index.", category: "required"} + referenceFasta: {description: "The reference fasta file.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + intervalMergingRule: {description: "Equivalent to gatk CollectReadCounts' `--interval-merging-rule` option.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -363,50 +396,35 @@ task HaplotypeCallerGvcf { } } -task MuTect2 { +task CombineGVCFs { input { - Array[File]+ inputBams - Array[File]+ inputBamsIndex + Array[File]+ gvcfFiles + Array[File]+ gvcfFilesIndex + Array[File] intervals = [] + String outputPath File referenceFasta File referenceFastaDict File referenceFastaFai - String outputVcf - String tumorSample - String? normalSample - File? germlineResource - File? germlineResourceIndex - File? panelOfNormals - File? 
panelOfNormalsIndex - String f1r2TarGz = "f1r2.tar.gz" - Array[File]+ intervals - String outputStats = outputVcf + ".stats" - String memory = "16G" - String javaXmx = "4G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" + String memory = "24G" + String javaXmx = "12G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e - mkdir -p "$(dirname ~{outputVcf})" + mkdir -p "$(dirname ~{outputPath})" gatk --java-options -Xmx~{javaXmx} \ - Mutect2 \ + CombineGVCFs \ -R ~{referenceFasta} \ - -I ~{sep=" -I " inputBams} \ - -tumor ~{tumorSample} \ - ~{"-normal " + normalSample} \ - ~{"--germline-resource " + germlineResource} \ - ~{"--panel-of-normals " + panelOfNormals} \ - ~{"--f1r2-tar-gz " + f1r2TarGz} \ - -O ~{outputVcf} \ - -L ~{sep=" -L " intervals} + -O ~{outputPath} \ + -V ~{sep=' -V ' gvcfFiles} \ + ~{true='-L' false='' length(intervals) > 0} ~{sep=' -L ' intervals} } output { - File vcfFile = outputVcf - File vcfFileIndex = outputVcf + ".tbi" - File f1r2File = f1r2TarGz - File stats = outputStats + File outputVcf = outputPath + File outputVcfIndex = outputPath + ".tbi" } runtime { @@ -415,21 +433,16 @@ task MuTect2 { } parameter_meta { - inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"} - inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"} - referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} - referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + gvcfFiles: {description: "The GVCF files to be combined.", category: "required"} + gvcfFilesIndex: {description: "The indexes for the GVCF files.", category: "required"} + intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} + outputPath: {description: "The location the combined GVCF should be 
written to.", category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", + category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", + category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - outputVcf: {description: "The location to write the output VCF file to.", category: "required"} - tumorSample: {description: "The name of the tumor/case sample.", category: "required"} - normalSample: {description: "The name of the normal/control sample.", category: "common"} - germlineResource: {description: "Equivalent to Mutect2's `--germline-resource` option.", category: "advanced"} - germlineResourceIndex: {description: "The index for the germline resource.", category: "advanced"} - panelOfNormals: {description: "Equivalent to Mutect2's `--panel-of-normals` option.", category: "advanced"} - panelOfNormalsIndex: {description: "The index for the panel of normals.", category: "advanced"} - f1r2TarGz: {description: "Equivalent to Mutect2's `--f1r2-tar-gz` option.", category: "advanced"} - intervals: {description: "Bed files describing the regiosn to operate on.", category: "required"} - outputStats: {description: "The location the output statistics should be written to.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -438,61 +451,50 @@ task MuTect2 { } } -task LearnReadOrientationModel { +task CombineVariants { input { - Array[File]+ f1r2TarGz + File referenceFasta + File referenceFastaFai + File referenceFastaDict + String genotypeMergeOption = "UNIQUIFY" + String filteredRecordsMergeType = "KEEP_IF_ANY_UNFILTERED" + Array[String]+ identifiers + Array[File]+ variantVcfs # follow "identifiers" array order + Array[File]+ variantIndexes + String outputPath String memory = "24G" String javaXmx = "12G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" + String dockerImage = "broadinstitute/gatk3:3.8-1" } - command { + command <<< set -e - gatk --java-options -Xmx~{javaXmx} \ - LearnReadOrientationModel \ - -I ~{sep=" -I " f1r2TarGz} \ - -O "artifact-priors.tar.gz" - } - - output { - File artifactPriorsTable = "artifact-priors.tar.gz" - } - - runtime { - docker: dockerImage - memory: memory - } - - parameter_meta { - f1r2TarGz: {description: "A f1r2TarGz file outputed by mutect2.", category: "required"} - memory: {description: "The amount of memory this job will use.", category: "advanced"} - javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", - category: "advanced"} - dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", - category: "advanced"} - } -} - -task MergeStats { - input { - Array[File]+ stats - - String memory = "28G" - String javaXmx = "14G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" - } + mkdir -p "$(dirname ~{outputPath})" - command { - set -e - gatk --java-options -Xmx~{javaXmx} \ - MergeMutectStats \ - -stats ~{sep=" -stats " stats} \ - -O "merged.stats" - } + # build "-V:<ID> <file.vcf>" arguments according to IDs and VCFs to merge + # Make sure commands are run in bash + V_args=$(bash -c ' + set -eu + ids=(~{sep=" " identifiers}) + vars=(~{sep=" " variantVcfs}) + for (( i = 0; i < ${#ids[@]}; ++i )) + do + printf -- "-V:%s %s " "${ids[i]}" "${vars[i]}" + done + ') + java -Xmx~{javaXmx} -jar /usr/GenomeAnalysisTK.jar \ + -T CombineVariants \ + -R ~{referenceFasta} \ + --genotypemergeoption ~{genotypeMergeOption} \ + --filteredrecordsmergetype ~{filteredRecordsMergeType} \ + --out ~{outputPath} \ + $V_args + >>> output { - File mergedStats = "merged.stats" + File combinedVcf = outputPath + File combinedVcfIndex = outputPath + ".tbi" } runtime { @@ -501,7 +503,16 @@ task MergeStats { } parameter_meta { - stats: {description: "Statistics files to be merged.", category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + genotypeMergeOption: {description: "Equivalent to CombineVariants' `--genotypemergeoption` option.", category: "advanced"} + filteredRecordsMergeType: {description: "Equivalent to CombineVariants' `--filteredrecordsmergetype` option.", category: "advanced"} + identifiers: {description: "The sample identifiers in the same order as variantVcfs.", 
category: "required"} + variantVcfs: {description: "The input VCF files in the same order as identifiers.", category: "required"} + variantIndexes: {description: "The indexes of the input VCF files.", category: "required"} + outputPath: {description: "The location the output should be written to.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -510,33 +521,29 @@ task MergeStats { } } -task GetPileupSummaries { +task CreateReadCountPanelOfNormals { input { - File sampleBam - File sampleBamIndex - File variantsForContamination - File variantsForContaminationIndex - File sitesForContamination - File sitesForContaminationIndex - String outputPrefix + String PONpath = "PON.hdf5" + Array[File]+ readCountsFiles + File? annotatedIntervals - String memory = "24G" - String javaXmx = "12G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" + String memory = "21G" + String javaXmx = "7G" + String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer causes a spark related error for some reason... 
} command { set -e + mkdir -p "$(dirname ~{PONpath})" gatk --java-options -Xmx~{javaXmx} \ - GetPileupSummaries \ - -I ~{sampleBam} \ - -V ~{variantsForContamination} \ - -L ~{sitesForContamination} \ - -O ~{outputPrefix + "-pileups.table"} + CreateReadCountPanelOfNormals \ + -I ~{sep=" -I " readCountsFiles} \ + ~{"--annotated-intervals " + annotatedIntervals} \ + -O ~{PONpath} } output { - File pileups = outputPrefix + "-pileups.table" + File PON = PONpath } runtime { @@ -545,14 +552,10 @@ task GetPileupSummaries { } parameter_meta { - sampleBam: {description: "A BAM file for which a pileup should be created.", category: "required"} - sampleBamIndex: {description: "The index of the input BAM file.", category: "required"} - variantsForContamination: {description: "A VCF file with common variants.", category: "required"} - variantsForContaminationIndex: {description: "The index for the common variants VCF file.", category: "required"} - sitesForContamination: {description: "A bed file describing regions to operate on.", category: "required"} - sitesForContaminationIndex: {description: "The index for the bed file.", category: "required"} - outputPrefix: {description: "The prefix for the ouput.", category: "required"} - + PONpath: {description: "The location the PON should be written to.", category: "common"} + readCountsFiles: {description: "The read counts files as generated by CollectReadCounts.", category: "required"} + annotatedIntervals: {description: "An annotated set of intervals as generated by AnnotateIntervals. If provided, explicit GC correction will be performed.", + category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -561,29 +564,33 @@ task GetPileupSummaries { } } -task CalculateContamination { +task DenoiseReadCounts { input { - File tumorPileups - File? normalPileups + File? PON + File? annotatedIntervals + File readCounts + String outputPrefix - String memory = "24G" - String javaXmx = "12G" - String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" + String memory = "39G" + String javaXmx = "13G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e + mkdir -p "$(dirname ~{outputPrefix})" gatk --java-options -Xmx~{javaXmx} \ - CalculateContamination \ - -I ~{tumorPileups} \ - ~{"-matched " + normalPileups} \ - -O "contamination.table" \ - --tumor-segmentation "segments.table" + DenoiseReadCounts \ + -I ~{readCounts} \ + ~{"--count-panel-of-normals " + PON} \ + ~{"--annotated-intervals " + annotatedIntervals} \ + --standardized-copy-ratios ~{outputPrefix}.standardizedCR.tsv \ + --denoised-copy-ratios ~{outputPrefix}.denoisedCR.tsv } output { - File contaminationTable = "contamination.table" - File mafTumorSegments = "segments.table" + File standardizedCopyRatios = outputPrefix + ".standardizedCR.tsv" + File denoisedCopyRatios = outputPrefix + ".denoisedCR.tsv" } runtime { @@ -592,8 +599,11 @@ task CalculateContamination { } parameter_meta { - tumorPileups: {description: "The pileup summary of a tumor/case sample.", category: "required"} - normalPileups: {description: "The pileup summary of the normal/control sample.", category: "common"} + PON: {description: "A panel of normals as generated by CreateReadCountPanelOfNormals.", category: "advanced"} + annotatedIntervals: {description: "An annotated set of intervals as generated by AnnotateIntervals. 
Will be ignored if PON is provided.", + category: "advanced"} + readCounts: {description: "The read counts file as generated by CollectReadCounts.", category: "required"} + outputPrefix: {description: "The prefix for the output files.", category: "required"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} @@ -670,35 +680,83 @@ task FilterMutectCalls { } } -task SplitNCigarReads { +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +task GatherBqsrReports { input { - File inputBam - File inputBamIndex + Array[File] inputBQSRreports + String outputReportPath + + String memory = "12G" + String javaXmx = "4G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputReportPath})" + gatk --java-options -Xmx~{javaXmx} \ + GatherBQSRReports \ + -I ~{sep=' -I ' inputBQSRreports} \ + -O ~{outputReportPath} + } + + output { + File outputBQSRreport = outputReportPath + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + inputBQSRreports: {description: "The BQSR reports to be merged.", category: "required"} + outputReportPath: {description: "The location of the combined BQSR report.", category: "required"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task GenotypeGVCFs { + input { + Array[File]+ gvcfFiles + Array[File]+ gvcfFilesIndex + Array[File]+ intervals + String outputPath File referenceFasta File referenceFastaDict File referenceFastaFai - String outputBam - Array[File] intervals = [] + File? dbsnpVCF + File? dbsnpVCFIndex - String memory = "16G" - String javaXmx = "4G" + String memory = "18G" + String javaXmx = "6G" String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" } command { set -e - mkdir -p "$(dirname ~{outputBam})" + mkdir -p "$(dirname ~{outputPath})" gatk --java-options -Xmx~{javaXmx} \ - SplitNCigarReads \ - -I ~{inputBam} \ + GenotypeGVCFs \ -R ~{referenceFasta} \ - -O ~{outputBam} \ - ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} + -O ~{outputPath} \ + ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \ + -G StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + -new-qual \ + -V ~{sep=' -V ' gvcfFiles} \ + -L ~{sep=' -L ' intervals} } output { - File bam = outputBam - File bamIndex = sub(outputBam, "\.bam$", ".bai") + File outputVCF = outputPath + File outputVCFIndex = outputPath + ".tbi" + } runtime { @@ -707,15 +765,17 @@ task SplitNCigarReads { } parameter_meta { - inputBam: {description: "The BAM file for which spliced reads should be split.", category: "required"} - inputBamIndex: {description: "The input BAM file's index.", category: "required"} + gvcfFiles: {description: "The GVCF files to be genotyped.", category: "required"} + gvcfFilesIndex: {description: "The index of the input GVCF files.", category: "required"} + intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"} + outputPath: {description: "The location to write the output VCF file to.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", 
category: "required"} referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - outputBam: {description: "The location the output BAM file should be written.", category: "required"} - intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} + dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} + dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", @@ -725,50 +785,33 @@ task SplitNCigarReads { } } -task CombineVariants { +task GetPileupSummaries { input { - File referenceFasta - File referenceFastaFai - File referenceFastaDict - String genotypeMergeOption = "UNIQUIFY" - String filteredRecordsMergeType = "KEEP_IF_ANY_UNFILTERED" - Array[String]+ identifiers - Array[File]+ variantVcfs # follow "identifiers" array order - Array[File]+ variantIndexes - String outputPath + File sampleBam + File sampleBamIndex + File variantsForContamination + File variantsForContaminationIndex + File sitesForContamination + File sitesForContaminationIndex + String outputPrefix String memory = "24G" String javaXmx = "12G" - String dockerImage = "broadinstitute/gatk3:3.8-1" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" } - command <<< + command { set -e - mkdir -p "$(dirname ~{outputPath})" - - # build "-V:<ID> <file.vcf>" arguments according to IDs and VCFs to merge - # Make sure commands are run in bash - V_args=$(bash -c ' - set -eu - ids=(~{sep=" " identifiers}) - vars=(~{sep=" " variantVcfs}) - for (( i = 0; i < ${#ids[@]}; ++i )) - do - printf -- "-V:%s %s " "${ids[i]}" "${vars[i]}" - done - ') - 
java -Xmx~{javaXmx} -jar /usr/GenomeAnalysisTK.jar \ - -T CombineVariants \ - -R ~{referenceFasta} \ - --genotypemergeoption ~{genotypeMergeOption} \ - --filteredrecordsmergetype ~{filteredRecordsMergeType} \ - --out ~{outputPath} \ - $V_args - >>> + gatk --java-options -Xmx~{javaXmx} \ + GetPileupSummaries \ + -I ~{sampleBam} \ + -V ~{variantsForContamination} \ + -L ~{sitesForContamination} \ + -O ~{outputPrefix + "-pileups.table"} + } output { - File combinedVcf = outputPath - File combinedVcfIndex = outputPath + ".tbi" + File pileups = outputPrefix + "-pileups.table" } runtime { @@ -777,15 +820,514 @@ task CombineVariants { } parameter_meta { + sampleBam: {description: "A BAM file for which a pileup should be created.", category: "required"} + sampleBamIndex: {description: "The index of the input BAM file.", category: "required"} + variantsForContamination: {description: "A VCF file with common variants.", category: "required"} + variantsForContaminationIndex: {description: "The index for the common variants VCF file.", category: "required"} + sitesForContamination: {description: "A bed file describing regions to operate on.", category: "required"} + sitesForContaminationIndex: {description: "The index for the bed file.", category: "required"} + outputPrefix: {description: "The prefix for the ouput.", category: "required"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +# Call variants on a single sample with HaplotypeCaller to produce a GVCF +task HaplotypeCallerGvcf { + input { + Array[File]+ inputBams + Array[File]+ inputBamsIndex + Array[File]+? intervalList + Array[File]+? 
excludeIntervalList + String gvcfPath + File referenceFasta + File referenceFastaIndex + File referenceFastaDict + Float contamination = 0.0 + File? dbsnpVCF + File? dbsnpVCFIndex + Int? ploidy + + String memory = "12G" + String javaXmx = "4G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{gvcfPath})" + gatk --java-options -Xmx~{javaXmx} \ + HaplotypeCaller \ + -R ~{referenceFasta} \ + -O ~{gvcfPath} \ + -I ~{sep=" -I " inputBams} \ + ~{"--sample-ploidy " + ploidy} \ + ~{true="-L" false="" defined(intervalList)} ~{sep=' -L ' intervalList} \ + ~{true="-XL" false="" defined(excludeIntervalList)} ~{sep=' -XL ' excludeIntervalList} \ + ~{true="-D" false="" defined(dbsnpVCF)} ~{dbsnpVCF} \ + -contamination ~{contamination} \ + -ERC GVCF + } + + output { + File outputGVCF = gvcfPath + File outputGVCFIndex = gvcfPath + ".tbi" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"} + inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"} + intervalList: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"} + excludeIntervalList: {description: "Bed files or interval lists describing the regions to NOT operate on.", category: "common"} + gvcfPath: {description: "The location to write the output GVCF to.", category: "required"} + ploidy: {description: "The ploidy with which the variants should be called.", category: "common"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", + category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", + category: "required"} + referenceFastaIndex: {description: "The index for the reference fasta file.", category: "required"} + contamination: {description: 
"Equivalent to HaplotypeCaller's `-contamination` option.", category: "advanced"} + dbsnpVCF: {description: "A dbSNP VCF.", category: "common"} + dbsnpVCFIndex: {description: "The index for the dbSNP VCF.", category: "common"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + + +task LearnReadOrientationModel { + input { + Array[File]+ f1r2TarGz + + String memory = "24G" + String javaXmx = "12G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1" + } + + command { + set -e + gatk --java-options -Xmx~{javaXmx} \ + LearnReadOrientationModel \ + -I ~{sep=" -I " f1r2TarGz} \ + -O "artifact-priors.tar.gz" + } + + output { + File artifactPriorsTable = "artifact-priors.tar.gz" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + f1r2TarGz: {description: "A f1r2TarGz file outputed by mutect2.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task MergeStats { + input { + Array[File]+ stats + + String memory = "28G" + String javaXmx = "14G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + gatk --java-options -Xmx~{javaXmx} \ + MergeMutectStats \ + -stats ~{sep=" -stats " stats} \ + -O "merged.stats" + } + + output { + File mergedStats = "merged.stats" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + stats: {description: "Statistics files to be merged.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task ModelSegments { + input { + String outputDir = "." + String outputPrefix + File denoisedCopyRatios + File allelicCounts + File? 
normalAllelicCounts + Int minimumTotalAlleleCountCase = if defined(normalAllelicCounts) + then 0 + else 30 + Int maximumNumberOfSmoothingIterations = 10 + + String memory = "64G" + String javaXmx = "10G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p ~{outputDir} + gatk --java-options -Xmx~{javaXmx} \ + ModelSegments \ + --denoised-copy-ratios ~{denoisedCopyRatios} \ + --allelic-counts ~{allelicCounts} \ + ~{"--normal-allelic-counts " + normalAllelicCounts} \ + --minimum-total-allele-count-case ~{minimumTotalAlleleCountCase} \ + --maximum-number-of-smoothing-iterations ~{maximumNumberOfSmoothingIterations} \ + --output ~{outputDir} \ + --output-prefix ~{outputPrefix} + } + + output { + File hetrozygousAllelicCounts = outputDir + "/" + outputPrefix + ".hets.tsv" + File? normalHetrozygousAllelicCounts = outputDir + "/" + outputPrefix + ".hets.normal.tsv" + File copyRatioSegments = outputDir + "/" + outputPrefix + ".cr.seg" + File copyRatioCBS = outputDir + "/" + outputPrefix + ".cr.igv.seg" + File alleleFractionCBS = outputDir + "/" + outputPrefix + ".af.igv.seg" + File unsmoothedModeledSegments = outputDir + "/" + outputPrefix + ".modelBegin.seg" + File unsmoothedCopyRatioParameters = outputDir + "/" + outputPrefix + ".modelBegin.cr.param" + File unsmoothedAlleleFractionParameters = outputDir + "/" + outputPrefix + ".modelBegin.af.param" + File modeledSegments = outputDir + "/" + outputPrefix + ".modelFinal.seg" + File copyRatioParameters = outputDir + "/" + outputPrefix + ".modelFinal.cr.param" + File alleleFractionParameters = outputDir + "/" + outputPrefix + ".modelFinal.af.param" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + outputDir: {description: "The directory to write the ouput to.", category: "common"} + outputPrefix: {description: "The prefix of the output files. 
Should not include directories.", category: "required"} + denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"} + allelicCounts: {description: "The allelicCounts as generate by CollectAllelicCounts.", category: "required" } + normalAllelicCounts: {description: "The allelicCounts as generate by CollectAllelicCounts for a matched normal.", category: "common"} + minimumTotalAlleleCountCase: {description: "Equivalent to gatk ModelSeqments' `--minimum-total-allele-count-case` option.", category: "advanced"} + maximumNumberOfSmoothingIterations: {description: "Equivalent to gatk ModelSeqments' `--maximum-number-of-smoothing-iterations` option.", category: "advanced"} + + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task MuTect2 { + input { + Array[File]+ inputBams + Array[File]+ inputBamsIndex + File referenceFasta + File referenceFastaDict + File referenceFastaFai + String outputVcf + String tumorSample + String? normalSample + File? germlineResource + File? germlineResourceIndex + File? panelOfNormals + File? 
panelOfNormalsIndex + String f1r2TarGz = "f1r2.tar.gz" + Array[File]+ intervals + String outputStats = outputVcf + ".stats" + + String memory = "16G" + String javaXmx = "4G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputVcf})" + gatk --java-options -Xmx~{javaXmx} \ + Mutect2 \ + -R ~{referenceFasta} \ + -I ~{sep=" -I " inputBams} \ + -tumor ~{tumorSample} \ + ~{"-normal " + normalSample} \ + ~{"--germline-resource " + germlineResource} \ + ~{"--panel-of-normals " + panelOfNormals} \ + ~{"--f1r2-tar-gz " + f1r2TarGz} \ + -O ~{outputVcf} \ + -L ~{sep=" -L " intervals} + } + + output { + File vcfFile = outputVcf + File vcfFileIndex = outputVcf + ".tbi" + File f1r2File = f1r2TarGz + File stats = outputStats + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + inputBams: {description: "The BAM files on which to perform variant calling.", category: "required"} + inputBamsIndex: {description: "The indexes for the input BAM files.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} - genotypeMergeOption: {description: "Equivalent to CombineVariants' `--genotypemergeoption` option.", category: "advanced"} - filteredRecordsMergeType: {description: "Equivalent to CombineVariants' `--filteredrecordsmergetype` option.", category: "advanced"} - identifiers: {description: "The sample identifiers in the same order as variantVcfs.", category: "required"} - variantVcfs: {description: "The input VCF files in the same order as identifiers.", category: "required"} - variantIndexes: {description: "The indexes of the input VCF files.", category: "required"} - outputPath: 
{description: "The location the output should be written to", category: "required"} + outputVcf: {description: "The location to write the output VCF file to.", category: "required"} + tumorSample: {description: "The name of the tumor/case sample.", category: "required"} + normalSample: {description: "The name of the normal/control sample.", category: "common"} + germlineResource: {description: "Equivalent to Mutect2's `--germline-resource` option.", category: "advanced"} + germlineResourceIndex: {description: "The index for the germline resource.", category: "advanced"} + panelOfNormals: {description: "Equivalent to Mutect2's `--panel-of-normals` option.", category: "advanced"} + panelOfNormalsIndex: {description: "The index for the panel of normals.", category: "advanced"} + f1r2TarGz: {description: "Equivalent to Mutect2's `--f1r2-tar-gz` option.", category: "advanced"} + intervals: {description: "Bed files describing the regiosn to operate on.", category: "required"} + outputStats: {description: "The location the output statistics should be written to.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task PlotDenoisedCopyRatios { + input { + File referenceFastaDict + String outputDir = "." + String outputPrefix + File standardizedCopyRatios + File denoisedCopyRatios + + String memory = "32G" + String javaXmx = "7G" + String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer doesn't seem to contain R. 
+ } + + command { + set -e + mkdir -p ~{outputDir} + gatk --java-options -Xmx~{javaXmx} \ + PlotDenoisedCopyRatios \ + --standardized-copy-ratios ~{standardizedCopyRatios} \ + --denoised-copy-ratios ~{denoisedCopyRatios} \ + --sequence-dictionary ~{referenceFastaDict} \ + --output ~{outputDir} \ + --output-prefix ~{outputPrefix} + } + + output { + File denoisedCopyRatiosPlot = outputDir + "/" + outputPrefix + ".denoised.png" + File denoisedCopyRatiosLimitedPlot = outputDir + "/" + outputPrefix + ".denoisedLimit4.png" + File standardizedMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".standardizedMAD.txt" + File denoisedMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".denoisedMAD.txt" + File deltaMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".deltaMAD.txt" + File deltaScaledMedianAbsoluteDeviation = outputDir + "/" + outputPrefix + ".scaledDeltaMAD.txt" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file used for the analyses.", category: "required"} + outputDir: {description: "The directory to write the ouput to.", category: "common"} + outputPrefix: {description: "The prefix of the output files. Should not include directories.", category: "required"} + denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"} + standardizedCopyRatios: {description: "The standardized copy ratios as generated by DenoiseReadCounts.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task PlotModeledSegments { + input { + File referenceFastaDict + String outputDir = "." + String outputPrefix + File denoisedCopyRatios + File segments + File allelicCounts + + String memory = "21G" + String javaXmx = "7G" + String dockerImage = "broadinstitute/gatk:4.1.4.0" # The biocontainer doesn't seem to contain R. + } + + command { + set -e + mkdir -p ~{outputDir} + gatk --java-options -Xmx~{javaXmx} \ + PlotModeledSegments \ + --denoised-copy-ratios ~{denoisedCopyRatios} \ + --allelic-counts ~{allelicCounts} \ + --segments ~{segments} \ + --sequence-dictionary ~{referenceFastaDict} \ + --output ~{outputDir} \ + --output-prefix ~{outputPrefix} + } + + output { + File modeledSegmentsPlot = outputDir + "/" + outputPrefix + ".modeled.png" + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file used for the analyses.", category: "required"} + outputDir: {description: "The directory to write the ouput to.", category: "common"} + outputPrefix: {description: "The prefix of the output files. Should not include directories.", category: "required"} + denoisedCopyRatios: {description: "The denoised copy ratios as generated by DenoiseReadCounts.", category: "required"} + segments: {description: "The modeled segments as generated by ModelSegments.", category: "required"} + allelicCounts: {description: "The hetrozygous allelic counts as generated by ModelSegments.", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task PreprocessIntervals { + input { + File referenceFasta + File referenceFastaDict + File referenceFastaFai + File? intervals + String outputIntervalList = "bins.interval_list" + Int binLength = if defined(intervals) then 0 else 1000 + Int padding = if defined(intervals) then 250 else 0 + String intervalMergingRule = "OVERLAPPING_ONLY" + + String memory = "10G" + String javaXmx = "2G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputIntervalList})" + gatk --java-options -Xmx~{javaXmx} \ + PreprocessIntervals \ + -R ~{referenceFasta} \ + --sequence-dictionary ~{referenceFastaDict} \ + --bin-length ~{binLength} \ + --padding ~{padding} \ + ~{"-L " + intervals} \ + --interval-merging-rule ~{intervalMergingRule} \ + -O ~{outputIntervalList} + } + + output { + File intervalList = outputIntervalList + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + referenceFasta: {description: "The reference fasta file..", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + intervals: {description: "Bed files describing the regiosn to operate on.", category: "common"} + outputIntervalList: {description: "The location the output should be written to.", category: "advanced"} + binLength: {description: "The size of the bins to be created. Should be 0 for targeted/exome sequencing.", category: "advanced"} + padding: {description: "The padding to be added to the bins. 
Should be 0 if contiguos binning is used, eg with WGS.", category: "advanced"} + intervalMergingRule: {description: "Equivalent to gatk PreprocessIntervals' `--interval-merging-rule` option.", category: "advanced"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + +task SplitNCigarReads { + input { + File inputBam + File inputBamIndex + File referenceFasta + File referenceFastaDict + File referenceFastaFai + String outputBam + Array[File] intervals = [] + + String memory = "16G" + String javaXmx = "4G" + String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0" + } + + command { + set -e + mkdir -p "$(dirname ~{outputBam})" + gatk --java-options -Xmx~{javaXmx} \ + SplitNCigarReads \ + -I ~{inputBam} \ + -R ~{referenceFasta} \ + -O ~{outputBam} \ + ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} + } + + output { + File bam = outputBam + File bamIndex = sub(outputBam, "\.bam$", ".bai") + } + + runtime { + docker: dockerImage + memory: memory + } + + parameter_meta { + inputBam: {description: "The BAM file for which spliced reads should be split.", category: "required"} + inputBamIndex: {description: "The input BAM file's index.", category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", + category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", + category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + outputBam: {description: "The location the output BAM file should be written.", 
category: "required"} + intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",