diff --git a/CHANGELOG.md b/CHANGELOG.md index 9646b5f037e85740143c84c0c6d577497cba4a11..b27addabdd83bfa695a7ebb0cd86da70dc175519 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ that users understand how the changes affect the new version. version 5.0.0-dev --------------------------- ++ deepvariant: Add task for DeepVariant. ++ gatk: Make intervals optional for GenotypeGVCFs. ++ isoseq3: Add required bam index input to isoseq3. ++ pbbam: Add task for indexing PacBio bam files. ++ picard: Add CollectHsMetrics and CollectVariantCallingMetrics. + Samtools: Add `threads` to parameter meta for Merge task. + bcftools: add tmpDir input to specify temporary directory when sorting. + bcftools: remove outputType and implement indexing based on output file extension. diff --git a/bam2fastx.wdl b/bam2fastx.wdl index 42240cd4da789985ca1d84ec86973f833f15672b..18434755f251ee91df8a10e36eac3990fbd51217 100644 --- a/bam2fastx.wdl +++ b/bam2fastx.wdl @@ -91,12 +91,25 @@ task Bam2Fastq { command { set -e mkdir -p "$(dirname ~{outputPrefix})" + + # Localise the bam and pbi files so they are next to each other in the + # current folder + bamfiles="" + for bamfile in ~{sep=" " bam};do + ln $bamfile . + bamfiles=$bamfiles" $(basename $bamfile)" + done + + for bamindex in ~{sep=" " bamIndex}; do + ln $bamindex . + done + bam2fastq \ --output ~{outputPrefix} \ -c ~{compressionLevel} \ ~{true="--split-barcodes" false="" splitByBarcode} \ ~{"--seqid-prefix " + seqIdPrefix} \ - ~{sep=" " bam} + $bamfiles } output { diff --git a/chunked-scatter.wdl b/chunked-scatter.wdl index b54a7d2e909a20d43bbe194d9f28496b7753ce5d..115c5ca4eb8fb8407ef0f2d7c36c61c39c669d11 100644 --- a/chunked-scatter.wdl +++ b/chunked-scatter.wdl @@ -24,6 +24,7 @@ task ChunkedScatter { input { File inputFile String prefix = "./scatter" + Boolean splitContigs = false Int? chunkSize Int? overlap Int? minimumBasesPerFile @@ -40,6 +41,7 @@ task ChunkedScatter { ~{"-c " + chunkSize} \ ~{"-o " + overlap} \ ~{"-m " + minimumBasesPerFile} \ + ~{true="--split-contigs " false="" splitContigs} \ ~{inputFile} } diff --git a/deepvariant.wdl b/deepvariant.wdl new file mode 100644 index 0000000000000000000000000000000000000000..88bdb35267732aaf096eb883539394bcf6fbb345 --- /dev/null +++ b/deepvariant.wdl @@ -0,0 +1,91 @@ +version 1.0 + +# Copyright (c) 2018 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +task RunDeepVariant { + input { + File referenceFasta + File referenceFastaIndex + File inputBam + File inputBamIndex + String modelType + String outputVcf + File? customizedModel + Int? numShards + String? outputGVcf + File? regions + String? sampleName + Boolean? VCFStatsReport = true + + String memory = "3G" + Int timeMinutes = 5000 + String dockerImage = "google/deepvariant:1.0.0" + } + + command { + set -e + + /opt/deepvariant/bin/run_deepvariant \ + --ref ~{referenceFasta} \ + --reads ~{inputBam} \ + --model_type ~{modelType} \ + --output_vcf ~{outputVcf} \ + ~{"--output_gvcf " + outputGVcf} \ + ~{"--customized_model " + customizedModel} \ + ~{"--num_shards " + numShards} \ + ~{"--regions " + regions} \ + ~{"--sample_name " + sampleName} \ + ~{true="--vcf_stats_report" false="--novcf_stats_report" VCFStatsReport} + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: memory + } + + output { + File outputVCF = outputVcf + File outputVCFIndex = outputVCF + ".tbi" + File? outputGVCF = outputGVcf + File? outputGVCFIndex = outputGVcf + ".tbi" + Array[File] outputVCFStatsReport = glob("*.visual_report.html") + } + + parameter_meta { + referenceFasta: {description: "Genome reference to use.", category: "required"} + referenceFastaIndex: {description: "Index for the genome reference file.", category: "required"} + inputBam: {description: "Aligned, sorted, indexed BAM file containing the reads we want to call.", category: "required"} + inputBamIndex: {description: "Index for the input bam file.", category: "required"} + modelType: {description: "<WGS|WES|PACBIO>. Type of model to use for variant calling. Each model_type has an associated default model, which can be overridden by the --customized_model flag.", category: "required"} + outputVcf: {description: "Path where we should write VCF file.", category: "required"} + customizedModel: {description: "A path to a model checkpoint to load for the `call_variants` step. If not set, the default for each --model_type will be used.", category: "advanced"} + numShards: {description: "Number of shards for make_examples step.", category: "common"} + outputGVcf: {description: "Path where we should write gVCF file.", category: "common"} + regions: {description: "List of regions we want to process, in BED/BEDPE format.", category: "advanced"} + sampleName: {description: "Sample name to use instead of the sample name from the input reads BAM (SM tag in the header).", category: "common"} + VCFStatsReport: {description: "Output a visual report (HTML) of statistics about the output VCF.", category: "common"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} diff --git a/gatk.wdl b/gatk.wdl index e0209a0c4ef0f4b19fde26f3b47f0d0bd2b33fe0..12416dda147265aff574110d075f34f2897fdae7 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -820,7 +820,7 @@ task GenotypeGVCFs { input { File gvcfFile File gvcfFileIndex - Array[File]+ intervals + Array[File]? 
intervals String outputPath File referenceFasta File referenceFastaDict @@ -846,9 +846,9 @@ task GenotypeGVCFs { ~{"-D " + dbsnpVCF} \ ~{"--pedigree " + pedigree} \ ~{true="-G" false="" length(annotationGroups) > 0} ~{sep=" -G " annotationGroups} \ - --only-output-calls-starting-in-intervals \ -V ~{gvcfFile} \ - -L ~{sep=' -L ' intervals} + ~{true="--only-output-calls-starting-in-intervals" false="" defined(intervals)} \ + ~{true="-L" false="" defined(intervals)} ~{sep=' -L ' intervals} } output { @@ -866,7 +866,7 @@ task GenotypeGVCFs { parameter_meta { gvcfFile: {description: "The GVCF file to be genotyped.", category: "required"} gvcfFileIndex: {description: "The index of the input GVCF file.", category: "required"} - intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "required"} + intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "optional"} outputPath: {description: "The location to write the output VCF file to.", category: "required"} referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} diff --git a/isoseq3.wdl b/isoseq3.wdl index 604a71d50fa673b5d248b7edfc943ccc66150c82..5060f0e7ca5a216bd409f5d19605ac0f65d855dc 100644 --- a/isoseq3.wdl +++ b/isoseq3.wdl @@ -26,6 +26,7 @@ task Refine { Boolean requirePolyA = false String logLevel = "WARN" File inputBamFile + File inputBamIndex File primerFile String outputDir String outputNamePrefix @@ -72,6 +73,7 @@ task Refine { requirePolyA: {description: "Require fl reads to have a poly(A) tail and remove it.", category: "common"} logLevel: {description: "Set log level. Valid choices: (TRACE, DEBUG, INFO, WARN, FATAL).", category: "advanced"} inputBamFile: {description: "Bam input file.", category: "required"} + inputBamIndex: {description: "Index for the Bam input file.", category: "required"} primerFile: {description: "Barcode/primer fasta file.", category: "required"} outputDir: {description: "Output directory path.", category: "required"} outputNamePrefix: {description: "Basename of the output files.", category: "required"} diff --git a/lima.wdl b/lima.wdl index 1a40b1c8174c8df6aefa360fe199e2a1d8ccea8d..7ef9d4abfe8e88a26bad13d407895813f1da2227 100644 --- a/lima.wdl +++ b/lima.wdl @@ -83,32 +83,30 @@ task Lima { ~{true="--peek-guess" false="" peekGuess} \ --log-level ~{logLevel} \ --num-threads ~{cores} \ - ~{"--log-file " + outputPrefix + ".fl.stderr.log"} \ + ~{"--log-file " + outputPrefix + ".stderr.log"} \ ~{inputBamFile} \ ~{barcodeFile} \ - ~{basename(outputPrefix) + ".fl.bam"} + ~{outputPrefix + ".bam"} - # copy commands below are needed because glob command does not find - # multiple bam/bam.pbi/subreadset.xml files when not located in working - # directory. - cp "~{basename(outputPrefix)}.fl.json" "~{outputPrefix}.fl.json" - cp "~{basename(outputPrefix)}.fl.lima.counts" "~{outputPrefix}.fl.lima.counts" - cp "~{basename(outputPrefix)}.fl.lima.report" "~{outputPrefix}.fl.lima.report" - cp "~{basename(outputPrefix)}.fl.lima.summary" "~{outputPrefix}.fl.lima.summary" - find . -path "*.bam" > bamFiles.txt - find . -path "*.bam.pbi" > bamIndexes.txt - find . -path "*.subreadset.xml" > subreadsets.txt + # copy the files with the default filename to the folder specified in + # outputPrefix. 
+ if [ "~{basename(outputPrefix)}.json" != "~{outputPrefix}.json" ]; then + cp "~{basename(outputPrefix)}.json" "~{outputPrefix}.json" + cp "~{basename(outputPrefix)}.lima.counts" "~{outputPrefix}.lima.counts" + cp "~{basename(outputPrefix)}.lima.report" "~{outputPrefix}.lima.report" + cp "~{basename(outputPrefix)}.lima.summary" "~{outputPrefix}.lima.summary" + fi } output { - Array[File] limaBam = read_lines("bamFiles.txt") - Array[File] limaBamIndex = read_lines("bamIndexes.txt") - Array[File] limaXml = read_lines("subreadsets.txt") - File limaStderr = outputPrefix + ".fl.stderr.log" - File limaJson = outputPrefix + ".fl.json" - File limaCounts = outputPrefix + ".fl.lima.counts" - File limaReport = outputPrefix + ".fl.lima.report" - File limaSummary = outputPrefix + ".fl.lima.summary" + Array[File] limaBam = glob("*.bam") + Array[File] limaBamIndex = glob("*.bam.pbi") + Array[File] limaXml = glob("*.subreadset.xml") + File limaStderr = outputPrefix + ".stderr.log" + File limaJson = outputPrefix + ".json" + File limaCounts = outputPrefix + ".lima.counts" + File limaReport = outputPrefix + ".lima.report" + File limaSummary = outputPrefix + ".lima.summary" } runtime { diff --git a/pbbam.wdl b/pbbam.wdl new file mode 100644 index 0000000000000000000000000000000000000000..52737a008625e559b76905b673f0c18260b8c582 --- /dev/null +++ b/pbbam.wdl @@ -0,0 +1,70 @@ +version 1.0 + +# Copyright (c) 2017 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + +task Index { + input { + File bamFile + String? outputBamPath + + String memory = "2G" + Int timeMinutes = 1 + ceil(size(bamFile, "G") * 4) + String dockerImage = "quay.io/biocontainers/pbbam:1.6.0--h5b7e6e0_0" + } + + # Select_first is needed, otherwise womtool validate fails. + String outputPath = select_first([outputBamPath, basename(bamFile)]) + String bamIndexPath = outputPath + ".pbi" + + command { + bash -c ' + set -e + # Make sure outputBamPath does not exist. + if [ ! -f ~{outputPath} ] + then + mkdir -p "$(dirname ~{outputPath})" + ln ~{bamFile} ~{outputPath} + fi + pbindex ~{outputPath} ~{bamIndexPath} + ' + } + + output { + File indexedBam = outputPath + File index = bamIndexPath + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + bamFile: {description: "The BAM file for which an index should be made.", category: "required"} + outputBamPath: {description: "The location where the BAM file should be written to. 
The index will appear alongside this link to the BAM file.", + category: "common"} + memory: {description: "The amount of memory needed for the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} diff --git a/pbmm2.wdl b/pbmm2.wdl index 84fbd2d08e32c9b9676e5b24f6d6c0843bb95dce..31d4c667e37f2444df413085d78870208504761a 100644 --- a/pbmm2.wdl +++ b/pbmm2.wdl @@ -30,7 +30,7 @@ task Mapping { Int cores = 4 String memory = "30G" - Int timeMinutes = 1 + ceil(size(queryFile, "G") * 200 / cores) + Int timeMinutes = 1 + ceil(size(queryFile, "G") * 2000 / cores) String dockerImage = "quay.io/biocontainers/pbmm2:1.3.0--h56fc30b_1" } @@ -41,6 +41,7 @@ task Mapping { -j ~{cores} \ ~{referenceMMI} \ ~{queryFile} \ + --sample ~{sample} \ ~{sample}.align.bam } diff --git a/picard.wdl b/picard.wdl index 1afa5ea7272155e09064c07725e729121061bf83..49db8b8b44eccb793f8eba46bc5b8aae7b297582 100644 --- a/picard.wdl +++ b/picard.wdl @@ -66,6 +66,71 @@ task BedToIntervalList { } } +task CollectHsMetrics { + input { + File inputBam + File inputBamIndex + File referenceFasta + File referenceFastaDict + File referenceFastaFai + File targets + File? baits + String basename + + # Use the targets file as baits as a fallback, since often the baits + # for a certain capture kit are not available. + File baitsFile = select_first([baits, targets]) + File targetsFile = targets + + Int memoryMb = javaXmxMb + 512 + Int javaXmxMb = 3072 + # Additional * 2 because picard multiple metrics reads the reference fasta twice. + Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6) + String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" + } + + command { + set -e + mkdir -p "$(dirname ~{basename})" + picard -Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1 \ + CollectHsMetrics \ + I=~{inputBam} \ + R=~{referenceFasta} \ + BAIT_INTERVALS=~{baitsFile} \ + TARGET_INTERVALS=~{targetsFile} \ + O="~{basename}.hs_metrics.txt" + } + + output { + File HsMetrics = basename + ".hs_metrics.txt" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: "~{memoryMb}M" + } + + parameter_meta { + # inputs + inputBam: {description: "The input BAM file for which metrics will be collected.", category: "required"} + inputBamIndex: {description: "The index of the input BAM file.", category: "required"} + referenceFasta: {description: "The reference fasta file which was also used for mapping.", category: "required"} + referenceFastaDict: {description: "The sequence dictionary associated with the reference fasta file.", + category: "required"} + referenceFastaFai: {description: "The index for the reference fasta file.", category: "required"} + targets: {description: "Picard interval file of the capture targets.", category: "required"} + baits: {description: "Picard interval file of the capture bait set.", category: "advanced"} + basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} + memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"} + javaXmxMb: {description: "The maximum memory available to the program in megabytes. 
Should be lower than `memoryMb` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + task CollectMultipleMetrics { input { File inputBam @@ -315,6 +380,57 @@ task CollectTargetedPcrMetrics { } } +task CollectVariantCallingMetrics { + input { + File dbsnp + File dbsnpIndex + File inputVCF + File inputVCFIndex + String basename + + String memory = "9G" + String javaXmx = "8G" + Int timeMinutes = 1440 + String dockerImage = "quay.io/biocontainers/picard:2.23.2--0" + } + + command { + set -e + mkdir -p "$(dirname ~{basename})" + picard -Xmx~{javaXmx} \ + CollectVariantCallingMetrics -XX:ParallelGCThreads=1 \ + DBSNP=~{dbsnp} \ + INPUT=~{inputVCF} \ + OUTPUT=~{basename} + } + + output { + File details = basename + ".variant_calling_detail_metrics" + File summary = basename + ".variant_calling_summary_metrics" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: memory + } + + parameter_meta { + # inputs + dbsnp: {description: "DBSNP vcf file to use with CollectVariantCallingMetrics.", category: "required"} + dbsnpIndex: {description: "Index file for the DBSNP VCF.", category: "required"} + inputVCF: {description: "Input VCF file", category: "required"} + inputVCFIndex: {description: "Index file for the input VCF.", category: "required"} + basename: {description: "The basename/prefix of the output files (may include directories).", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", + category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", + category: "advanced"} + } +} + task CreateSequenceDictionary { input { File inputFile diff --git a/samtools.wdl b/samtools.wdl index 24d95aa4b4b17a8e2d5eeb6a26c19cebeeeac312..ad94338a37d7e0b526aa1623780c350819241ec3 100644 --- a/samtools.wdl +++ b/samtools.wdl @@ -332,6 +332,7 @@ task Merge { Int threads = 1 Int timeMinutes = 1 + ceil(size(bamFiles, "G") * 2) + String memory = "4G" String dockerImage = "quay.io/biocontainers/samtools:1.10--h9402c20_2" } String indexPath = sub(outputBamPath, "\.bam$",".bai") @@ -355,6 +356,7 @@ task Merge { runtime { cpu: threads docker: dockerImage + memory: memory time_minutes: timeMinutes } @@ -362,7 +364,7 @@ task Merge { # inputs bamFiles: {description: "The BAM files to merge.", category: "required"} outputBamPath: {description: "The location the merged BAM file should be written to.", category: "common"} - threads: {description: "Number of threads to use.", category: "advanced"} + threads: {description: "Number of threads to use.", category: "common"} force: {description: "Equivalent to samtools merge's `-f` flag.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", diff --git a/whatshap.wdl b/whatshap.wdl new file mode 100644 index 0000000000000000000000000000000000000000..2ee90f5024ca3fec7051a091ecaa2944f61de874 --- /dev/null +++ b/whatshap.wdl @@ -0,0 +1,197 @@ +version 1.0 + +# Copyright (c) 2018 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +task Phase { + input { + String outputVCF + File? reference + File? referenceIndex + String? tag + String? algorithm + Boolean? indels + String? sample + String? chromosome + String? threshold + String? ped + File vcf + File vcfIndex + File phaseInput + File phaseInputIndex + + String memory = "4G" + Int timeMinutes = 120 + # Whatshap 1.0, tabix 0.2.5 + String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" + } + + command { + whatshap phase \ + ~{vcf} \ + ~{phaseInput} \ + ~{if defined(outputVCF) then ("--output " + '"' + outputVCF + '"') else ""} \ + ~{if defined(reference) then ("--reference " + '"' + reference + '"') else ""} \ + ~{if defined(tag) then ("--tag " + '"' + tag + '"') else ""} \ + ~{if defined(algorithm) then ("--algorithm " + '"' + algorithm + '"') else ""} \ + ~{true="--indels" false="" indels} \ + ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""} \ + ~{if defined(chromosome) then ("--chromosome " + '"' + chromosome + '"') else ""} \ + ~{if defined(threshold) then ("--threshold " + '"' + threshold + '"') else ""} \ + ~{if defined(ped) then ("--ped " + '"' + ped + '"') else ""} \ + tabix -p vcf ~{outputVCF} + } + + output { + File phasedVCF = outputVCF + File phasedVCFIndex = outputVCF + ".tbi" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: memory + } + + parameter_meta { + outputVCF: {description: "Output VCF file. Add .gz to the file name to get compressed output. If omitted, use standard output.", category: "common"} + reference: {description: "Reference file. Provide this to detect alleles through re-alignment. 
If no index (.fai) exists, it will be created.", category: "common"} + tag: {description: "Store phasing information with PS tag (standardized) or HP tag (used by GATK ReadBackedPhasing) (default: PS)", category: "common"} + algorithm: {description: "Phasing algorithm to use (default: whatshap)", category: "advanced"} + indels: {description: "Also phase indels (default: do not phase indels)", category: "common"} + sample: {description: "Name of a sample to phase. If not given, all samples in the input VCF are phased. Can be used multiple times.", category: "common"} + chromosome: {description: "Name of chromosome to phase. If not given, all chromosomes in the input VCF are phased. Can be used multiple times.", category: "common"} + threshold: {description: "The threshold of the ratio between the probabilities that a pair of reads come from the same haplotype and different haplotypes in the read merging model (default: 1000000).", category: "advanced"} + ped: {description: "Use pedigree information in PED file to improve phasing (switches to PedMEC algorithm). Columns 2, 3, 4 must refer to child, mother, and father sample names as used in the VCF and BAM/CRAM. Other columns are ignored.", category: "advanced"} + vcf: {description: "VCF or BCF file with variants to be phased (can be gzip-compressed).", category: "required"} + vcfIndex: {description: "Index for the VCF or BCF file with variants to be phased.", category: "required"} + phaseInput: {description: "BAM, CRAM, VCF or BCF file(s) with phase information, either through sequencing reads (BAM, CRAM) or through phased blocks (VCF, BCF).", category: "required"} + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +} + +task Stats { + input { + String? gtf + String? sample + String? tsv + String? blockList + String? chromosome + File vcf + + String memory = "4G" + Int timeMinutes = 120 + # Whatshap 1.0, tabix 0.2.5 + String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" + } + + command { + whatshap stats \ + ~{vcf} \ + ~{if defined(gtf) then ("--gtf " + '"' + gtf + '"') else ""} \ + ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""} \ + ~{if defined(tsv) then ("--tsv " + '"' + tsv + '"') else ""} \ + ~{if defined(blockList) then ("--block-list " + '"' + blockList + '"') else ""} \ + ~{if defined(chromosome) then ("--chromosome " + '"' + chromosome + '"') else ""} + } + + output { + File? phasedGTF = gtf + File? phasedTSV = tsv + File? phasedBlockList = blockList + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: memory + } + + parameter_meta { + gtf: "Write phased blocks to GTF file." + sample: "Name of the sample to process. If not given, use first sample found in VCF." + tsv: "Filename to write statistics to (tab-separated)." + blockList: "Filename to write list of all blocks to (one block per line)." + chromosome: "Name of chromosome to process. If not given, all chromosomes in the input VCF are considered." 
+ vcf: "Phased VCF file" + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +} + +task Haplotag { + input { + String outputFile + File? reference + File? referenceFastaIndex + String? regions + String? sample + File vcf + File vcfIndex + File alignments + File alignmentsIndex + + String memory = "4G" + Int timeMinutes = 120 + # Whatshap 1.0, tabix 0.2.5 + String dockerImage = "quay.io/biocontainers/mulled-v2-5c61fe1d8c284dd05d26238ce877aa323205bf82:89b4005d04552bdd268e8af323df83357e968d83-0" + } + + command { + whatshap haplotag \ + ~{vcf} \ + ~{alignments} \ + ~{if defined(outputFile) then ("--output " + '"' + outputFile+ '"') else ""} \ + ~{if defined(reference) then ("--reference " + '"' + reference + '"') else ""} \ + ~{if defined(regions) then ("--regions " + '"' + regions + '"') else ""} \ + ~{if defined(sample) then ("--sample " + '"' + sample + '"') else ""} \ + python3 -c "import pysam; pysam.index('~{outputFile}')" + } + + output { + File bam = outputFile + File bamIndex = outputFile + ".bai" + } + + runtime { + docker: dockerImage + time_minutes: timeMinutes + memory: memory + } + + parameter_meta { + outputFile: "Output file. If omitted, use standard output." + reference: "Reference file. Provide this to detect alleles through re-alignment. If no index (.fai) exists, it will be created." + referenceFastaIndex: "Index for the reference file." + regions: "Specify region(s) of interest to limit the tagging to reads/variants overlapping those regions. You can specify a space-separated list of regions in the form of chrom:start-end, chrom (consider entire chromosome), or chrom:start (consider region from this start to end of chromosome)." + sample: "Name of a sample to phase. If not given, all samples in the input VCF are phased. Can be used multiple times." + vcf: "VCF file with phased variants (must be gzip-compressed and indexed)." + vcfIndex: "Index for the VCF or BCF file with variants to be phased." + alignments: "File (BAM/CRAM) with read alignments to be tagged by haplotype." + alignmentsIndex: "Index for the alignment file." + memory: {description: "The amount of memory this job will use.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + } +}