# gridss.wdl 12.87 KiB
version 1.0
# Copyright (c) 2020 Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import "bwa.wdl" as bwa
# Runs the AnnotateInsertedSequence tool on a VCF, using a viral reference
# fasta as REFERENCE_SEQUENCE, and exposes the resulting VCF and its index.
task AnnotateInsertedSequence {
    input {
        File inputVcf
        String outputPath = "gridss.annotated.vcf.gz"
        File viralReference
        # The three files below are not referenced in the command itself.
        # NOTE(review): presumably they are declared so that they get localized
        # alongside the fasta for the tool to find; confirm.
        File viralReferenceFai
        File viralReferenceDict
        File viralReferenceImg

        Int threads = 8
        String javaXmx = "8G"
        String memory = "9G"
        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
        Int timeMinutes = 120
    }

    command {
        set -e
        mkdir -p "$(dirname ~{outputPath})"
        # Exported so that the heap limit reaches the JVM even when
        # _JAVA_OPTIONS was not already part of the environment.
        export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xmx~{javaXmx}"
        AnnotateInsertedSequence \
        REFERENCE_SEQUENCE=~{viralReference} \
        INPUT=~{inputVcf} \
        OUTPUT=~{outputPath} \
        ALIGNMENT=APPEND \
        WORKING_DIR='.' \
        WORKER_THREADS=~{threads}
    }

    output {
        File outputVcf = outputPath
        File outputVcfIndex = outputPath + ".tbi"
    }

    runtime {
        cpu: threads
        memory: memory
        time_minutes: timeMinutes # !UnknownRuntimeKey
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        inputVcf: {description: "The input VCF file.", category: "required"}
        outputPath: {description: "The path the output will be written to.", category: "common"}
        viralReference: {description: "A fasta file with viral sequences.", category: "required"}
        viralReferenceFai: {description: "The index for the viral reference fasta.", category: "required"}
        viralReferenceDict: {description: "The dict file for the viral reference.", category: "required"}
        viralReferenceImg: {description: "The BWA index image (generated with GATK BwaMemIndexImageCreator) of the viral reference.", category: "required"}
        threads: {description: "The number of the threads to use.", category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.",
                  category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
                      category: "advanced"}

        # outputs
        outputVcf: {description: "The annotated VCF file."}
        outputVcfIndex: {description: "The index (.tbi) of the annotated VCF file."}
    }
}
# Adds an SVTYPE INFO annotation to a GRIDSS VCF using the R packages
# VariantAnnotation and StructuralVariantAnnotation.
task AnnotateSvTypes {
    input {
        File gridssVcf
        File gridssVcfIndex
        String outputPath = "./gridss.svtyped.vcf"

        String memory = "32G"
        String dockerImage = "quay.io/biocontainers/bioconductor-structuralvariantannotation:1.10.0--r41hdfd78af_0"
        Int timeMinutes = 240
    }

    # writeVcf(..., index=T) bgzips the file it wrote, names the result
    # "<path>.bgz" and removes the uncompressed file. Compression and indexing
    # are therefore only enabled when outputPath ends in ".bgz"; R is then
    # given the path without that suffix so the final file lands on outputPath.
    String effectiveOutputPath = sub(outputPath, "\\.bgz$", "")
    String index = if effectiveOutputPath != outputPath then "T" else "F"

    # Based on https://github.com/PapenfussLab/gridss/issues/74
    command <<<
        set -e
        mkdir -p "$(dirname ~{outputPath})"
        R --vanilla << "EOF"
        library(VariantAnnotation)
        library(StructuralVariantAnnotation)
        vcf_path <- "~{gridssVcf}"
        out_path <- "~{effectiveOutputPath}"
        # Simple SV type classifier
        simpleEventType <- function(gr) {
          return(ifelse(seqnames(gr) != seqnames(partner(gr)), "BND", # inter-chromosomal
                  ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS",
                   ifelse(strand(gr) == strand(partner(gr)), "INV",
                    ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"), "DEL",
                     "DUP")))))
        }
        header <- scanVcfHeader(vcf_path)
        vcf <- readVcf(vcf_path, seqinfo(header))
        gr <- breakpointRanges(vcf)
        svtype <- simpleEventType(gr)
        info(vcf[gr$sourceId])$SVTYPE <- svtype
        # index is T only when the requested output path ends in .bgz
        writeVcf(vcf, out_path, index=~{index})
        EOF
    >>>

    output {
        File vcf = outputPath
        # Only produced when outputPath ends in ".bgz".
        File? vcfIndex = outputPath + ".tbi"
    }

    runtime {
        memory: memory
        time_minutes: timeMinutes # !UnknownRuntimeKey
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        gridssVcf: {description: "The VCF produced by GRIDSS.", category: "required"}
        gridssVcfIndex: {description: "The index for the VCF produced by GRIDSS.", category: "required"}
        outputPath: {description: "The path the output should be written to. If it ends in `.bgz` the output is bgzipped and indexed, otherwise an uncompressed VCF is written.", category: "common"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
                      category: "advanced"}

        # outputs
        vcf: {description: "The VCF with SVTYPE annotations added."}
        vcfIndex: {description: "The tabix index of the output VCF. Only available when outputPath ends in `.bgz`."}
    }
}
# Runs the gridss command on a tumor BAM, optionally paired with a normal BAM,
# then indexes the assembly BAM and makes sure the output VCF has a tabix index.
task GRIDSS {
    input {
        File tumorBam
        File tumorBai
        String tumorLabel
        BwaIndex reference
        String outputPrefix = "gridss"

        File? normalBam
        File? normalBai
        String? normalLabel
        File? blacklistBed
        File? gridssProperties

        Int jvmHeapSizeGb = 300
        Int nonJvmMemoryGb = 50
        Int threads = 4
        # Float division, so that ceil() actually rounds the per-thread share
        # up; with Int / Int the quotient is already truncated before ceil().
        Int timeMinutes = ceil(7200.0 / threads) + 1800
        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
    }

    # In the command below the normal label and BAM, when defined, are placed
    # before the tumor label and BAM, so labels and BAM files stay in the
    # same order.
    command {
        set -e
        mkdir -p "$(dirname ~{outputPrefix})"
        gridss \
        -w . \
        --reference ~{reference.fastaFile} \
        --output ~{outputPrefix}.vcf.gz \
        --assembly ~{outputPrefix}_assembly.bam \
        ~{"-c " + gridssProperties} \
        ~{"-t " + threads} \
        ~{"--jvmheap " + jvmHeapSizeGb + "G"} \
        --labels ~{normalLabel}~{true="," false="" defined(normalLabel)}~{tumorLabel} \
        ~{"--blacklist " + blacklistBed} \
        ~{normalBam} \
        ~{tumorBam}
        samtools index ~{outputPrefix}_assembly.bam ~{outputPrefix}_assembly.bai

        # For some reason the VCF index is sometimes missing
        if [ ! -e ~{outputPrefix}.vcf.gz.tbi ]
          then
            tabix ~{outputPrefix}.vcf.gz
        fi
    }

    output {
        File vcf = outputPrefix + ".vcf.gz"
        File vcfIndex = outputPrefix + ".vcf.gz.tbi"
        File assembly = outputPrefix + "_assembly.bam"
        File assemblyIndex = outputPrefix + "_assembly.bai"
    }

    runtime {
        cpu: threads
        # Total request is the JVM heap plus the memory reserved for everything else.
        memory: "~{jvmHeapSizeGb + nonJvmMemoryGb}G"
        time_minutes: timeMinutes # !UnknownRuntimeKey
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        tumorBam: {description: "The input BAM file. This should be the tumor/case sample in case of a paired analysis.", category: "required"}
        tumorBai: {description: "The index for tumorBam.", category: "required"}
        tumorLabel: {description: "The name of the (tumor) sample.", category: "required"}
        reference: {description: "A BWA index, this should also include the fasta index file (.fai).", category: "required"}
        outputPrefix: {description: "The prefix for the output files. This may include parent directories.", category: "common"}
        normalBam: {description: "The BAM file for the normal/control sample.", category: "advanced"}
        normalBai: {description: "The index for normalBam.", category: "advanced"}
        normalLabel: {description: "The name of the normal sample.", category: "advanced"}
        blacklistBed: {description: "A bed file with blacklisted regions.", category: "advanced"}
        gridssProperties: {description: "A properties file for gridss.", category: "advanced"}
        threads: {description: "The number of the threads to use.", category: "advanced"}
        jvmHeapSizeGb: {description: "The size of JVM heap for assembly and variant calling.", category: "advanced"}
        nonJvmMemoryGb: {description: "The amount of memory in Gb to be requested besides JVM memory.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

        # outputs
        vcf: {description: "VCF file including variant allele fractions."}
        vcfIndex: {description: "Index of output VCF."}
        assembly: {description: "The GRIDSS assembly BAM."}
        assemblyIndex: {description: "Index of output BAM file."}
    }
}
# Runs gridss_annotate_vcf_repeatmasker on a GRIDSS VCF and exposes the
# annotated VCF together with its index.
task GridssAnnotateVcfRepeatmasker {
    input {
        File gridssVcf
        # Not referenced in the command.
        # NOTE(review): presumably declared so the index is localized with the VCF; confirm.
        File gridssVcfIndex
        String outputPath = "./gridss.repeatmasker_annotated.vcf.gz"

        String memory = "25G"
        Int threads = 8
        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
        Int timeMinutes = 1440
    }

    # The --jar path below is hard-coded and carries the same version number
    # (2.12.2) as the default dockerImage tag; a different image may need a
    # different path.
    command {
        set -e
        mkdir -p "$(dirname ~{outputPath})"
        gridss_annotate_vcf_repeatmasker \
        --output ~{outputPath} \
        --jar /usr/local/share/gridss-2.12.2-0/gridss.jar \
        -w . \
        -t ~{threads} \
        ~{gridssVcf}
    }

    output {
        File annotatedVcf = outputPath
        File annotatedVcfIndex = "~{outputPath}.tbi"
    }

    runtime {
        cpu: threads
        memory: memory
        time_minutes: timeMinutes # !UnknownRuntimeKey
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        gridssVcf: {description: "The GRIDSS output.", category: "required"}
        gridssVcfIndex: {description: "The index for the GRIDSS output.", category: "required"}
        outputPath: {description: "The path the output should be written to.", category: "common"}
        threads: {description: "The number of the threads to use.", category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
                      category: "advanced"}

        # outputs
        annotatedVcf: {description: "The RepeatMasker-annotated VCF file."}
        annotatedVcfIndex: {description: "The index (.tbi) of the annotated VCF file."}
    }
}
# Unpacks the virusbreakend database archive and runs virusbreakend on a BAM
# file, exposing the resulting VCF and its summary table.
task Virusbreakend {
    input {
        File bam
        File bamIndex
        File referenceFasta
        # The index, dict and image files are not referenced in the command.
        # NOTE(review): presumably declared so they are localized with the fasta; confirm.
        File referenceFastaFai
        File referenceFastaDict
        File referenceImg
        File virusbreakendDB
        String outputPath = "./virusbreakend.vcf"

        String memory = "75G"
        Int threads = 8
        String dockerImage = "quay.io/biowdl/gridss:2.12.2"
        Int timeMinutes = 180
    }

    # The --jar path below is hard-coded and carries the same version number
    # (2.12.2) as the default dockerImage tag; a different image may need a
    # different path.
    command {
        set -e
        mkdir -p "$(dirname ~{outputPath})"
        mkdir -p virusbreakenddb
        tar -xzvf ~{virusbreakendDB} -C virusbreakenddb --strip-components 1
        virusbreakend \
        --output ~{outputPath} \
        --workingdir . \
        --reference ~{referenceFasta} \
        --db virusbreakenddb \
        --jar /usr/local/share/gridss-2.12.2-0/gridss.jar \
        -t ~{threads} \
        ~{bam}
    }

    output {
        File vcf = outputPath
        File summary = "~{outputPath}.summary.tsv"
    }

    runtime {
        cpu: threads
        memory: memory
        time_minutes: timeMinutes # !UnknownRuntimeKey
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        bam: {description: "A BAM file.", category: "required"}
        bamIndex: {description: "The index for the BAM file.", category: "required"}
        referenceFasta: {description: "The fasta of the reference genome.", category: "required"}
        referenceFastaFai: {description: "The index (.fai) for the reference fasta.", category: "required"}
        referenceFastaDict: {description: "The sequence dictionary (.dict) for the reference fasta.", category: "required"}
        referenceImg: {description: "The BWA index image (generated with GATK BwaMemIndexImageCreator) of the reference.", category: "required"}
        virusbreakendDB: {description: "A .tar.gz containing the virusbreakend database.", category: "required"}
        outputPath: {description: "The path the output should be written to.", category: "common"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        threads: {description: "The number of the threads to use.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.",
                      category: "advanced"}

        # outputs
        vcf: {description: "The VCF written by virusbreakend."}
        summary: {description: "The summary table (.summary.tsv) written next to the output VCF."}
    }
}