Skip to content
Snippets Groups Projects
Commit 1724ee74 authored by Mei's avatar Mei
Browse files

Merge remote-tracking branch 'origin/develop' into biowdl-307

parents 33f53d10 d40e5a34
No related branches found
No related tags found
No related merge requests found
language: java
# We use conda to install cromwell.
language: python
python:
- 3.6
before_install:
# Install conda
- export MINICONDA=${HOME}/miniconda
- export PATH=${MINICONDA}/bin:${PATH}
- wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
- bash miniconda.sh -b -f -p ${MINICONDA}
- conda config --set always_yes yes
- conda config --add channels defaults
- conda config --add channels bioconda
- conda config --add channels conda-forge
install:
- conda install cromwell
script:
- set -e
- export CROMWELL_VERSION=35
- wget https://github.com/broadinstitute/cromwell/releases/download/$CROMWELL_VERSION/womtool-$CROMWELL_VERSION.jar
- for F in `find -name "*.wdl"`; do echo $F; java -jar womtool-*.jar validate $F; done
- 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then git submodule foreach --recursive git checkout $TRAVIS_BRANCH && git submodule foreach --recursive git pull; fi'
- "git diff --exit-code || (echo ERROR: Git changes detected. Please update submodules && exit 1)"
- set -e
- for FILE in $(find -name "*.wdl"); do echo $FILE; womtool validate $FILE; done
- 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then git submodule foreach git checkout develop && git submodule foreach git pull; fi'
- "git diff --exit-code || (echo ERROR: Git changes detected. Please update submodules && exit 1)"
......@@ -11,6 +11,24 @@ that users understand how the changes affect the new version.
version 1.0.0-dev
---------------------------
+ Removed "pipefail" from the command sections of the TALON and TranscriptClean tasks
+ Add WDL task for Minimap2
+ Add WDL task for TALON
+ Add WDL task for TranscriptClean
+ Fastqsplitter: fix mkdir command to work with biocontainer's busybox mkdir
+ Cutadapt: simplify interface
+ Bigger memory multiplier in mutect to take in account bigger vmem usage
+ Cutadapt: Remove default adapter
+ Fastqsplitter: use version 1.1.
+ Picard: Use version 2.20.5 of the biocontainer as this includes the R dependency
+ Common: Update dockerTag to dockerImage.
+ GATK: Add CombineVariants task that allows, e.g., to merge VCFs from different callers.
+ Mutect2: Add GATK tasks related to variant filtering (LearnReadOrientationModel, MergeStats, GetPileupSummaries, CalculateContamination and FilterMutectCalls).
+ Mutect2: Add "--germline-resource" and "--f1r2-tar-gz" inputs, requiring an update to GATK 4.1.2.0.
+ Mutect2: Add necessary missing index attribute for panel of normals.
+ MultiQC: Add memory variable to multiqc task.
+ GATK: SplitNCigarReads, BaseRecalibration and ApplyBQSR do no longer need regions files as required inputs.
+ VarDict: Add user definable flags (-M, -A, -Q, -d, -v, -f) to the paired VCF filtering script.
+ Cutadapt: If the output is a gzipped file, compress with level 1 (instead of default 6).
+ Cutadapt: Fix issues with read2output when using single-end reads.
+ Add feature type, idattr and additional attributes to htseq-count.
......
......@@ -77,7 +77,7 @@ task Copy {
Boolean recursive = false
# Version not that important as long as it is stable.
String dockerTag = "5.0.2"
String dockerImage = "bash:5.0.2"
}
command {
......@@ -91,7 +91,7 @@ task Copy {
}
runtime {
docker: "bash:" + dockerTag
docker: dockerImage
}
}
......@@ -155,7 +155,7 @@ task YamlToJson {
input {
File yaml
String outputJson = basename(yaml, "\.ya?ml$") + ".json"
String dockerTag = "3.13-py37-slim"
String dockerImage = "biowdl/pyyaml:3.13-py37-slim"
}
command {
set -e
......@@ -174,7 +174,7 @@ task YamlToJson {
}
runtime {
docker: "biowdl/pyyaml:" + dockerTag
docker: dockerImage
}
}
......
......@@ -7,16 +7,12 @@ task Cutadapt {
String read1output = "cut_r1.fq.gz"
String? read2output
String? format
Array[String]+? adapter
Array[String]+? front
Array[String]+? anywhere
Array[String]+? adapterRead2
Array[String]+? frontRead2
Array[String]+? anywhereRead2
# FIXME: default should be set at the subworkflow level, not here. Needs to wait for cromwell fix.
Array[String]+? adapterBoth = ["AGATCGGAAGAG"]
# contaminations = anywhereBoth
Array[String]+? contaminations
Array[String] adapter = []
Array[String] front = []
Array[String] anywhere = []
Array[String] adapterRead2 = []
Array[String] frontRead2 = []
Array[String] anywhereRead2 = []
Boolean? interleaved
String? pairFilter
Float? errorRate
......@@ -74,25 +70,7 @@ task Cutadapt {
then "mkdir -p $(dirname " + realRead2output + ")"
else ""
# FIXME: This crappy overengineering can be removed once cromwell can handle subworkflow inputs correctly.
# Some WDL magic here to set both adapters with one setting.
# If then else's are needed to keep the variable optional and undefined
Array[String]+? adapterForward = if (defined(adapter) || defined(adapterBoth))
then select_first([adapter, adapterBoth])
else adapter
# Check if read2 is defined before applying adapters.
Array[String]+? adapterReverse = if (defined(read2) && (defined(adapterRead2) || defined(adapterBoth)))
then select_first([adapterRead2, adapterBoth])
else adapterRead2
# Same for contaminations
Array[String]+? anywhereForward = if (defined(anywhere) || defined(contaminations))
then select_first([anywhere, contaminations])
else anywhere
Array[String]+? anywhereReverse = if (defined(read2) && (defined(anywhereRead2) || defined(contaminations)))
then select_first([anywhereRead2, contaminations])
else anywhereRead2
# FIXME: Use prefix() function for adapter, adapterRead2, etc.
command {
set -e
~{"mkdir -p $(dirname " + read1output + ")"}
......@@ -100,12 +78,12 @@ task Cutadapt {
cutadapt \
~{"--cores=" + cores} \
~{true="-Z" false="" Z} \
~{true="-a" false="" defined(adapterForward)} ~{sep=" -a " adapterForward} \
~{true="-A" false="" defined(adapterReverse)} ~{sep=" -A " adapterReverse} \
~{true="-g" false="" defined(front)} ~{sep=" -g " front} \
~{true="-G" false="" defined(frontRead2)} ~{sep=" -G " frontRead2} \
~{true="-b" false="" defined(anywhereForward)} ~{sep=" -b " anywhereForward} \
~{true="-B" false="" defined(anywhereReverse)} ~{sep=" -B " anywhereReverse} \
~{true="-a" false="" length(adapter) > 0} ~{sep=" -a " adapter} \
~{true="-A" false="" length(adapterRead2) > 0} ~{sep=" -A " adapterRead2} \
~{true="-g" false="" length(front) > 0} ~{sep=" -g " front} \
~{true="-G" false="" length(frontRead2) > 0} ~{sep=" -G " frontRead2} \
~{true="-b" false="" length(anywhere) > 0} ~{sep=" -b " anywhere} \
~{true="-B" false="" length(anywhereRead2) > 0} ~{sep=" -B " anywhereRead2} \
--output ~{read1output} ~{if defined(read2) then "-p " + realRead2output else ""} \
~{"--to-short-output " + tooShortOutputPath} \
~{"--to-short-paired-output " + tooShortPairedOutputPath} \
......
......@@ -26,7 +26,7 @@ task Fastqsplitter {
input {
File inputFastq
Array[String]+ outputPaths
String dockerImage = "quay.io/biocontainers/fastqsplitter:1.0.0--py_0"
String dockerImage = "quay.io/biocontainers/fastqsplitter:1.1.0--py37h516909a_1"
Int? compressionLevel
Int? threadsPerFile
# fastqsplitter utilizes one thread per input file and one or more threads per output file + one thread for the application.
......@@ -34,15 +34,18 @@ task Fastqsplitter {
Int cores = 1 + ceil(0.5 * length(outputPaths))
}
command {
# Busybox mkdir does not accept multiple paths.
command <<<
set -e
mkdir -p $(dirname ~{sep=' ' outputPaths})
for FILE in ~{sep=' ' outputPaths}
do mkdir -p $(dirname $FILE)
done
fastqsplitter \
~{"-c " + compressionLevel} \
~{"-t " + threadsPerFile} \
-i ~{inputFastq} \
-o ~{sep=' -o ' outputPaths}
}
>>>
output {
Array[File] chunks = outputPaths
......
......@@ -7,7 +7,7 @@ task ApplyBQSR {
File inputBamIndex
String outputBamPath
File recalibrationReport
Array[File]+ sequenceGroupInterval
Array[File] sequenceGroupInterval = []
File referenceFasta
File referenceFastaDict
File referenceFastaFai
......@@ -32,7 +32,7 @@ task ApplyBQSR {
--static-quantized-quals 10 \
--static-quantized-quals 20 \
--static-quantized-quals 30 \
-L ~{sep=" -L " sequenceGroupInterval}
~{true="-L" false="" length(sequenceGroupInterval) > 0} ~{sep=' -L ' sequenceGroupInterval}
}
output {
......@@ -53,7 +53,7 @@ task BaseRecalibrator {
File inputBam
File inputBamIndex
String recalibrationReportPath
Array[File]+ sequenceGroupInterval
Array[File] sequenceGroupInterval = []
Array[File]? knownIndelsSitesVCFs
Array[File]? knownIndelsSitesVCFIndexes
File? dbsnpVCF
......@@ -82,7 +82,7 @@ task BaseRecalibrator {
--use-original-qualities \
-O ~{recalibrationReportPath} \
--known-sites ~{sep=" --known-sites " knownIndelsSitesVCFsArg} \
-L ~{sep=" -L " sequenceGroupInterval}
~{true="-L" false="" length(sequenceGroupInterval) > 0} ~{sep=' -L ' sequenceGroupInterval}
}
output {
......@@ -258,12 +258,17 @@ task MuTect2 {
String outputVcf
String tumorSample
String? normalSample
File? germlineResource
File? germlineResourceIndex
File? panelOfNormals
File? panelOfNormalsIndex
String f1r2TarGz = "f1r2.tar.gz"
Array[File]+ intervals
String outputStats = outputVcf + ".stats"
Int memory = 4
Float memoryMultiplier = 3
String dockerImage = "quay.io/biocontainers/gatk4:4.1.0.0--0"
Float memoryMultiplier = 4
String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
}
command {
......@@ -275,7 +280,9 @@ task MuTect2 {
-I ~{sep=" -I " inputBams} \
-tumor ~{tumorSample} \
~{"-normal " + normalSample} \
~{"--germline-resource " + germlineResource} \
~{"--panel-of-normals " + panelOfNormals} \
~{"--f1r2-tar-gz " + f1r2TarGz} \
-O ~{outputVcf} \
-L ~{sep=" -L " intervals}
}
......@@ -283,6 +290,178 @@ task MuTect2 {
output {
File vcfFile = outputVcf
File vcfFileIndex = outputVcf + ".tbi"
File f1r2File = f1r2TarGz
File stats = outputStats
}
runtime {
docker: dockerImage
memory: ceil(memory * memoryMultiplier)
}
}
# Runs GATK LearnReadOrientationModel on one or more Mutect2 f1r2 counts
# archives, producing an artifact-priors table.
task LearnReadOrientationModel {
    input {
        # One or more f1r2 tar.gz files (as produced by Mutect2's --f1r2-tar-gz).
        Array[File]+ f1r2TarGz
        # JVM heap size in GB; container memory is ceil(memory * memoryMultiplier).
        Int memory = 12
        Float memoryMultiplier = 2
        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
    }

    command {
        set -e
        gatk --java-options -Xmx~{memory}G \
        LearnReadOrientationModel \
        -I ~{sep=" -I " f1r2TarGz} \
        -O "artifact-priors.tar.gz"
    }

    output {
        # Fixed file name written by the -O argument above.
        File artifactPriorsTable = "artifact-priors.tar.gz"
    }

    runtime {
        docker: dockerImage
        memory: ceil(memory * memoryMultiplier)
    }
}
# Merges the per-shard .stats files from scattered Mutect2 runs into a single
# stats file using GATK MergeMutectStats.
task MergeStats {
    input {
        # One or more Mutect2 stats files to merge.
        Array[File]+ stats
        # JVM heap size in GB; container memory is ceil(memory * memoryMultiplier).
        Int memory = 14
        Float memoryMultiplier = 2
        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
    }

    command {
        set -e
        gatk --java-options -Xmx~{memory}G \
        MergeMutectStats \
        -stats ~{sep=" -stats " stats} \
        -O "merged.stats"
    }

    output {
        # Fixed file name written by the -O argument above.
        File mergedStats = "merged.stats"
    }

    runtime {
        docker: dockerImage
        memory: ceil(memory * memoryMultiplier)
    }
}
# Runs GATK GetPileupSummaries on a single BAM, writing
# "<outputPrefix>-pileups.table" (input to CalculateContamination).
task GetPileupSummaries {
    input {
        File sampleBam
        File sampleBamIndex
        # VCF of common variants passed via -V.
        File variantsForContamination
        # Sites passed via -L to restrict the traversal.
        File sitesForContamination
        # Index files are not referenced in the command; presumably listed so the
        # execution engine localizes them next to their data files — TODO confirm.
        File variantsForContaminationIndex
        File sitesForContaminationIndex
        String outputPrefix
        # JVM heap size in GB; container memory is ceil(memory * memoryMultiplier).
        Int memory = 12
        Float memoryMultiplier = 2
        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
    }

    command {
        set -e
        gatk --java-options -Xmx~{memory}G \
        GetPileupSummaries \
        -I ~{sampleBam} \
        -V ~{variantsForContamination} \
        -L ~{sitesForContamination} \
        -O ~{outputPrefix + "-pileups.table"}
    }

    output {
        File pileups = outputPrefix + "-pileups.table"
    }

    runtime {
        docker: dockerImage
        memory: ceil(memory * memoryMultiplier)
    }
}
# Runs GATK CalculateContamination on tumor pileups (optionally with matched
# normal pileups), writing a contamination table and a tumor-segmentation table.
task CalculateContamination {
    input {
        File tumorPileups
        # When given, passed via -matched for tumor/normal mode.
        File? normalPileups
        # JVM heap size in GB; container memory is ceil(memory * memoryMultiplier).
        Int memory = 12
        Float memoryMultiplier = 2
        String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
    }

    command {
        set -e
        gatk --java-options -Xmx~{memory}G \
        CalculateContamination \
        -I ~{tumorPileups} \
        ~{"-matched " + normalPileups} \
        -O "contamination.table" \
        --tumor-segmentation "segments.table"
    }

    output {
        # Fixed file names written by the command above.
        File contaminationTable = "contamination.table"
        File mafTumorSegments = "segments.table"
    }

    runtime {
        docker: dockerImage
        memory: ceil(memory * memoryMultiplier)
    }
}
task FilterMutectCalls {
input {
File referenceFasta
File referenceFastaFai
File referenceFastaDict
File unfilteredVcf
File unfilteredVcfIndex
String outputVcf
File? contaminationTable
File? mafTumorSegments
File? artifactPriors
Int uniqueAltReadCount = 4
File mutect2Stats
String? extraArgs
Int memory = 12
Float memoryMultiplier = 2
String dockerImage = "quay.io/biocontainers/gatk4:4.1.2.0--1"
}
command {
set -e
mkdir -p $(dirname ~{outputVcf})
gatk --java-options -Xmx~{memory}G \
FilterMutectCalls \
-R ~{referenceFasta} \
-V ~{unfilteredVcf} \
-O ~{outputVcf} \
~{"--contamination-table " + contaminationTable} \
~{"--tumor-segmentation " + mafTumorSegments} \
~{"--ob-priors " + artifactPriors} \
~{"--unique-alt-read-count " + uniqueAltReadCount} \
~{"-stats " + mutect2Stats} \
--filtering-stats "filtering.stats" \
--showHidden \
~{extraArgs}
}
output {
File filteredVcf = outputVcf
File filteredVcfIndex = outputVcf + ".tbi"
File filteringStats = "filtering.stats"
}
runtime {
......@@ -299,7 +478,7 @@ task SplitNCigarReads {
File referenceFastaDict
File referenceFastaFai
String outputBam
Array[File]+ intervals
Array[File] intervals = []
Int memory = 4
Float memoryMultiplier = 4
......@@ -314,7 +493,7 @@ task SplitNCigarReads {
-I ~{inputBam} \
-R ~{referenceFasta} \
-O ~{outputBam} \
-L ~{sep=' -L ' intervals}
~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals}
}
output {
......@@ -327,3 +506,59 @@ task SplitNCigarReads {
memory: ceil(memory * memoryMultiplier)
}
}
# Runs GATK3 CombineVariants to merge VCFs from different callers into one VCF,
# tagging each record with the caller identifier it came from.
task CombineVariants {
    input {
        String installDir = "/usr" # .jar location in the docker image

        File referenceFasta
        File referenceFastaFai
        File referenceFastaDict
        # Passed to --genotypemergeoption; "UNIQUIFY" suffixes sample names per input.
        String genotypeMergeOption = "UNIQUIFY"
        String filteredRecordsMergeType = "KEEP_IF_ANY_UNFILTERED"
        # identifiers[i] labels variantVcfs[i]; the two arrays must be equal length.
        Array[String]+ identifiers
        Array[File]+ variantVcfs # follow "identifiers" array order
        # Index files are not referenced in the command; presumably listed so they
        # are localized next to the VCFs — TODO confirm.
        Array[File]+ variantIndexes
        String outputPath

        # JVM heap size in GB; container memory is ceil(memory * memoryMultiplier).
        Int memory = 12
        Float memoryMultiplier = 2
        String dockerImage = "broadinstitute/gatk3:3.8-1"
    }

    command <<<
        set -e
        mkdir -p $(dirname "~{outputPath}")

        # build "-V:<ID> <file.vcf>" arguments according to IDs and VCFs to merge
        # Make sure commands are run in bash
        # NOTE: the ~{...} placeholders below are expanded by WDL before the
        # single-quoted script reaches bash, so they are safe inside the quotes.
        bash -c '#!/usr/bin/env bash
        set -eux
        ids=(~{sep=" " identifiers})
        vars=(~{sep=" " variantVcfs})
        V_args=$(
            # Pair each identifier with its VCF by position.
            for (( i = 0; i < ${#ids[@]}; ++i ))
            do
                printf -- "-V:%s %s " "${ids[i]}" "${vars[i]}"
            done
        )
        java -Xmx~{memory}G -jar ~{installDir}/GenomeAnalysisTK.jar \
        -T CombineVariants \
        -R ~{referenceFasta} \
        --genotypemergeoption ~{genotypeMergeOption} \
        --filteredrecordsmergetype ~{filteredRecordsMergeType} \
        --out ~{outputPath} \
        $V_args
        '
    >>>

    output {
        File combinedVcf = outputPath
        File combinedVcfIndex = outputPath + ".tbi"
    }

    runtime {
        docker: dockerImage
        memory: ceil(memory * memoryMultiplier)
    }
}
version 1.0
# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds a minimap2 index (.mmi) for a reference fasta via `minimap2 -d`.
task Indexing {
    input {
        File referenceFile
        String outputPrefix
        Boolean useHomopolymerCompressedKmer = false
        Int kmerSize = 15
        Int minimizerWindowSize = 10
        Int? splitIndex

        Int cores = 1
        Int memory = 4
        String dockerImage = "quay.io/biocontainers/minimap2:2.17--h84994c4_0"
    }

    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        minimap2 \
        ~{true="-H" false="" useHomopolymerCompressedKmer} \
        ~{"-k " + kmerSize} \
        ~{"-w " + minimizerWindowSize} \
        ~{"-I " + splitIndex} \
        ~{"-d " + outputPrefix + ".mmi"} \
        ~{"-t " + cores} \
        ~{referenceFile}
    }

    output {
        File outputIndexFile = outputPrefix + ".mmi"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        referenceFile: "Reference fasta file."
        outputPrefix: "Output directory path + output file prefix."
        useHomopolymerCompressedKmer: "Use homopolymer-compressed k-mer (preferable for PacBio)."
        kmerSize: "K-mer size (no larger than 28)."
        minimizerWindowSize: "Minimizer window size."
        splitIndex: "Split index for every ~NUM input bases."
        outputIndexFile: "Indexed reference file."
    }
}
# Runs minimap2 to map/align a query fasta against a reference, writing the
# result (PAF, or SAM with -a) to outputPrefix.
task Mapping {
    input {
        File queryFile
        File referenceFile
        String outputPrefix
        String presetOption
        Boolean outputSAM = false
        Int? maxFragmentLength
        Int? maxIntronLength
        Boolean? skipSelfAndDualMappings
        Int? retainMaxSecondaryAlignments
        Int? matchingScore
        Int? mismatchPenalty
        String? howToFindGTAG
        Boolean? secondaryAlignment

        Int cores = 4
        Int memory = 7
        String dockerImage = "quay.io/biocontainers/minimap2:2.17--h84994c4_0"
    }

    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        minimap2 \
        ~{"-x " + presetOption} \
        ~{true="-a" false="" outputSAM} \
        ~{"-G " + maxIntronLength} \
        ~{"-F " + maxFragmentLength} \
        ~{true="-X" false="" skipSelfAndDualMappings} \
        ~{"-N " + retainMaxSecondaryAlignments} \
        ~{"-A " + matchingScore} \
        ~{"-B " + mismatchPenalty} \
        ~{"-u " + howToFindGTAG} \
        ~{true="--secondary=yes" false="--secondary=no" secondaryAlignment} \
        ~{"-o " + outputPrefix} \
        ~{"-t " + cores} \
        ~{referenceFile} \
        ~{queryFile}
    }
    # Note on secondaryAlignment: the flag text lives inside the placeholder so
    # that an undefined optional omits the option entirely (minimap2 default),
    # instead of the previous "--secondary=~{...}" which rendered the invalid
    # bare argument "--secondary=" when secondaryAlignment was not set.

    output {
        File outputAlignmentFile = outputPrefix
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        queryFile: "Input fasta file."
        referenceFile: "Reference fasta file."
        outputPrefix: "Output directory path + output file prefix."
        presetOption: "This option applies multiple options at the same time."
        outputSAM: "Output in the SAM format."
        maxFragmentLength: "Max fragment length (effective with -xsr or in the fragment mode)."
        maxIntronLength: "Max intron length (effective with -xsplice; changing -r)."
        skipSelfAndDualMappings: "Skip self and dual mappings (for the all-vs-all mode)."
        retainMaxSecondaryAlignments: "Retain at most INT secondary alignments."
        matchingScore: "Matching score."
        mismatchPenalty: "Mismatch penalty."
        howToFindGTAG: "How to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG."
        secondaryAlignment: "Whether to output secondary alignments."
        outputAlignmentFile: "Mapping and alignment between collections of DNA sequences file."
    }
}
......@@ -38,6 +38,7 @@ task MultiQC {
Boolean verbose = false
Boolean quiet = false
Array[Boolean] finished = [] # An array of booleans that can be used to let multiqc wait on stuff.
Int memory = 4
}
command {
......@@ -86,6 +87,7 @@ task MultiQC {
}
runtime {
memory: memory
docker: dockerImage
}
}
......@@ -8,7 +8,7 @@ task BedToIntervalList {
Int memory = 4
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -52,9 +52,7 @@ task CollectMultipleMetrics {
Int memory = 8
Float memoryMultiplier = 4
# https://raw.githubusercontent.com/BioContainers/multi-package-containers/80886dfea00f3cd9e7ae2edf4fc42816a10e5403/combinations/mulled-v2-23d9f7c700e78129a769e78521eb86d6b8341923%3A8dde04faba6c9ac93fae7e846af3bafd2c331b3b-0.tsv
# Contains r-base=3.4.1,picard=2.18.2
String dockerImage = "quay.io/biocontainers/mulled-v2-23d9f7c700e78129a769e78521eb86d6b8341923:8dde04faba6c9ac93fae7e846af3bafd2c331b3b-0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
......@@ -137,9 +135,7 @@ task CollectRnaSeqMetrics {
Int memory = 8
Float memoryMultiplier = 4.0
# https://raw.githubusercontent.com/BioContainers/multi-package-containers/80886dfea00f3cd9e7ae2edf4fc42816a10e5403/combinations/mulled-v2-23d9f7c700e78129a769e78521eb86d6b8341923%3A8dde04faba6c9ac93fae7e846af3bafd2c331b3b-0.tsv
# Contains r-base=3.4.1,picard=2.18.2
String dockerImage = "quay.io/biocontainers/mulled-v2-23d9f7c700e78129a769e78521eb86d6b8341923:8dde04faba6c9ac93fae7e846af3bafd2c331b3b-0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -178,7 +174,7 @@ task CollectTargetedPcrMetrics {
Int memory = 4
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -216,7 +212,7 @@ task GatherBamFiles {
Int memory = 4
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -250,7 +246,7 @@ task GatherVcfs {
Int memory = 4
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -282,7 +278,7 @@ task MarkDuplicates {
Int memory = 8
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
# The program default for READ_NAME_REGEX is appropriate in nearly every case.
# Sometimes we wish to supply "null" in order to turn off optical duplicate detection
......@@ -335,7 +331,7 @@ task MergeVCFs {
Int memory = 8
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
# Using MergeVcfs instead of GatherVcfs so we can create indices
......@@ -369,7 +365,7 @@ task SamToFastq {
Int memory = 16 # High memory default to avoid crashes.
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
File? NONE
}
......@@ -406,7 +402,7 @@ task ScatterIntervalList {
Int memory = 4
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
command {
......@@ -441,7 +437,7 @@ task SortVcf {
Int memory = 8
Float memoryMultiplier = 3.0
String dockerImage = "quay.io/biocontainers/picard:2.18.26--0"
String dockerImage = "quay.io/biocontainers/picard:2.20.5--0"
}
......
......@@ -267,3 +267,35 @@ task ParallelSingleTrain {
docker: dockerImage
}
}
# Post-processes a Strelka VCF with somaticseq's modify_Strelka.py, then inserts
# a GT FORMAT header line (which Strelka omits) before the first ##FORMAT line.
task ModifyStrelka {
    input {
        String installDir = "/opt/somaticseq/vcfModifier" #the location in the docker image

        File strelkaVCF
        # Was `String?` with a default — contradictory (always defined) and forces
        # a String? -> File coercion in the output block; a plain String with the
        # same default is backward compatible for callers.
        String outputVCFName = basename(strelkaVCF, ".gz")

        Int threads = 1
        String dockerImage = "lethalfang/somaticseq:3.1.0"
    }

    command {
        set -e
        ~{installDir}/modify_Strelka.py \
        -infile ~{strelkaVCF} \
        -outfile "modified_strelka.vcf"
        # Locate the first ##FORMAT header line and insert the GT definition
        # immediately before it, writing the final VCF to outputVCFName.
        first_FORMAT_line_num=$(grep -n -m 1 '##FORMAT' "modified_strelka.vcf" | cut -d : -f 1)
        sed "$first_FORMAT_line_num"'i##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' "modified_strelka.vcf" > ~{outputVCFName}
    }

    output {
        File outputVcf = outputVCFName
    }

    runtime {
        cpu: threads
        docker: dockerImage
    }
}
talon.wdl 0 → 100644
version 1.0
# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Runs TALON's create_abundance_file_from_database to export a per-transcript
# abundance TSV from a TALON database.
task CreateAbundanceFileFromDatabase {
    input {
        File databaseFile
        String outputPrefix
        String genomeBuild
        String annotationVersion
        Boolean filterTranscripts = false
        File? filterPairingsFile

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        create_abundance_file_from_database \
        ~{"--db=" + databaseFile} \
        ~{"--o=" + outputPrefix} \
        ~{"-b " + genomeBuild} \
        ~{"-a " + annotationVersion} \
        ~{true="--filter" false="" filterTranscripts} \
        ~{"-p " + filterPairingsFile}
    }

    output {
        File outputAbundanceFile = outputPrefix + "_talon_abundance.tsv"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        databaseFile: "TALON database."
        outputPrefix: "Output directory path + output file prefix."
        genomeBuild: "Genome build to use."
        annotationVersion: "Which annotation version to use."
        filterTranscripts: "The transcripts in the database will be filtered prior to GTF creation."
        filterPairingsFile: "A file indicating which datasets should be considered together."
        outputAbundanceFile: "Abundance for each transcript in the TALON database across datasets."
    }
}
# Runs TALON's create_GTF_abundance_from_database to export both a GTF of
# observed transcripts and an abundance TSV from a TALON database.
task CreateGtfAbundanceFromDatabase {
    input {
        File databaseFile
        String outputPrefix
        String genomeBuild
        String annotationVersion
        Boolean filterTranscripts = false
        File? filterPairingsFile

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        create_GTF_abundance_from_database \
        ~{"--db=" + databaseFile} \
        ~{"--o=" + outputPrefix} \
        ~{"-b " + genomeBuild} \
        ~{"-a " + annotationVersion} \
        ~{true="--filter" false="" filterTranscripts} \
        ~{"-p " + filterPairingsFile}
    }

    output {
        File outputGTFfile = outputPrefix + "_talon_observedOnly.gtf"
        File outputAbundanceFile = outputPrefix + "_talon_abundance.tsv"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        databaseFile: "TALON database."
        outputPrefix: "Output directory path + output file prefix."
        genomeBuild: "Genome build to use."
        annotationVersion: "Which annotation version to use."
        filterTranscripts: "The transcripts in the database will be filtered prior to GTF creation."
        filterPairingsFile: "A file indicating which datasets should be considered together."
        outputGTFfile: "The genes, transcripts, and exons stored a TALON database in GTF format."
        outputAbundanceFile: "Abundance for each transcript in the TALON database across datasets."
    }
}
# Runs TALON's create_GTF_from_database to export the genes/transcripts/exons
# in a TALON database as a GTF file.
task CreateGtfFromDatabase {
    input {
        File databaseFile
        String outputPrefix
        String genomeBuild
        String annotationVersion
        Boolean observedInDataset = false
        File? whitelistFile
        File? datasetFile

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        create_GTF_from_database \
        ~{"--db=" + databaseFile} \
        ~{"--o=" + outputPrefix} \
        ~{"-b " + genomeBuild} \
        ~{"-a " + annotationVersion} \
        ~{"--whitelist=" + whitelistFile} \
        ~{true="--observed" false="" observedInDataset} \
        ~{"-d " + datasetFile}
    }

    output {
        File outputGTFfile = outputPrefix + "_talon.gtf"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        databaseFile: "TALON database."
        outputPrefix: "Output directory path + output file prefix."
        genomeBuild: "Genome build to use."
        annotationVersion: "Which annotation version to use."
        observedInDataset: "Output only includes transcripts that were observed at least once."
        whitelistFile: "Whitelist file of transcripts to include in the output."
        datasetFile: "A file indicating which datasets should be included."
        outputGTFfile: "The genes, transcripts, and exons stored a TALON database in GTF format."
    }
}
# Runs TALON's initialize_talon_database to build a new TALON database
# (<outputPrefix>.db) from a GTF annotation.
task InitializeTalonDatabase {
    input {
        File GTFfile
        String outputPrefix
        String genomeBuild
        String annotationVersion
        Int minimumLength = 300
        String novelIDprefix = "TALON"
        Int cutoff5p = 500
        Int cutoff3p = 300

        Int cores = 1
        Int memory = 10
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        initialize_talon_database \
        ~{"--f=" + GTFfile} \
        ~{"--o=" + outputPrefix} \
        ~{"--g=" + genomeBuild} \
        ~{"--a=" + annotationVersion} \
        ~{"--l=" + minimumLength} \
        ~{"--idprefix=" + novelIDprefix} \
        ~{"--5p=" + cutoff5p} \
        ~{"--3p=" + cutoff3p}
    }

    output {
        File outputDatabase = outputPrefix + ".db"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        GTFfile: "GTF annotation containing genes, transcripts, and edges."
        outputPrefix: "Output directory path + output file prefix."
        genomeBuild: "Name of genome build that the GTF file is based on (ie hg38)."
        annotationVersion: "Name of supplied annotation (will be used to label data)."
        minimumLength: "Minimum required transcript length."
        novelIDprefix: "Prefix for naming novel discoveries in eventual TALON runs."
        cutoff5p: "Maximum allowable distance (bp) at the 5' end during annotation."
        cutoff3p: "Maximum allowable distance (bp) at the 3' end during annotation."
        outputDatabase: "TALON database."
    }
}
# Runs TALON's map_antisense_genes_to_sense to produce a mapping from each
# antisense gene in the database to its sense gene.
task MapAntisenseGenesToSense {
    input {
        File databaseFile
        String outputPrefix
        String annotationVersion

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        map_antisense_genes_to_sense \
        ~{"--db=" + databaseFile} \
        ~{"--o=" + outputPrefix} \
        ~{"-a " + annotationVersion}
    }

    output {
        File outputAntisenseMapFile = outputPrefix + "_antisense_mapping.gtf"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        databaseFile: "TALON database."
        outputPrefix: "Output directory path + output file prefix."
        annotationVersion: "Which annotation version to use."
        outputAntisenseMapFile: "IDs of the sense gene for every antisense gene in the database."
    }
}
# Runs TALON's summarize_datasets to write a tab-delimited summary of gene and
# transcript counts per dataset in a TALON database.
task SummarizeDatasets {
    input {
        File databaseFile
        String outputPrefix
        File? datasetGroupsCSV

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        # outputPrefix may include a directory component; create it first.
        mkdir -p $(dirname ~{outputPrefix})
        summarize_datasets \
        ~{"--db " + databaseFile} \
        ~{"--o " + outputPrefix} \
        ~{"--groups " + datasetGroupsCSV}
    }

    output {
        File outputSummaryFile = outputPrefix + "_talon_summary.tsv"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        databaseFile: "TALON database."
        outputPrefix: "Output directory path + output file prefix."
        datasetGroupsCSV: "File of comma-delimited dataset groups to process together."
        outputSummaryFile: "Tab-delimited file of gene and transcript counts for each dataset."
    }
}
# Runs TALON to annotate the transcripts in a SAM file against (and update) a
# TALON database.
task Talon {
    input {
        File SAMfile
        File configFile
        File databaseFile
        String outputPrefix
        String genomeBuild
        String configFileName = basename(configFile)
        String SAMfileName = basename(SAMfile)
        Float minimumCoverage = 0.9
        Int minimumIdentity = 0

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/talon:v4.2_cv2"
    }

    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        # Move the config and SAM files into the working directory under their
        # base names. Previously written with draft-2 "${...}" placeholders;
        # normalized to "~{...}" for consistency with this version 1.0 file.
        # NOTE(review): moving localized inputs may fail on engines that localize
        # them read-only or across filesystems — confirm; `cp` would be safer.
        mv ~{configFile} ./~{configFileName}
        mv ~{SAMfile} ./~{SAMfileName}
        talon \
        ~{"--f " + configFileName} \
        ~{"--db " + databaseFile} \
        ~{"--o " + outputPrefix} \
        ~{"--build " + genomeBuild} \
        ~{"--cov " + minimumCoverage} \
        ~{"--identity " + minimumIdentity}
    }

    output {
        # TALON updates the database in place, so the input file is the result.
        File outputUpdatedDatabase = databaseFile
        File outputLog = outputPrefix + "_talon_QC.log"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        SAMfile: "Input SAM file, same one as described in configFile."
        configFile: "Dataset config file."
        databaseFile: "TALON database. Created using initialize_talon_database.py."
        outputPrefix: "Output directory path + output file prefix."
        genomeBuild: "Genome build (i.e. hg38) to use."
        minimumCoverage: "Minimum alignment coverage in order to use a SAM entry."
        minimumIdentity: "Minimum alignment identity in order to use a SAM entry."
        outputUpdatedDatabase: "Updated TALON database."
        outputLog: "Log file from TALON run."
    }
}
version 1.0
# Copyright (c) 2019 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
task CleanSpliceJunctions {
    input {
        File SAMfile
        File referenceGenome
        String outputPrefix
        File spliceJunctionAnnotation
        File? variantFile

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/transcriptclean:v1.0.7_cv1"
    }

    # Run TranscriptClean's standalone splice-junction cleaner. Required
    # arguments are interpolated directly; --v is only emitted when a
    # variant file is provided.
    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        clean_splice_jns \
        --f=~{SAMfile} \
        --g=~{referenceGenome} \
        --o=~{outputPrefix} \
        --s=~{spliceJunctionAnnotation} \
        ~{"--v=" + variantFile}
    }

    output {
        # clean_splice_jns appends this fixed suffix to the prefix.
        File outputCleanedSAM = outputPrefix + "_clean.sam"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        SAMfile: "Input SAM file"
        referenceGenome: "Reference genome fasta file."
        outputPrefix: "Output directory path + output file prefix."
        spliceJunctionAnnotation: "Splice junction file"
        variantFile: "VCF formatted file of variants"

        outputCleanedSAM: "Cleaned sam output file."
    }
}
task GetCorrectedSJsFromLog {
    input {
        File TElogFile
        String outputPrefix

        Int cores = 1
        Int memory = 5
        String dockerImage = "biocontainers/transcriptclean:v1.0.7_cv1"
    }

    # Extract the noncanonical splice junctions that TranscriptClean
    # corrected from its TE log and write them to a TSV file.
    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        get_corrected_SJs_from_log \
        ~{TElogFile} \
        ~{outputPrefix + ".tsv"}
    }

    output {
        File outputCorrectedSJs = outputPrefix + ".tsv"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        TElogFile: "TE log from TranscriptClean."
        outputPrefix: "Output directory path + output file prefix."

        # Fixed typo: "Formely" -> "Formerly".
        outputCorrectedSJs: "Formerly noncanonical splice junctions in BED format."
    }
}
task GetSJsFromGtf {
    input {
        File GTFfile
        File genomeFile
        String outputPrefix
        Int minIntronSize = 21

        Int cores = 1
        Int memory = 8
        String dockerImage = "biocontainers/transcriptclean:v1.0.7_cv1"
    }

    # Extract splice junctions from a GTF annotation for use as a
    # TranscriptClean reference. All arguments are required, so they are
    # interpolated directly rather than via optional-placeholder syntax.
    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        get_SJs_from_gtf \
        --f=~{GTFfile} \
        --g=~{genomeFile} \
        --o=~{outputPrefix}.tsv \
        --minIntronSize=~{minIntronSize}
    }

    output {
        File outputSJsFile = outputPrefix + ".tsv"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        GTFfile: "Input GTF file"
        genomeFile: "Reference genome"
        outputPrefix: "Output directory path + output file prefix."
        minIntronSize: "Minimum size of intron to consider a junction."

        outputSJsFile: "Extracted splice junctions."
    }
}
task GetTranscriptCleanStats {
    input {
        File transcriptCleanSAMfile
        String outputPrefix

        Int cores = 1
        Int memory = 4
        String dockerImage = "biocontainers/transcriptclean:v1.0.7_cv1"
    }

    # Summarize a TranscriptClean run. The tool prints its summary to
    # standard output, which is captured as the task's output file.
    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        get_TranscriptClean_stats ~{transcriptCleanSAMfile} ~{outputPrefix}
    }

    output {
        File outputStatsFile = stdout()
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        transcriptCleanSAMfile: "Output SAM file from TranscriptClean"
        outputPrefix: "Output directory path + output file prefix."

        outputStatsFile: "Summary stats from TranscriptClean run."
    }
}
task TranscriptClean {
    input {
        File SAMfile
        File referenceGenome
        String outputPrefix
        Int maxLenIndel = 5
        Int maxSJoffset = 5
        Boolean correctMismatches = true
        Boolean correctIndels = true
        Boolean dryRun = false
        Boolean primaryOnly = false

        File? spliceJunctionAnnotation
        File? variantFile
        Boolean? correctSJs

        Int cores = 1
        Int memory = 25
        String dockerImage = "biocontainers/transcriptclean:v1.0.7_cv1"
    }

    # Correct mismatches, indels and splice junctions in long-read
    # alignments against the reference genome.
    command {
        set -e
        mkdir -p $(dirname ~{outputPrefix})
        TranscriptClean \
        ~{"-s " + SAMfile} \
        ~{"-g " + referenceGenome} \
        ~{"-o " + outputPrefix} \
        ~{"-j " + spliceJunctionAnnotation} \
        ~{"-v " + variantFile} \
        ~{"--maxLenIndel=" + maxLenIndel} \
        ~{"--maxSJOffset=" + maxSJoffset} \
        ~{true="-m true" false="-m false" correctMismatches} \
        ~{true="-i true" false="-i false" correctIndels} \
        ~{true="--correctSJs=true" false="--correctSJs=false" correctSJs} \
        ~{true="--dryRun" false="" dryRun} \
        ~{true="--primaryOnly" false="" primaryOnly}
    }
    # Fixed: the true-branches previously emitted the literal placeholder
    # text ("-m CORRECTMISMATCHES", "-i CORRECTINDELS",
    # "--correctSJs=CORRECTSJS") instead of the "true" value the
    # TranscriptClean CLI expects for these options.

    output {
        # TranscriptClean appends these fixed suffixes to the prefix.
        File outputTranscriptCleanFasta = outputPrefix + "_clean.fa"
        File outputTranscriptCleanLog = outputPrefix + "_clean.log"
        File outputTranscriptCleanSAM = outputPrefix + "_clean.sam"
        File outputTranscriptCleanTElog = outputPrefix + "_clean.TE.log"
    }

    runtime {
        cpu: cores
        memory: memory
        docker: dockerImage
    }

    parameter_meta {
        SAMfile: "Input SAM file containing transcripts to correct."
        referenceGenome: "Reference genome fasta file."
        outputPrefix: "Output directory path + output file prefix."
        spliceJunctionAnnotation: "Splice junction file"
        variantFile: "VCF formatted file of variants"
        maxLenIndel: "Maximum size indel to correct."
        maxSJoffset: "Maximum distance from annotated splice junction to correct."
        correctMismatches: "Set this to make TranscriptClean correct mismatches."
        correctIndels: "Set this to make TranscriptClean correct indels."
        correctSJs: "Set this to make TranscriptClean correct splice junctions."
        dryRun: "TranscriptClean will read in the data but don't do any correction."
        primaryOnly: "TranscriptClean will only output primary mappings of transcripts."

        outputTranscriptCleanFasta: "Fasta file containing corrected reads."
        outputTranscriptCleanLog: "Log file of TranscriptClean run."
        outputTranscriptCleanSAM: "SAM file containing corrected aligned reads."
        outputTranscriptCleanTElog: "TE log file of TranscriptClean run."
    }
}
......@@ -20,6 +20,13 @@ task VarDict {
Int endColumn = 3
Int geneColumn = 4
Boolean outputCandidateSomaticOnly = true
Boolean outputAllVariantsAtSamePosition = true
Float mappingQuality = 20
Int minimumTotalDepth = 8
Int minimumVariantDepth = 4
Float minimumAlleleFrequency = 0.02
Int threads = 1
Int memory = 16
Float memoryMultiplier = 2.5
......@@ -45,6 +52,12 @@ task VarDict {
~{true="var2vcf_paired.pl" false="var2vcf_valid.pl" defined(normalBam)} \
-N "~{tumorSampleName}~{"|" + normalSampleName}" \
~{true="" false="-E" defined(normalBam)} \
~{true="-M" false="" outputCandidateSomaticOnly} \
~{true="-A" false="" outputAllVariantsAtSamePosition} \
-Q ~{mappingQuality} \
-d ~{minimumTotalDepth} \
-v ~{minimumVariantDepth} \
-f ~{minimumAlleleFrequency} \
> ~{outputVcf}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment