diff --git a/CHANGELOG.md b/CHANGELOG.md index 7813c2090dea4db071f71744add6e89cdfd1516c..9a546718279bc853ad32fff24257e442cda43271 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ that users understand how the changes affect the new version. version 4.0.0-develop --------------------------- ++ TALON: Update `FilterTalonTranscripts` to new version, which removes the + pairingsFile and replaces this with datasetsFile. ++ TALON: Add `GetSpliceJunctions` & `LabelReads` tasks. ++ TALON: Update to version 5.0. ++ Add tasks for pbmm2, the PacBio wrapper for minimap2. + Update the image for chunked-scatter and make use of new features from 0.2.0. + Tuned resource requirements for GATK VariantEval, MultiQC, Picard metrics and STAR. diff --git a/pbmm2.wdl b/pbmm2.wdl new file mode 100644 index 0000000000000000000000000000000000000000..84fbd2d08e32c9b9676e5b24f6d6c0843bb95dce --- /dev/null +++ b/pbmm2.wdl @@ -0,0 +1,74 @@ +version 1.0 + +# Copyright (c) 2020 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Mapping { + input { + String presetOption + Boolean sort = true + String sample + File referenceMMI + File queryFile + + Int cores = 4 + String memory = "30G" + Int timeMinutes = 1 + ceil(size(queryFile, "G") * 200 / cores) + String dockerImage = "quay.io/biocontainers/pbmm2:1.3.0--h56fc30b_1" + } + + command { + pbmm2 align \ + --preset ~{presetOption} \ + ~{true="--sort" false="" sort} \ + -j ~{cores} \ + ~{referenceMMI} \ + ~{queryFile} \ + ~{sample}.align.bam + } + + output { + File outputAlignmentFile = sample + ".align.bam" + File outputIndexFile = sample + ".align.bam.bai" + } + + runtime { + cpu: cores + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + presetOption: {description: "This option applies multiple options at the same time.", category: "required"} + sort: {description: "Sort the output bam file.", category: "advanced"} + sample: {description: "Name of the sample", category: "required"} + referenceMMI: {description: "MMI file for the reference.", category: "required"} + queryFile: {description: "BAM file with reads to align against the reference.", category: "required"} + cores: {description: "The number of cores to be used.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # output + outputAlignmentFile: {description: "Mapped bam file."} + outputIndexFile: {description: "Bam index file."} + } +} diff --git a/scripts b/scripts index b83da72b9b43b956a3062b78fb08044eb9fae464..325a129c14de56b2055ee0e9e0da7dc74df5fec4 160000 --- a/scripts +++ b/scripts @@ -1 +1 @@ -Subproject commit b83da72b9b43b956a3062b78fb08044eb9fae464 +Subproject commit 325a129c14de56b2055ee0e9e0da7dc74df5fec4 diff --git a/talon.wdl b/talon.wdl index 98e0c13a7af5ebe3029f02522dbcdcbbd35c905c..a469ddbac873944a9236cd53611b8fd8b46c7981 100644 --- a/talon.wdl +++ b/talon.wdl @@ -32,7 +32,7 @@ task CreateAbundanceFileFromDatabase { String memory = "4G" Int timeMinutes = 30 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -88,7 +88,7 @@ task CreateGtfFromDatabase { String memory = "4G" Int timeMinutes = 30 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -137,12 +137,16 @@ task FilterTalonTranscripts { File databaseFile String annotationVersion String outputPrefix + Float maxFracA = 0.5 + Int minCount = 5 + Boolean allowGenomic = false - File? pairingsFile + File? datasetsFile + Int? 
minDatasets String memory = "4G" Int timeMinutes = 30 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -152,7 +156,11 @@ task FilterTalonTranscripts { --db=~{databaseFile} \ -a ~{annotationVersion} \ ~{"--o=" + outputPrefix + "_whitelist.csv"} \ - ~{"-p " + pairingsFile} + --maxFracA=~{maxFracA} \ + --minCount=~{minCount} \ + ~{true="--allowGenomic" false="" allowGenomic} \ + ~{"--datasets=" + datasetsFile} \ + ~{"--minDatasets=" + minDatasets} } output { @@ -170,7 +178,11 @@ task FilterTalonTranscripts { databaseFile: {description: "TALON database.", category: "required"} annotationVersion: {description: "Which annotation version to use.", category: "required"} outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} - pairingsFile: {description: "A file indicating which datasets should be considered together.", category: "advanced"} + maxFracA: {description: "Maximum fraction of As to allow in the window located immediately after any read assigned to a novel transcript.", category: "advanced"} + minCount: {description: "Number of minimum occurrences required for a novel transcript PER dataset.", category: "advanced"} + allowGenomic: {description: "If this option is set, transcripts from the Genomic novelty category will be permitted in the output.", category: "advanced"} + datasetsFile: {description: "Datasets to include.", category: "advanced"} + minDatasets: {description: "Minimum number of datasets novel transcripts must be found in.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} @@ -190,7 +202,7 @@ task GetReadAnnotations { String memory = "4G" Int timeMinutes = 30 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -228,6 +240,57 @@ task GetReadAnnotations { } } +task GetSpliceJunctions { + input { + File SJinformationFile + String inputFileType = "db" + File referenceGTF + String runMode = "intron" + String outputPrefix + + String memory = "4G" + Int timeMinutes = 30 + String dockerImage = "biocontainers/talon:v5.0_cv1" + } + + Map[String, String] SJfileType = {"db": "--db", "gtf": "--gtf"} + + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + talon_get_sjs \ + ~{SJfileType[inputFileType] + " " + SJinformationFile} \ + --ref ~{referenceGTF} \ + --mode ~{runMode} \ + --outprefix ~{outputPrefix} + } + + output { + File outputSJfile = outputPrefix + "_" + runMode + "s.tsv" + } + + runtime { + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + SJinformationFile: {description: "TALON GTF file or database from which to extract exons/introns.", category: "required"} + inputFileType: {description: "The file type of SJinformationFile.", category: "common"} + referenceGTF: {description: "GTF reference file (ie GENCODE).", category: "required"} + runMode: {description: "Determines whether to include introns or exons in the output.", category: "common"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputSJfile: {description: "File containing locations, novelty and transcript assignments of exons/introns."} + } +} + task InitializeTalonDatabase { input { File GTFfile @@ -241,7 +304,7 @@ task InitializeTalonDatabase { String memory = "10G" Int timeMinutes = 60 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -287,13 +350,72 @@ task InitializeTalonDatabase { } } +task LabelReads { + input { + File SAMfile + File referenceGenome + Int fracaRangeSize = 20 + String tmpDir = "./tmp_label_reads" + Boolean deleteTmp = true + String outputPrefix + + Int threads = 4 + String memory = "25G" + Int timeMinutes = 2880 + String dockerImage = "biocontainers/talon:v5.0_cv1" + } + + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + talon_label_reads \ + --f=~{SAMfile} \ + --g=~{referenceGenome} \ + --t=~{threads} \ + --ar=~{fracaRangeSize} \ + --tmpDir=~{tmpDir} \ + ~{true="--deleteTmp" false="" deleteTmp} \ + --o=~{outputPrefix} + } + + output { + File outputLabeledSAM = outputPrefix + "_labeled.sam" + File outputReadLabels = outputPrefix + "_read_labels.tsv" + } + + runtime { + cpu: threads + memory: memory + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + SAMfile: {description: "SAM file of transcripts.", category: "required"} + referenceGenome: {description: "Reference genome fasta file.", category: "required"} + fracaRangeSize: {description: "Size of post-transcript interval to compute fraction.", category: "common"} + tmpDir: {description: "Path to directory for tmp files.", category: "advanced"} + deleteTmp: {description: "If set, tmp dir will be removed.", category: "advanced"} + outputPrefix: {description: "Output directory path + output file prefix.", category: "required"} + threads: {description: "The number of threads to 
be used.", category: "advanced"} + memory: {description: "The amount of memory available to the job.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputLabeledSAM: {description: "SAM file with labeled transcripts."} + outputReadLabels: {description: "Tabular file with fraction description per read."} + } +} + task ReformatGtf { input { File GTFfile String memory = "4G" Int timeMinutes = 30 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -334,7 +456,7 @@ task SummarizeDatasets { String memory = "4G" Int timeMinutes = 50 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command { @@ -383,10 +505,10 @@ task Talon { Float minimumIdentity = 0.8 String outputPrefix - Int cores = 4 + Int threads = 4 String memory = "25G" Int timeMinutes = 2880 - String dockerImage = "biocontainers/talon:v4.4.2_cv1" + String dockerImage = "biocontainers/talon:v5.0_cv1" } command <<< @@ -405,7 +527,7 @@ task Talon { ~{"--f " + outputPrefix + "/talonConfigFile.csv"} \ --db ~{databaseFile} \ --build ~{genomeBuild} \ - --threads ~{cores} \ + --threads ~{threads} \ --cov ~{minimumCoverage} \ --identity ~{minimumIdentity} \ ~{"--o " + outputPrefix + "/run"} @@ -419,7 +541,7 @@ task Talon { } runtime { - cpu: cores + cpu: threads memory: memory time_minutes: timeMinutes docker: dockerImage @@ -435,7 +557,7 @@ task Talon { minimumCoverage: {description: "Minimum alignment coverage in order to use a SAM entry.", category: "common"} minimumIdentity: {description: "Minimum alignment identity in order to use a SAM entry.", category: "common" } outputPrefix: {description: "Output directory 
path + output file prefix.", category: "required"} - cores: {description: "The number of cores to be used.", category: "advanced"} + threads: {description: "The number of threads to be used.", category: "advanced"} memory: {description: "The amount of memory available to the job.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}