Merge pull request #161 from biowdl/BIOWDL-354

BIOWDL-354: Update parameter_meta and TALON task cores.

Merge pull request #161 from biowdl/BIOWDL-354
c9ec2639 · Jasper Boom · GitHub · 1ea50377 · 07bac37b · c9ec2639
Unverified Commit c9ec2639 authored 5 years ago by Jasper Boom Committed by GitHub 5 years ago
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,53 +11,55 @@ that users understand how the changes affect the new version.

 version 2.1.0-dev
 ---------------------------
-+ Updated TALON to version 4.4
+ Updated parameter_meta sections for Minimap2 and TranscriptClean to wdl-aid format.
+ Updated cores variable for TALON, the default is now 4.
+ Updated TALON to version 4.4.
 + Added parameter_meta sections to the following tools:
    + htseq
    + cutadapt
    + collect-columns
    + stringtie
    + fastqc
-+ Updated star default image to 2.7.3a
+ Updated star default image to 2.7.3a.
 + Hisat2 now indexes the resulting BAM file.
-+ Samtools index now also works without setting a path for the output
-+ Bugfix: Biowdl-input-converter now makes sure the output directory exists
+ Samtools index now also works without setting a path for the output.
+ Bugfix: Biowdl-input-converter now makes sure the output directory exists.

 version 2.0.0
 ---------------------------
-+ TranscriptClean: Update TranscriptClean to version 2.0.2
+ TranscriptClean: Update TranscriptClean to version 2.0.2.
 + Memory runtime attributes are now Strings indicating total memory, as opposed to Ints indicating memory per core.
 + Memory inputs for most tasks are now Strings, remaining Int memory inputs are renamed to "memoryGb".
 + Use the biowdl-input-converter container for JsonToYaml, to reduce the amount of containers needed.
 + Add biowdl-input-converter and remove SampleConfigToSampleReadgroupLists which it replaces.
-+ GATK.GenotypeGVCFs: Increased memoryMultiplier from 2.0 to 3.0 
-+ Minimap2: Add -k option to minimap2 mapping
-+ Added bwakit task
-+ Minimap2: Add the option for --MD tag
-+ TALON: Update average memory needs for main TALON process
+ GATK.GenotypeGVCFs: Increased memoryMultiplier from 2.0 to 3.0.
+ Minimap2: Add -k option to minimap2 mapping.
+ Added bwakit task.
+ Minimap2: Add the option for --MD tag.
+ TALON: Update average memory needs for main TALON process.

 version 1.0.0
 ---------------------------
-+ Common: Add "SampleConfigToSampleReadgroupLists" task
-+ MultiQC: the "interactive" input is now set to true by default
+ Common: Add "SampleConfigToSampleReadgroupLists" task.
+ MultiQC: the "interactive" input is now set to true by default.
 + Removed deprecated tasks:
  + bioconda.installPrefix
  + mergecounts.MergeCounts
-+ GATK.BaseRecalibrator: "knownIndelsSitesVCFs" and "knownIndelsSitesVCFIndexes" are no longer optional, but now have a default of "[]"
-+ Removed BWA index task
-+ Removed unused "picardJar" input from bwa.wdl
-+ All inputs to bedtools Sort are now reflected in the generated command
-+ TranscriptClean: Update TranscriptClean container to version 1.0.8
-+ Removed "pipefail" from command sections TALON and TranscriptClean
-+ Add WDL task for Minimap2
-+ Add WDL task for TALON
-+ Add WDL task for TranscriptClean
-+ Fastqsplitter: fix mkdir command to work with biocontainer's busybox mkdir
-+ Cutadapt: simplify interface
-+ Bigger memory multiplier in mutect to take in account bigger vmem usage
-+ Cutadapt: Remove default adapter
+ GATK.BaseRecalibrator: "knownIndelsSitesVCFs" and "knownIndelsSitesVCFIndexes" are no longer optional, but now have a default of "[]".
+ Removed BWA index task.
+ Removed unused "picardJar" input from bwa.wdl.
+ All inputs to bedtools Sort are now reflected in the generated command.
+ TranscriptClean: Update TranscriptClean container to version 1.0.8.
+ Removed "pipefail" from command sections TALON and TranscriptClean.
+ Add WDL task for Minimap2.
+ Add WDL task for TALON.
+ Add WDL task for TranscriptClean.
+ Fastqsplitter: fix mkdir command to work with biocontainer's busybox mkdir.
+ Cutadapt: simplify interface.
+ Bigger memory multiplier in mutect to take into account bigger vmem usage.
+ Cutadapt: Remove default adapter.
 + Fastqsplitter: use version 1.1.
-+ Picard: Use version 2.20.5 of the biocontainer as this includes the R dependency
+ Picard: Use version 2.20.5 of the biocontainer as this includes the R dependency.
 + Common: Update dockerTag to dockerImage.
 + GATK: Add CombineVariants task that allows, e.g., to merge VCFs from different callers.
 + Mutect2: Add GATK tasks related to variant filtering (LearnReadOrientationModel, MergeStats, GetPileupSummaries, CalculateContamination and FilterMutectCalls).

--- a/minimap2.wdl
+++ b/minimap2.wdl
@@ -22,11 +22,11 @@ version 1.0

 task Indexing {
    input {
-        File referenceFile
-        String outputPrefix
        Boolean useHomopolymerCompressedKmer = false
        Int kmerSize = 15
        Int minimizerWindowSize = 10
+        String outputPrefix
+        File referenceFile

        Int? splitIndex

@@ -42,9 +42,9 @@ task Indexing {
        ~{true="-H" false="" useHomopolymerCompressedKmer} \
        ~{"-k " + kmerSize} \
        ~{"-w " + minimizerWindowSize} \
-        ~{"-I " + splitIndex} \
        ~{"-d " + outputPrefix + ".mmi"} \
        ~{"-t " + cores} \
+        ~{"-I " + splitIndex} \
        ~{referenceFile}
    }

@@ -59,35 +59,55 @@ task Indexing {
    }

    parameter_meta {
-        referenceFile: "Reference fasta file."
-        outputPrefix: "Output directory path + output file prefix."
-        useHomopolymerCompressedKmer: "Use homopolymer-compressed k-mer (preferrable for PacBio)."
-        kmerSize: "K-mer size (no larger than 28)."
-        minimizerWindowSize: "Minimizer window size."
-        splitIndex: "Split index for every ~NUM input bases."
-
-        outputIndexFile: "Indexed reference file."
+        useHomopolymerCompressedKmer: {
+            description: "Use homopolymer-compressed k-mer (preferable for PacBio).",
+            category: "advanced"
+        }
+        kmerSize: {
+            description: "K-mer size (no larger than 28).",
+            category: "advanced"
+        }
+        minimizerWindowSize: {
+            description: "Minimizer window size.",
+            category: "advanced"
+        }
+        outputPrefix: {
+            description: "Output directory path + output file prefix.",
+            category: "required"
+        }
+        referenceFile: {
+            description: "Reference fasta file.",
+            category: "required"
+        }
+        splitIndex: {
+            description: "Split index for every ~NUM input bases.",
+            category: "advanced"
+        }
+        outputIndexFile: {
+            description: "Indexed reference file.",
+            category: "required"
+        }
    }
 }

 task Mapping {
    input {
-        File queryFile
-        File referenceFile
-        String outputPrefix
        String presetOption
-        Boolean outputSAM = false
        Int kmerSize = 15
+        Boolean skipSelfAndDualMappings = false
+        Boolean outputSAM = false
+        String outputPrefix
+        Boolean addMDtagToSAM = false
+        Boolean secondaryAlignment = false
+        File referenceFile
+        File queryFile

-        Int? maxFragmentLength
        Int? maxIntronLength
-        Boolean? skipSelfAndDualMappings
+        Int? maxFragmentLength
        Int? retainMaxSecondaryAlignments
        Int? matchingScore
        Int? mismatchPenalty
        String? howToFindGTAG
-        Boolean? secondaryAlignment
-        Boolean? addMDtagToSAM

        Int cores = 4
        String memory = "30G"
@@ -99,19 +119,19 @@ task Mapping {
        mkdir -p $(dirname ~{outputPrefix})
        minimap2 \
        ~{"-x " + presetOption} \
+        ~{"-k " + kmerSize} \
+        ~{true="-X" false="" skipSelfAndDualMappings} \
        ~{true="-a" false="" outputSAM} \
+        ~{"-o " + outputPrefix} \
+        ~{true="--MD" false="" addMDtagToSAM} \
+        --secondary=~{true="yes" false="no" secondaryAlignment} \
+        ~{"-t " + cores} \
        ~{"-G " + maxIntronLength} \
        ~{"-F " + maxFragmentLength} \
-        ~{"-k " + kmerSize} \
-        ~{true="-X" false="" skipSelfAndDualMappings} \
        ~{"-N " + retainMaxSecondaryAlignments} \
        ~{"-A " + matchingScore} \
        ~{"-B " + mismatchPenalty} \
        ~{"-u " + howToFindGTAG} \
-        --secondary=~{true="yes" false="no" secondaryAlignment} \
-        ~{true="--MD" false="" addMDtagToSAM} \
-        ~{"-o " + outputPrefix} \
-        ~{"-t " + cores} \
        ~{referenceFile} \
        ~{queryFile}
    }
@@ -127,22 +147,69 @@ task Mapping {
    }

    parameter_meta {
-        queryFile: "Input fasta file."
-        referenceFile: "Reference fasta file."
-        outputPrefix: "Output directory path + output file prefix."
-        presetOption: "This option applies multiple options at the same time."
-        outputSAM: "Output in the SAM format."
-        maxFragmentLength: "Max fragment length (effective with -xsr or in the fragment mode)."
-        maxIntronLength: "Max intron length (effective with -xsplice; changing -r)."
-        kmerSize: "K-mer size (no larger than 28)."
-        skipSelfAndDualMappings: "Skip self and dual mappings (for the all-vs-all mode)."
-        retainMaxSecondaryAlignments: "Retain at most INT secondary alignments."
-        matchingScore: "Matching score."
-        mismatchPenalty: "Mismatch penalty."
-        howToFindGTAG: "How to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG."
-        secondaryAlignment: "Whether to output secondary alignments."
-        addMDtagToSAM: "Adds a MD tag to the SAM output file."
-
-        outputAlignmentFile: "Mapping and alignment between collections of DNA sequences file."
+        presetOption: {
+            description: "This option applies multiple options at the same time.",
+            category: "common"
+        }
+        kmerSize: {
+            description: "K-mer size (no larger than 28).",
+            category: "advanced"
+        }
+        outputSAM: {
+            description: "Output in the SAM format.",
+            category: "common"
+        }
+        outputPrefix: {
+            description: "Output directory path + output file prefix.",
+            category: "required"
+        }
+        maxIntronLength: {
+            description: "Max intron length (effective with -xsplice; changing -r).",
+            category: "advanced"
+        }
+        maxFragmentLength: {
+            description: "Max fragment length (effective with -xsr or in the fragment mode).",
+            category: "advanced"
+        }
+        skipSelfAndDualMappings: {
+            description: "Skip self and dual mappings (for the all-vs-all mode).",
+            category: "advanced"
+        }
+        retainMaxSecondaryAlignments: {
+            description: "Retain at most INT secondary alignments.",
+            category: "advanced"
+        }
+        matchingScore: {
+            description: "Matching score.",
+            category: "advanced"
+        }
+        mismatchPenalty: {
+            description: "Mismatch penalty.",
+            category: "advanced"
+        }
+        howToFindGTAG: {
+            description: "How to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG.",
+            category: "common"
+        }
+        addMDtagToSAM: {
+            description: "Adds a MD tag to the SAM output file.",
+            category: "common"
+        }
+        secondaryAlignment: {
+            description: "Whether to output secondary alignments.",
+            category: "advanced"
+        }
+        referenceFile: {
+            description: "Reference fasta file.",
+            category: "required"
+        }
+        queryFile: {
+            description: "Input fasta file.",
+            category: "required"
+        }
+        outputAlignmentFile: {
+            description: "Mapping and alignment between collections of DNA sequences file.",
+            category: "required"
+        }
    }
 }
--- a/scripts @ 6eaa313f
+++ b/scripts @ 6eaa313f
-Subproject commit e00dc247dac8f4aa91a77d6d307f928cc8449527
+Subproject commit 6eaa313f172f3efb9e62f2140b8d7fb34da6bd9a
--- a/talon.wdl
+++ b/talon.wdl
@@ -456,7 +456,7 @@ task Talon {
        String configFileName = basename(configFile)
        String SAMfileName = basename(SAMfile)

-        Int cores = 1
+        Int cores = 4
        String memory = "20G"
        String dockerImage = "biocontainers/talon:v4.4_cv1"
    }

--- a/transcriptclean.wdl
+++ b/transcriptclean.wdl
@@ -38,8 +38,8 @@ task GetSJsFromGtf {
        get_SJs_from_gtf \
        ~{"--f=" + GTFfile} \
        ~{"--g=" + genomeFile} \
-        ~{"--o=" + outputPrefix + ".tsv"} \
-        ~{"--minIntronSize=" + minIntronSize}
+        ~{"--minIntronSize=" + minIntronSize} \
+        ~{"--o=" + outputPrefix + ".tsv"}
    }

    output {
@@ -53,12 +53,26 @@ task GetSJsFromGtf {
    }

    parameter_meta {
-        GTFfile: "Input GTF file"
-        genomeFile: "Reference genome"
-        outputPrefix: "Output directory path + output file prefix."
-        minIntronSize: "Minimum size of intron to consider a junction."
-
-        outputSJsFile: "Extracted splice junctions."
+        GTFfile: {
+            description: "Input GTF file.",
+            category: "required"
+        }
+        genomeFile: {
+            description: "Reference genome.",
+            category: "required"
+        }
+        minIntronSize: {
+            description: "Minimum size of intron to consider a junction.",
+            category: "advanced"
+        }
+        outputPrefix: {
+            description: "Output directory path + output file prefix.",
+            category: "required"
+        }
+        outputSJsFile: {
+            description: "Extracted splice junctions.",
+            category: "required"
+        }
    }
 }

@@ -91,10 +105,18 @@ task GetTranscriptCleanStats {
    }

    parameter_meta {
-        transcriptCleanSAMfile: "Output SAM file from TranscriptClean"
-        outputPrefix: "Output directory path + output file prefix."
-
-        outputStatsFile: "Summary stats from TranscriptClean run."
+        transcriptCleanSAMfile: {
+            description: "Output SAM file from TranscriptClean.",
+            category: "required"
+        }
+        outputPrefix: {
+            description: "Output directory path + output file prefix.",
+            category: "required"
+        }
+        outputStatsFile: {
+            description: "Summary stats from TranscriptClean run.",
+            category: "required"
+        }
    }
 }

@@ -102,9 +124,9 @@ task TranscriptClean {
    input {
        File SAMfile
        File referenceGenome
-        String outputPrefix
        Int maxLenIndel = 5
        Int maxSJoffset = 5
+        String outputPrefix
        Boolean correctMismatches = true
        Boolean correctIndels = true
        Boolean correctSJs = true
@@ -112,6 +134,7 @@ task TranscriptClean {
        Boolean primaryOnly = false
        Boolean canonOnly = false
        Int bufferSize = 100
+        Boolean deleteTmp = true

        File? spliceJunctionAnnotation
        File? variantFile
@@ -127,11 +150,10 @@ task TranscriptClean {
        TranscriptClean \
        ~{"-s " + SAMfile} \
        ~{"-g " + referenceGenome} \
-        ~{"-o " + outputPrefix} \
-        ~{"-j " + spliceJunctionAnnotation} \
-        ~{"-v " + variantFile} \
+        ~{"-t " + cores} \
        ~{"--maxLenIndel=" + maxLenIndel} \
        ~{"--maxSJOffset=" + maxSJoffset} \
+        ~{"-o " + outputPrefix} \
        ~{true="-m true" false="-m false" correctMismatches} \
        ~{true="-i true" false="-i false" correctIndels} \
        ~{true="--correctSJs=true" false="--correctSJs=false" correctSJs} \
@@ -139,7 +161,9 @@ task TranscriptClean {
        ~{true="--primaryOnly" false="" primaryOnly} \
        ~{true="--canonOnly" false="" canonOnly} \
        ~{"--bufferSize=" + bufferSize} \
-        ~{"-t " + cores}
+        ~{true="--deleteTmp" false="" deleteTmp} \
+        ~{"-j " + spliceJunctionAnnotation} \
+        ~{"-v " + variantFile}
    }

    output {
@@ -156,24 +180,81 @@ task TranscriptClean {
    }

    parameter_meta {
-        SAMfile: "Input SAM file containing transcripts to correct."
-        referenceGenome: "Reference genome fasta file."
-        outputPrefix: "Output directory path + output file prefix."
-        spliceJunctionAnnotation: "Splice junction file."
-        variantFile: "VCF formatted file of variants."
-        maxLenIndel: "Maximum size indel to correct."
-        maxSJoffset: "Maximum distance from annotated splice junction to correct."
-        correctMismatches: "Set this to make TranscriptClean correct mismatches."
-        correctIndels: "Set this to make TranscriptClean correct indels."
-        correctSJs: "Set this to make TranscriptClean correct splice junctions."
-        dryRun: "TranscriptClean will read in the data but don't do any correction."
-        primaryOnly: "TranscriptClean will only output primary mappings of transcripts."
-        canonOnly: "TranscriptClean will output only canonical transcripts and transcript containing annotated noncanonical junctions."
-        bufferSize: "Number of lines to output to file at once by each thread during run."
-
-        outputTranscriptCleanFasta: "Fasta file containing corrected reads."
-        outputTranscriptCleanLog: "Log file of TranscriptClean run."
-        outputTranscriptCleanSAM: "SAM file containing corrected aligned reads."
-        outputTranscriptCleanTElog: "TE log file of TranscriptClean run."
+        SAMfile: {
+            description: "Input SAM file containing transcripts to correct.",
+            category: "required"
+        }
+        referenceGenome: {
+            description: "Reference genome fasta file.",
+            category: "required"
+        }
+        maxLenIndel: {
+            description: "Maximum size indel to correct.",
+            category: "advanced"
+        }
+        maxSJoffset: {
+            description: "Maximum distance from annotated splice junction to correct.",
+            category: "advanced"
+        }
+        outputPrefix: {
+            description: "Output directory path + output file prefix.",
+            category: "required"
+        }
+        correctMismatches: {
+            description: "Set this to make TranscriptClean correct mismatches.",
+            category: "common"
+        }
+        correctIndels: {
+            description: "Set this to make TranscriptClean correct indels.",
+            category: "common"
+        }
+        correctSJs: {
+            description: "Set this to make TranscriptClean correct splice junctions.",
+            category: "common"
+        }
+        dryRun: {
+            description: "TranscriptClean will read in the data but will not perform any correction.",
+            category: "advanced"
+        }
+        primaryOnly: {
+            description: "Only output primary mappings of transcripts.",
+            category: "advanced"
+        }
+        canonOnly: {
+            description: "Only output canonical transcripts and transcripts containing annotated noncanonical junctions.",
+            category: "advanced"
+        }
+        bufferSize: {
+            description: "Number of lines to output to file at once by each thread during run.",
+            category: "common"
+        }
+        deleteTmp: {
+            description: "The temporary directory generated by TranscriptClean will be removed.",
+            category: "common"
+        }
+        spliceJunctionAnnotation: {
+            description: "Splice junction file.",
+            category: "common"
+        }
+        variantFile: {
+            description: "VCF formatted file of variants.",
+            category: "common"
+        }
+        outputTranscriptCleanFasta: {
+            description: "Fasta file containing corrected reads.",
+            category: "required"
+        }
+        outputTranscriptCleanLog: {
+            description: "Log file of TranscriptClean run.",
+            category: "required"
+        }
+        outputTranscriptCleanSAM: {
+            description: "SAM file containing corrected aligned reads.",
+            category: "required"
+        }
+        outputTranscriptCleanTElog: {
+            description: "TE log file of TranscriptClean run.",
+            category: "required"
+        }
   }
 }