diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9f11b755a17d8192c60f61cb17b8902dffbd9f23 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..31667a1596d1fb758353b01ab723cd694e864956 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: java +script: + - set -e + - wget https://github.com/broadinstitute/cromwell/releases/download/30.2/womtool-30.2.jar + - for F in `find -name "*.wdl"`; do java -jar womtool-30.2.jar validate $F; done diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6e2dff36af8a00f40e4ad9e07e7c282111f72da0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Peter van 't Hof + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/biopet.wdl b/biopet.wdl new file mode 100644 index 0000000000000000000000000000000000000000..89293a9a5cbffd680e72dcf4ccfe397f397bd159 --- /dev/null +++ b/biopet.wdl @@ -0,0 +1,161 @@ +task FastqSplitter { + String? preCommand + File inputFastq + String outputPath + Int numberChunks + String tool_jar + Array[Int] chunks = range(numberChunks) + + command { + set -e -o pipefail + ${preCommand} + mkdir -p ${sep=' ' prefix(outputPath + "/chunk_", chunks)} + if [ ${numberChunks} -gt 1 ]; then + SEP="/${basename(inputFastq)} -o " + java -jar ${tool_jar} -I ${inputFastq} -o ${sep='$SEP' prefix(outputPath + "/chunk_", chunks)}/${basename(inputFastq)} + else + ln -sf ${inputFastq} ${outputPath}/chunk_0/${basename(inputFastq)} + fi + } + + output { + Array[File] outputFastqFiles = glob(outputPath + "/chunk_*/" + basename(inputFastq)) + } +} + +task ScatterRegions { + String? preCommand + File ref_fasta + File ref_dict + String outputDirPath + String tool_jar + Int? scatterSize + File? regions + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + mkdir -p ${outputDirPath} + java -Xmx${mem}G -jar ${tool_jar} \ + -R ${ref_fasta} \ + -o ${outputDirPath} \ + ${"-s " + scatterSize} \ + ${"-L " + regions} + } + + output { + Array[File] scatters = glob(outputDirPath + "/scatter-*.bed") + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 2.0])) + } +} + +task SampleConfig { + String? preCommand + String tool_jar + Array[File]+ inputFiles + String? sample + String? library + String? readgroup + String? jsonOutputPath + String? tsvOutputPath + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + mkdir -p . ${"$(dirname " + jsonOutputPath + ")"} ${"$(dirname " + tsvOutputPath + ")"} + java -Xmx${mem}G -jar ${tool_jar} \ + -i ${sep="-i " inputFiles} \ + ${"--sample " + sample} \ + ${"--library " + library} \ + ${"--readgroup " + readgroup} \ + ${"--jsonOutput " + jsonOutputPath} \ + ${"--tsvOutput " + tsvOutputPath} + } + + output { + Array[String] keys = read_lines(stdout()) + File? jsonOutput = jsonOutputPath + File? tsvOutput = tsvOutputPath + Object values = if (defined(tsvOutput) && size(tsvOutput) > 0) then read_map(tsvOutput) else { "": "" } + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 2.0])) + } +} + +task BaseCounter { + String? preCommand + String tool_jar #Should this be of type File? + File bam + File refFlat + String outputDir + String prefix + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 12.0])) + command { + set -e -o pipefail + ${preCommand} + mkdir -p ${outputDir} + java -Xmx${mem}G -jar ${tool_jar} \ + -b ${bam} \ + -r ${refFlat} \ + -o ${outputDir} \ + -p ${prefix} + } + + output { + File exonAntisense = outputDir + "/" + prefix + ".base.exon.antisense.counts" + File exon = outputDir + "/" + prefix + ".base.exon.counts" + File exonMergeAntisense = outputDir + "/" + prefix + ".base.exon.merge.antisense.counts" + File exonMerge = outputDir + "/" + prefix + ".base.exon.merge.counts" + File exonMergeSense = outputDir + "/" + prefix + ".base.exon.merge.sense.counts" + File exonSense = outputDir + "/" + prefix + ".base.exon.sense.counts" + File geneAntisense = outputDir + "/" + prefix + ".base.gene.antisense.counts" + File gene = outputDir + "/" + prefix + ".base.gene.counts" + File geneExonicAntisense = outputDir + "/" + prefix + ".base.gene.exonic.antisense.counts" + File geneExonic = outputDir + "/" + prefix + ".base.gene.exonic.counts" + File geneExonicSense = outputDir + "/" + prefix + ".base.gene.exonic.sense.counts" + File geneIntronicAntisense = outputDir + "/" + prefix + ".base.gene.intronic.antisense.counts" + File geneIntronic = outputDir + "/" + prefix + ".base.gene.intronic.counts" + File geneIntronicSense = outputDir + "/" + prefix + ".base.gene.intronic.sense.counts" + File geneSense = outputDir + "/" + prefix + ".base.gene.sense.counts" + File intronAntisense = outputDir + "/" + prefix + ".base.intron.antisense.counts" + File intron = outputDir + "/" + prefix + ".base.intron.counts" + File intronMergeAntisense = outputDir + "/" + prefix + ".base.intron.merge.antisense.counts" + File intronMerge = outputDir + "/" + prefix + ".base.intron.merge.counts" + File intronMergeSense = outputDir + "/" + prefix + ".base.intron.merge.sense.counts" + File intronSense = outputDir + "/" + prefix + ".base.intron.sense.counts" + File metaExonsNonStranded = outputDir + "/" + prefix + ".base.metaexons.non_stranded.counts" + File metaExonsStrandedAntisense = outputDir + "/" + prefix + ".base.metaexons.stranded.antisense.counts" + File metaExonsStranded = outputDir + "/" + prefix + ".base.metaexons.stranded.counts" + File metaExonsStrandedSense = outputDir + "/" + prefix + ".base.metaexons.stranded.sense.counts" + File transcriptAntisense = outputDir + "/" + prefix + ".base.transcript.antisense.counts" + File transcript = outputDir + "/" + prefix + ".base.transcript.counts" + File transcriptExonicAntisense = outputDir + "/" + prefix + ".base.transcript.exonic.antisense.counts" + File transcriptExonic = outputDir + "/" + prefix + ".base.transcript.exonic.counts" + File transcriptExonicSense = outputDir + "/" + prefix + ".base.transcript.exonic.sense.counts" + File transcriptIntronicAntisense = outputDir + "/" + prefix + ".base.transcript.intronic.antisense.counts" + File transcriptIntronic = outputDir + "/" + prefix + ".base.transcript.intronic.counts" + File transcriptIntronicSense = outputDir + "/" + prefix + ".base.transcript.intronic.sense.counts" + File transcriptSense = outputDir + "/" + prefix + ".base.transcript.sense.counts" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} diff --git a/bwa.wdl b/bwa.wdl index 0a2cb1a32afd56c9239f8b141180be120b6618ba..0a8b37fa8ad3fec1d7d3400f6e9b4b5d342dba0a 100644 --- a/bwa.wdl +++ b/bwa.wdl @@ -1,131 +1,28 @@ -task mem { - File read1 - File indexBase - Array[File] indexFiles # Not used by the command, but needed so cromwell does provide the proper links. - File? read2 - String? outputFile = "aligned.bam" +task BwaMem { String? preCommand - Int? threads = 1 - Int? memory = 8 - Int? minimumSeedLength - Int? w - Int? d - Float? r - Int? y - Int? c - Int? D - Int? m - Int? W - Boolean? skipMateRescue - Boolean? skipPairing - Int? matchScore - Int? mismatchPenalty - String? gapOpenPenalty - String? gapExtensionPenalty - String? endClippingPenalty - String? unpairedReadPairPenalty - String? readType - Boolean? smartPairing - String? readGroupHeaderLine - String? H - Boolean? j - Boolean? five - Boolean? q - Int? K - Int? minimumOutputScore - String? h - Boolean? a - Boolean? appendComment - Boolean? V - Boolean? Y - Boolean? M - String? I - parameter_meta { - referenceFiles: "Should contain all the index files from the index task" - } - command { - set -e -o pipefail - ${"mkdir -p $(dirname " + outputFile + ")"} - ${preCommand} - bwa mem \ - ${"-t " + threads } \ - ${"-k " + minimumSeedLength } \ - ${"-w " + w } \ - ${"-d " + d } \ - ${"-r " + r } \ - ${"-y " + y } \ - ${"-c " + c } \ - ${"-D " + D } \ - ${"-W " + W } \ - ${"-m " + m } \ - ${true="-s " false="" skipMateRescue } \ - ${true="-P " false="" skipPairing } \ - ${"-A " + matchScore } \ - ${"-B " + mismatchPenalty } \ - ${"-O " + gapOpenPenalty } \ - ${"-E " + gapExtensionPenalty } \ - ${"-L " + endClippingPenalty } \ - ${"-U " + unpairedReadPairPenalty } \ - ${"-x " + readType } \ - ${true="-p " false="" smartPairing} \ - ${"-r " + readGroupHeaderLine} \ - ${"-H " + H } \ - ${true="-j" false="" j } \ - ${true="-5" false="" five } \ - ${true="-q" false="" q } \ - ${"-K " + K } \ - ${"-T " + minimumOutputScore } \ - ${"-h " + h } \ - ${true="-a" false="" a } \ - ${true="-C" false="" appendComment } \ - ${true="-V" false="" V } \ - ${true="-Y" false="" Y } \ - ${true="-M" false="" M } \ - ${"-I " + I } \ - ${indexBase} ${read1} ${read2} \ - | picard SortSam CREATE_INDEX=TRUE TMP_DIR=null \ - INPUT=/dev/stdin SORT_ORDER=coordinate OUTPUT=${outputFile} - ln -s $(basename ${outputFile} | sed 's/.bam$/.bai/') ${outputFile}.bai - } - output { - File bamFile = select_first([outputFile]) - File bamIndexPicard = sub(bamFile, ".bam$", ".bai") - File bamIndexSamtools = select_first([outputFile]) + ".bai" - } - runtime { - cpu: select_first([threads]) - memory: select_first([memory]) - } -} + File inputR1 + File? inputR2 + String referenceFasta + String outputPath + String? readgroup -task index { - File fasta - String? preCommand - String? constructionAlgorithm - Int? blockSize - String? outputDir - String fastaFilename = basename(fasta) + Int? threads + Int? memory command { set -e -o pipefail - ${"mkdir -p " + outputDir} + mkdir -p $(dirname ${outputPath}) ${preCommand} - ln -sf ${fasta} ${outputDir + "/"}${fastaFilename} - bwa index \ - ${"-a " + constructionAlgorithm} \ - ${"-b" + blockSize} \ - ${outputDir + "/"}${fastaFilename} + bwa mem ${"-t " + threads} \ + ${"-R '" + readgroup + "'"} \ + ${referenceFasta} ${inputR1} ${inputR2} | samtools sort --output-fmt BAM - > ${outputPath} } output { - File indexBase = if (defined(outputDir)) then select_first([outputDir]) + "/" + fastaFilename else fastaFilename - File indexedFasta = indexBase - Array[File] indexFiles = [indexBase + ".bwt",indexBase + ".pac",indexBase + ".sa",indexBase + ".amb",indexBase + ".ann"] + File bamFile = outputPath } - parameter_meta { - fasta: "Fasta file to be indexed" - constructionAlgorithm: "-a STR BWT construction algorithm: bwtsw, is or rb2 [auto]" - blockSize: "-b INT block size for the bwtsw algorithm (effective with -a bwtsw) [10000000]" - outputDir: "index will be created in this output directory" + runtime{ + cpu: if defined(threads) then threads else 1 + memory: if defined(memory) then memory else 8 } -} \ No newline at end of file +} diff --git a/common.wdl b/common.wdl index 91f8ffd93da76706f1d5567214da32c9f35faf50..4acd8bd48c0fb60a954f26a6b4b7bf654dcecc01 100644 --- a/common.wdl +++ b/common.wdl @@ -1,31 +1,49 @@ task objectMd5 { Object the_object + command { cat ${write_object(the_object)} | md5sum - | sed -e 's/ -//' } + output { String md5sum = read_string(stdout()) } + + runtime { + memory: 1 + } } task mapMd5 { Map[String,String] map + command { - cat ${write_map(map)} | md5sum - | sed -e 's/ -//' + cat ${write_map(map)} | md5sum - | sed -e 's/ -//' } + output { String md5sum = read_string(stdout()) } + + runtime { + memory: 1 + } } task stringArrayMd5 { Array[String] stringArray + command { - set -eu -o pipefail - echo ${sep=',' stringArray} | md5sum - | sed -e 's/ -//' + set -eu -o pipefail + echo ${sep=',' stringArray} | md5sum - | sed -e 's/ -//' } + output { - String md5sum = read_string(stdout()) + String md5sum = read_string(stdout()) + } + + runtime { + memory: 1 } } @@ -33,37 +51,68 @@ task concatenateTextFiles { Array[File] fileList String combinedFilePath Boolean? unzip=false + command { set -e -o pipefail ${"mkdir -p $(dirname " + combinedFilePath + ")"} ${true='zcat' false= 'cat' unzip} ${sep=' ' fileList} \ > ${combinedFilePath} } + output { File combinedFile = combinedFilePath } + + runtime { + memory: 1 + } } # inspired by https://gatkforums.broadinstitute.org/wdl/discussion/9616/is-there-a-way-to-flatten-arrays task flattenStringArray { Array[Array[String]] arrayList + command { - for line in $(echo ${sep=', ' arrayList}) ; \ - do echo $line | tr -d '"[],' ; done + for line in $(echo ${sep=', ' arrayList}) ; \ + do echo $line | tr -d '"[],' ; done } + output { Array[String] flattenedArray = read_lines(stdout()) } + + runtime { + memory: 1 + } } task appendToStringArray { Array[String] array String string + command { echo "${sep='\n' array} ${string}" } + output { Array[String] out_array = read_lines(stdout()) } + + runtime { + memory: 1 + } +} + +task createLink { + File inputFile + String outputPath + + command { + ln -sf ${inputFile} ${outputPath} + } + + output { + File link = outputPath + } } \ No newline at end of file diff --git a/fastqc.wdl b/fastqc.wdl index ed724dd92e7e32c9595f6f888f87dbaea12117b4..b361f10d59c0731d195254f7f281580d1d15c8de 100644 --- a/fastqc.wdl +++ b/fastqc.wdl @@ -48,15 +48,7 @@ task fastqc { File rawReport = reportDir + "/fastqc_data.txt" File htmlReport = reportDir + "/fastqc_report.html" File summary = reportDir + "/summary.txt" - File adapterContent = reportDir + "/Images/adapter_content.png" - File duplicationLevels = reportDir + "/Images/duplication_levels.png" - File perBaseNContent = reportDir + "/Images/per_base_n_content.png" - File perBaseQuality = reportDir + "/Images/per_base_quality.png" - File perBaseSequenceContent = reportDir + "/Images/per_base_sequence_content.png" - File perSequenceGCContent = reportDir + "/Images/per_sequence_gc_content.png" - File perSequenceQuality = reportDir + "/Images/per_sequence_quality.png" - #File perTileQuality = reportDir + "/Images/per_tile_quality.png" - File sequenceLengthDistribution = reportDir + "/Images/sequence_length_distribution.png" + Array[File] images = glob(reportDir + "/Images/*.png") } runtime { @@ -75,10 +67,15 @@ task extractAdapters { File? knownAdapterFile Float? adapterCutoff Boolean? outputAsFasta + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) command { set -e mkdir -p ${outputDir} - java -jar ${extractAdaptersFastqcJar} \ + java -Xmx${mem}G -jar ${extractAdaptersFastqcJar} \ --inputFile ${inputFile} \ ${"--adapterOutputFile " + adapterOutputFilePath } \ ${"--contamsOutputFile " + contamsOutputFilePath } \ @@ -95,23 +92,30 @@ task extractAdapters { Array[String] adapterList = read_lines(select_first([adapterOutputFilePath])) Array[String] contamsList = read_lines(select_first([contamsOutputFilePath])) } + runtime { - memory: 8 + memory: ceil(mem * select_first([memoryMultiplier, 2.5])) } } task getConfiguration { String? preCommand String? fastqcDirFile = "fastqcDir.txt" + command { set -e -o pipefail ${preCommand} echo $(dirname $(readlink -f $(which fastqc))) > ${fastqcDirFile} } + output { String fastqcDir = read_string(fastqcDirFile) File adapterList = fastqcDir + "/Configuration/adapter_list.txt" File contaminantList = fastqcDir + "/Configuration/contaminant_list.txt" File limits = fastqcDir + "/Configuration/limits.txt" } + + runtime { + memory: 1 + } } diff --git a/gatk.wdl b/gatk.wdl new file mode 100644 index 0000000000000000000000000000000000000000..160849ad00e3d849bfb26a44ce717b73e2c4918f --- /dev/null +++ b/gatk.wdl @@ -0,0 +1,286 @@ +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + String? preCommand + String gatk_jar + String input_bam + String input_bam_index + String recalibration_report_filename + Array[File]+ sequence_group_interval + Array[File]+ known_indels_sites_VCFs + Array[File]+ known_indels_sites_indices + File ref_dict + File ref_fasta + File ref_fasta_index + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java -Xms${mem}G -jar ${gatk_jar} \ + BaseRecalibrator \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --use-original-qualities \ + -O ${recalibration_report_filename} \ + --known-sites ${sep=" --known-sites " known_indels_sites_VCFs} \ + -L ${sep=" -L " sequence_group_interval} + } + + output { + File recalibration_report = "${recalibration_report_filename}" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + String? preCommand + String gatk_jar + String input_bam + String output_bam_path + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int? compression_level + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xms${mem}G -jar ${gatk_jar} \ + ApplyBQSR \ + --create-output-bam-md5 \ + --add-output-sam-program-record \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --use-original-qualities \ + -O ${output_bam_path} \ + -bqsr ${recalibration_report} \ + --static-quantized-quals 10 --static-quantized-quals 20 --static-quantized-quals 30 \ + -L ${sep=" -L " sequence_group_interval} + } + + output { + File recalibrated_bam = "${output_bam_path}" + File recalibrated_bam_checksum = "${output_bam_path}.md5" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +task GatherBqsrReports { + String? preCommand + String gatk_jar + Array[File] input_bqsr_reports + String output_report_filepath + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java -Xms${mem}G -jar ${gatk_jar} \ + GatherBQSRReports \ + -I ${sep=' -I ' input_bqsr_reports} \ + -O ${output_report_filepath} + } + + output { + File output_bqsr_report = "${output_report_filepath}" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Call variants on a single sample with HaplotypeCaller to produce a GVCF +task HaplotypeCallerGvcf { + String? preCommand + Array[File]+ input_bams + Array[File]+ input_bams_index + Array[File]+ interval_list + String gvcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Int? compression_level + String gatk_jar + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${gatk_jar} \ + HaplotypeCaller \ + -R ${ref_fasta} \ + -O ${gvcf_basename}.vcf.gz \ + -I ${sep=" -I " input_bams} \ + -L ${sep=' -L ' interval_list} \ + -contamination ${default=0 contamination} \ + -ERC GVCF + } + + output { + File output_gvcf = "${gvcf_basename}.vcf.gz" + File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +task GenotypeGVCFs { + String? preCommand + File gvcf_files + File gvcf_file_indexes + Array[File]+ intervals + + String output_basename + + String gatk_jar + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Int? compression_level + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${gatk_jar} \ + GenotypeGVCFs \ + -R ${ref_fasta} \ + -O ${output_basename + ".vcf.gz"} \ + -D ${dbsnp_vcf} \ + -G StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + -new-qual \ + -V ${gvcf_files} \ + -L ${sep=' -L ' intervals} + } + + output { + File output_vcf = output_basename + ".vcf.gz" + File output_vcf_index = output_basename + ".vcf.gz.tbi" + } + + runtime{ + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +task CombineGVCFs { + String? preCommand + Array[File]+ gvcf_files + Array[File]+ gvcf_file_indexes + Array[File]+ intervals + + String output_basename + + String gatk_jar + + File ref_fasta + File ref_fasta_index + File ref_dict + + Int? compression_level + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + + if [ ${length(gvcf_files)} -gt 1 ]; then + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${gatk_jar} \ + CombineGVCFs \ + -R ${ref_fasta} \ + -O ${output_basename + ".vcf.gz"} \ + -V ${sep=' -V ' gvcf_files} \ + -L ${sep=' -L ' intervals} + else + ln -sf ${select_first(gvcf_files)} ${output_basename + ".vcf.gz"} + ln -sf ${select_first(gvcf_files)}.tbi ${output_basename + ".vcf.gz.tbi"} + fi + } + + output { + File output_gvcf = output_basename + ".vcf.gz" + File output_gvcf_index = output_basename + ".vcf.gz.tbi" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +task SplitNCigarReads { + String? preCommand + + File input_bam + File ref_fasta + File ref_fasta_index + File ref_dict + String output_bam + String gatk_jar + Array[File]+ intervals + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java -Xms${mem}G -jar ${gatk_jar} \ + SplitNCigarReads \ + -I ${input_bam} \ + -R ${ref_fasta} \ + -O ${output_bam} # might have to be -o depending on GATK version \ + -L ${sep=' -L ' intervals} + } + + output { + File bam = output_bam + File bam_index = output_bam + ".bai" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} diff --git a/htseq.wdl b/htseq.wdl new file mode 100644 index 0000000000000000000000000000000000000000..6376e3ebeac324848bc20fe2a73f6be9fa6a13b2 --- /dev/null +++ b/htseq.wdl @@ -0,0 +1,31 @@ +task HTSeqCount { + String? preCommand + Array[File] alignmentFiles + File gtfFile + String outputTable + String? format + String? order + String? stranded + + Int? memory + + command { + set -e -o pipefail + ${preCommand} + htseq-count \ + -f ${default="bam" format} \ + -r ${default="pos" order} \ + -s ${default="no" stranded} \ + ${sep=" " alignmentFiles} \ + ${gtfFile} \ + > ${outputTable} + } + + output { + File counts = outputTable + } + + runtime { + memory: select_first([memory, 3]) + } +} \ No newline at end of file diff --git a/mergecounts.wdl b/mergecounts.wdl new file mode 100644 index 0000000000000000000000000000000000000000..c2373f7f02596607f1c9999ecfb35ff49aa6c43a --- /dev/null +++ b/mergecounts.wdl @@ -0,0 +1,39 @@ +task MergeCounts { + String? preCommand + + Array[File] inputFiles + String outputFile + String idVar + String measurementVar + + # Based on a script by Szymon Kielbasa/Ioannis Moustakas + command <<< + set -e -o pipefail + ${preCommand} + R --no-save --slave <<CODE > ${outputFile} + library(dplyr) + library(reshape2) + + listOfFiles <- c("${sep='", "' inputFiles}") + + d <- do.call(rbind, lapply(listOfFiles, function(file){ + d <- read.table(file, header=TRUE, comment.char="#") + colI <- grep(${measurementVar}, colnames(d)) + colnames(d)[colI] <- strsplit(file, "/")[[1]][3] + d <- d %>% melt(id.vars=${idVar}, measure.vars=colI, + variable.name="sample", value.name="count") + })) + + d <- d %>% dcast(paste0(${idVar}, " ~ sample"), value.var="count") + write.table(d, sep="\t", quote=FALSE, row.names=FALSE) + CODE + >>> + + output { + File mergedCounts = outputFile + } + + runtime { + memory: 4 + (2*length(inputFiles)) + } +} \ No newline at end of file diff --git a/picard.wdl b/picard.wdl new file mode 100644 index 0000000000000000000000000000000000000000..104261816f42dea6126dc5c645ab7871e618fe1e --- /dev/null +++ b/picard.wdl @@ -0,0 +1,153 @@ +task ScatterIntervalList { + String? preCommand + File interval_list + Int scatter_count + String picard_jar + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + mkdir scatter_list + java -Xmx${mem}G -jar ${picard_jar} \ + IntervalListTools \ + SCATTER_COUNT=${scatter_count} \ + SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ + UNIQUE=true \ + SORT=true \ + INPUT=${interval_list} \ + OUTPUT=scatter_list + } + + output { + Array[File] out = glob("scatter_list/*/*.interval_list") + Int interval_count = read_int(stdout()) + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs +task GatherBamFiles { + String? preCommand + Array[File]+ input_bams + String output_bam_path + Int? compression_level + String picard_jar + + Float? memory + Float? memoryMultiplier + + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${picard_jar} \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_path} \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + } + + output { + File output_bam = "${output_bam_path}" + File output_bam_index = sub(output_bam_path, ".bam$", ".bai") + File output_bam_md5 = "${output_bam_path}.md5" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + String? preCommand + Array[File] input_bams + String output_bam_path + String metrics_path + Int? compression_level + String picard_jar + + Float? memory + Float? memoryMultiplier + + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection + # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing + String? read_name_regex + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + mkdir -p $(dirname ${output_bam_path}) + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${picard_jar} \ + MarkDuplicates \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_path} \ + METRICS_FILE=${metrics_path} \ + VALIDATION_STRINGENCY=SILENT \ + ${"READ_NAME_REGEX=" + read_name_regex} \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + CLEAR_DT="false" \ + CREATE_INDEX=true \ + ADD_PG_TAG_TO_READS=false + } + + output { + File output_bam = output_bam_path + File output_bam_index = sub(output_bam_path, ".bam$", ".bai") + File duplicate_metrics = metrics_path + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} + +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +task MergeVCFs { + String? preCommand + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_vcf_path + Int? compression_level + String picard_jar + + Float? memory + Float? memoryMultiplier + + # Using MergeVcfs instead of GatherVcfs so we can create indices + # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket + Int mem = ceil(select_first([memory, 4.0])) + command { + set -e -o pipefail + ${preCommand} + java ${"-Dsamjdk.compression_level=" + compression_level} \ + -Xmx${mem}G -jar ${picard_jar} \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${output_vcf_path} + } + + output { + File output_vcf = output_vcf_path + File output_vcf_index = output_vcf_path + ".tbi" + } + + runtime { + memory: ceil(mem * select_first([memoryMultiplier, 1.5])) + } +} \ No newline at end of file diff --git a/samtools.wdl b/samtools.wdl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..249143ffa7d4215650b6b8dce6d0b2b216548d2b 100644 --- a/samtools.wdl +++ b/samtools.wdl @@ -0,0 +1,67 @@ +task Index { + String? preCommand + String bamFilePath + + command { + set -e -o pipefail + ${preCommand} + samtools index ${bamFilePath} + } + + output { + File indexFile = bamFilePath + ".bai" + } +} + +task Merge { + String? preCommand + Array[File]+ bamFiles + String outputBamPath + + command { + set -e -o pipefail + ${preCommand} + if [ ${length(bamFiles)} -gt 1 ] + then + samtools merge ${outputBamPath} ${sep=' ' bamFiles} + else + ln -sf ${bamFiles} ${outputBamPath} + fi + } + + output { + File outputBam = outputBamPath + } +} + +task Markdup { + String? preCommand + File inputBam + String outputBamPath + + command { + set -e -o pipefail + ${preCommand} + samtools markdup ${inputBam} ${outputBamPath} + } + + output { + File outputBam = outputBamPath + } +} + +task Flagstat { + String? preCommand + File inputBam + String outputPath + + command { + set -e -o pipefail + ${preCommand} + samtools flagstat ${inputBam} > ${outputPath} + } + + output { + File flagstat = outputPath + } +} diff --git a/star.wdl b/star.wdl new file mode 100644 index 0000000000000000000000000000000000000000..d7d3b7b595953704ab0de936b82e1ba7405fe279 --- /dev/null +++ b/star.wdl @@ -0,0 +1,48 @@ +task Star { + String? preCommand + + Array[File] inputR1 + Array[File?] inputR2 + String genomeDir + String outFileNamePrefix + + String? outSAMtype + String? readFilesCommand + Int? runThreadN + String? outStd + String? twopassMode + Array[String]? outSAMattrRGline + + Int? memory + + #TODO needs to be extended for all possible output extensions + Map[String, String] samOutputNames = {"BAM SortedByCoordinate": "sortedByCoord.out.bam"} + + # converts String? to String for use as key (for the Map above) in output + String key = select_first([outSAMtype, "BAM SortedByCoordinate"]) + + command { + set -e -o pipefail + mkdir -p ${sub(outFileNamePrefix, basename(outFileNamePrefix) + "$", "")} + ${preCommand} + STAR \ + --readFilesIn ${sep=',' inputR1} ${sep="," inputR2} \ + --outFileNamePrefix ${outFileNamePrefix} \ + --genomeDir ${genomeDir} \ + --outSAMtype ${default="BAM SortedByCoordinate" outSAMtype} \ + --readFilesCommand ${default="zcat" readFilesCommand} \ + ${"--runThreadN " + runThreadN} \ + ${"--outStd " + outStd} \ + ${"--twopassMode " + twopassMode} \ + ${true="--outSAMattrRGline " false="" defined(outSAMattrRGline)} ${sep=" , " outSAMattrRGline} + } + + output { + File bamFile = outFileNamePrefix + "Aligned." + samOutputNames[key] + } + + runtime { + cpu: select_first([runThreadN, 1]) + memory: select_first([memory, 10]) + } +} \ No newline at end of file diff --git a/stringtie.wdl b/stringtie.wdl new file mode 100644 index 0000000000000000000000000000000000000000..5fdcd6ddedcac23e06eb4728b01289a95eaa665e --- /dev/null +++ b/stringtie.wdl @@ -0,0 +1,33 @@ +task Stringtie { + String? preCommand + File alignedReads + File? referenceGtf + Int? threads + String assembledTranscriptsFile + Boolean? firstStranded + Boolean? secondStranded + String? geneAbundanceFile + + command { + set -e -o pipefail + ${preCommand} + stringtie \ + ${"-p " + threads} \ + ${"-G " + referenceGtf} \ + ${true="--rf" false="" firstStranded} \ + ${true="fr" false="" secondStranded} \ + -o ${assembledTranscriptsFile} \ + ${"-A " + geneAbundanceFile} \ + ${alignedReads} \ + + } + + output { + File assembledTranscripts = assembledTranscriptsFile + File? geneAbundance = geneAbundanceFile + } + + runtime { + cpu: select_first([threads, 1]) + } +} \ No newline at end of file