Update Picard and reevaluate use of Intel inflater/deflater

09b97388 · Ruben Vorderman · 7b2d86fe · 09b97388
Commit 09b97388 authored 3 years ago by Ruben Vorderman
--- a/picard.wdl
+++ b/picard.wdl
@@ -29,7 +29,7 @@ task BedToIntervalList {
        String javaXmx = "3G"
        String memory = "4G"
        Int timeMinutes = 5
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -89,7 +89,7 @@ task CollectHsMetrics {
        # Additional * 2 because picard multiple metrics reads the
        # reference fasta twice.
        Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -158,7 +158,7 @@ task CollectMultipleMetrics {
        Int memoryMb = javaXmxMb + 512
        # Additional * 2 because picard multiple metrics reads the reference fasta twice.
        Int timeMinutes = 1 + ceil(size(referenceFasta, "G") * 3 * 2) + ceil(size(inputBam, "G") * 6)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -284,7 +284,7 @@ task CollectRnaSeqMetrics {
        String memory = "9G"
        # With 6 minutes per G there were several timeouts.
        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 12)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -342,7 +342,7 @@ task CollectTargetedPcrMetrics {
        String javaXmx = "3G"
        String memory = "4G"
        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 6)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -404,7 +404,7 @@ task CollectVariantCallingMetrics {
        String javaXmx =  "8G"
        String memory = "9G"
        Int timeMinutes = 1440
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -453,7 +453,7 @@ task CreateSequenceDictionary {

        String javaXmx = "2G"
        String memory = "3G"
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -497,13 +497,15 @@ task GatherBamFiles {
        String outputBamPath
        Boolean createMd5File = false

-        Int? compressionLevel
+        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater

        Int javaXmxMb = 1024
        Int memoryMb = javaXmxMb + 512
        # One minute per input gigabyte.
        Int timeMinutes = 1 + ceil(size(inputBams, "G") * 1)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -513,7 +515,9 @@ task GatherBamFiles {
        GatherBamFiles \
        INPUT=~{sep=' INPUT=' inputBams} \
        OUTPUT=~{outputBamPath} \
-        ~{"COMPRESSION_LEVEL=" + compressionLevel} \
+        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \
        CREATE_INDEX=true \
        CREATE_MD5_FILE=~{true="true" false="false" createMd5File}
    }
@@ -536,7 +540,9 @@ task GatherBamFiles {
        inputBamsIndex: {description: "The indexes of the input BAM files.", category: "required"}
        outputBamPath: {description: "The path where the merged BAM file will be written.", caregory: "required"}
        createMd5File: {decription: "Whether to create an md5 file of the output BAM.", category: "advanced"}
-        compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"}
+        compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"}
        javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"}
        memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
@@ -555,10 +561,14 @@ task GatherVcfs {
        Array[File]+ inputVcfIndexes
        String outputVcfPath = "out.vcf.gz"

+        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater
+
        String javaXmx = "4G"
        String memory = "5G"
        Int timeMinutes = 1 + ceil(size(inputVcfs, "G") * 2)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -566,6 +576,10 @@ task GatherVcfs {
        mkdir -p "$(dirname ~{outputVcfPath})"
        picard -Xmx~{javaXmx} -XX:ParallelGCThreads=1 \
        GatherVcfs \
+        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \
+        CREATE_INDEX=true \
        INPUT=~{sep=' INPUT=' inputVcfs} \
        OUTPUT=~{outputVcfPath}
    }
@@ -590,6 +604,10 @@ task GatherVcfs {
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

+        compressionLevel: {description: "The compression level at which the output VCF file is written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflator to compress the output VCF file. False uses the optimized intel deflater.", category: "advanced"}
+
        # outputs
        outputVcf: {description: "Multiple VCF files gathered into one file."}
    }
@@ -601,14 +619,11 @@ task MarkDuplicates {
        Array[File]+ inputBams
        String outputBamPath
        String metricsPath
-        Int compressionLevel = 1
        Boolean createMd5File = false
-        Boolean useJdkInflater = true  # Slightly faster than the intel one.
-        # Better results for compression level 1 (much smaller).
-        # Higher compression levels similar to intel deflater.
-        # NOTE: this might change in the future when the intel
-        # deflater is updated!
-        Boolean useJdkDeflater = true
+        
+        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater

        # The program default for READ_NAME_REGEX is appropriate in nearly every case.
        # Sometimes we wish to supply "null" in order to turn off optical duplicate detection.
@@ -622,7 +637,7 @@ task MarkDuplicates {
        String memoryMb = javaXmxMb + 512

        Int timeMinutes = 1 + ceil(size(inputBams, "G") * 8)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    # Task is assuming query-sorted input so that the Secondary and Supplementary reads get
@@ -638,6 +653,8 @@ task MarkDuplicates {
        OUTPUT=~{outputBamPath} \
        METRICS_FILE=~{metricsPath} \
        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \
        VALIDATION_STRINGENCY=SILENT \
        ~{"READ_NAME_REGEX=" + read_name_regex} \
        OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
@@ -668,9 +685,9 @@ task MarkDuplicates {
        outputBamPath: {description: "The location where the ouptut BAM file should be written.", category: "required"}
        metricsPath: {description: "The location where the output metrics file should be written.", category: "required"}
        compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"}
-        createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"}
        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
        useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"}
+        createMd5File: {description: "Whether to create a md5 file for the created BAM file.", category: "advanced"}
        read_name_regex: {description: "Equivalent to the `READ_NAME_REGEX` option of MarkDuplicates.", category: "advanced"}
        javaXmxMb: {description: "The maximum memory available to the program in megabytes. Should be lower than `memoryMb` to accommodate JVM overhead.", category: "advanced"}
        memoryMb: {description: "The amount of memory this job will use in megabytes.", category: "advanced"}
@@ -692,16 +709,20 @@ task MergeVCFs {
        Array[File]+ inputVCFsIndexes
        String outputVcfPath
        Int compressionLevel = 1
-        Boolean useJdkInflater = true # Slightly faster than the intel one.
+        Boolean useJdkInflater = false
        # Better results for compression level 1 (much smaller).
        # Higher compression levels similar to intel deflater.
        # NOTE: this might change in the future when the intel deflater is updated!
-        Boolean useJdkDeflater = true
+        # Second NOTE: No, it did not change. Only the fastest algorithm, which
+        # has worse compression, is wrapped in the Intel GKL, instead of one of
+        # the slightly slower but better-compressing alternatives from ISA-L
+        # (which are also faster than zlib).
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater

        String javaXmx = "4G"
        String memory = "5G"
        Int timeMinutes = 1 + ceil(size(inputVCFs, "G")) * 2
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    # Using MergeVcfs instead of GatherVcfs so we can create indices.
@@ -757,7 +778,7 @@ task SamToFastq {
        String javaXmx = "16G" # High memory default to avoid crashes.
        String memory = "17G"
        Int timeMinutes = 30
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"

        File? noneFile
    }
@@ -818,7 +839,7 @@ task ScatterIntervalList {

        String javaXmx = "3G"
        String memory = "4G"
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -853,13 +874,15 @@ task SortSam {
        Boolean createMd5File = false
        Int maxRecordsInRam = 500000
        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater

        # Default ram of 4 GB. Using 125001.0  to prevent an answer of
        # 4.000000001 which gets rounded to 5.
        # GATK Best practices uses 75000 here: https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/d2934ed656ade44801f9cfe1c0e78d4f80684b7b/PairedEndSingleSampleWf-fc-hg38.wdl#L778
        Int XmxGb = ceil(maxRecordsInRam / 125001.0)
        Int timeMinutes = 1 + ceil(size(inputBam, "G") * 3)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -872,6 +895,8 @@ task SortSam {
        SORT_ORDER=~{true="queryname" false="coordinate" sortByName} \
        CREATE_INDEX=true \
        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater} \
        VALIDATION_STRINGENCY=SILENT \
        CREATE_MD5_FILE=~{true="true" false="false" createMd5File}

@@ -896,7 +921,9 @@ task SortSam {
        sortByName: {description: "Sort the output file by name, default is position.", category: "advanced"}
        createMd5File: {description: "Whether to create an MD5 digest for any BAM or FASTQ files created.", category: "advanced"}
        maxRecordsInRam: {description: "This will specify the number of records stored in RAM before spilling to disk.", category: "advanced"}
-        compressionLevel: {description: "Compression level for all compressed files created.", category: "advanced"}
+        compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"}
        XmxGb: {description: "The maximum memory available to picard SortSam. Should be lower than `memory` to accommodate JVM overhead and BWA mem's memory usage.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
@@ -917,7 +944,7 @@ task SortVcf {
        String javaXmx = "8G"
        String memory = "9G"
        Int timeMinutes = 1 + ceil(size(vcfFiles, "G") * 5)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }


@@ -967,7 +994,7 @@ task RenameSample {
        String javaXmx = "8G"
        String memory = "9G"
        Int timeMinutes = 1 + ceil(size(inputVcf, "G") * 2)
-        String dockerImage = "quay.io/biocontainers/picard:2.23.8--0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -1014,11 +1041,13 @@ task UmiAwareMarkDuplicatesWithMateCigar {
        String tempdir = "temp"
        Boolean removeDuplicates = true
        String umiTagName = "RX"
-
+        Int compressionLevel = 1
+        Boolean useJdkInflater = false
+        Boolean useJdkDeflater = true  # Achieves much better compression rates than the intel deflater
        String javaXmx = "8G"
        String memory = "9G"
        Int timeMinutes = 360
-        String dockerImage = "quay.io/biocontainers/picard:2.25.7--hdfd78af_0"
+        String dockerImage = "quay.io/biocontainers/picard:2.26.10--hdfd78af_0"
    }

    command {
@@ -1034,6 +1063,9 @@ task UmiAwareMarkDuplicatesWithMateCigar {
        TMP_DIR=~{tempdir} \
        REMOVE_DUPLICATES=~{removeDuplicates} \
        CREATE_INDEX=true \
+        COMPRESSION_LEVEL=~{compressionLevel} \
+        USE_JDK_INFLATER=~{true="true" false="false" useJdkInflater} \
+        USE_JDK_DEFLATER=~{true="true" false="false" useJdkDeflater}
    }

    output {
@@ -1058,6 +1090,9 @@ task UmiAwareMarkDuplicatesWithMateCigar {
        removeDuplicates: {description: "Whether the duplicate reads should be removed instead of marked.", category: "common"}
        umiTagName: {description: "Which tag in the BAM file holds the UMI.", category: "common"}
        tempdir: {description: "Temporary directory.", category: "advanced"}
+        compressionLevel: {description: "The compression level at which the BAM files are written.", category: "advanced"}
+        useJdkInflater: {description: "True, uses the java inflater. False, uses the optimized intel inflater.", category: "advanced"}
+        useJdkDeflater: {description: "True, uses the java deflator to compress the BAM files. False uses the optimized intel deflater.", category: "advanced"}
        memory: {description: "The amount of memory this job will use.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}