version 1.0

# Copyright (c) 2022 Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

task BamReadNameToUmiTag {

    # This task processes a BAM file whose reads have been extracted with
    # umi-tools extract. The UMI is split off the end of each read ID (the part
    # after the last separatorChar), removed from the read name and stored in
    # the umiTag tag (default RX). The output BAM file is indexed afterwards.
    input {
        File inputBam
        String outputPath = "output.bam"
        String umiTag = "RX"
        String separatorChar = "_"

        String memory = "2GiB"
        Int timeMinutes = 1 + ceil(size([inputBam], "GiB") * 10)
        String dockerImage = "quay.io/biocontainers/pysam:0.17.0--py39h051187c_0"
    }

    # The index is written next to the output BAM file: the ".bam" suffix of
    # outputPath is replaced with ".bai".
    String bamIndexPath = sub(select_first([outputPath]), "\.bam$", ".bai")

    command <<<
        python <<CODE
        # NOTE: this script lives in an unquoted shell heredoc, so dollar
        # signs, backticks and backslashes must be avoided in the code below.
        import pysam
        import sys
        import os

        from typing import Tuple

        def split_umi_from_name(name, separator_char = "_") -> Tuple[str, str]:
            """
            Split the UMI off a read name.

            The read ID is the part of the name before the first whitespace.
            Everything after the last separator in the read ID is the UMI.

            :param name: The full read name, optionally with a description
                         after whitespace.
            :param separator_char: The separator between read ID and UMI.
            :return: A tuple of (read name without the UMI, UMI).
            :raises ValueError: When the separator is not in the read ID.
            """
            id_and_rest = name.split(maxsplit=1)
            id = id_and_rest[0]
            # If there was no whitespace id_and_rest will have length 1
            other_parts = id_and_rest[1] if len(id_and_rest) == 2 else ""
            new_id, separator, umi = id.rpartition(separator_char)
            if not separator:
                # Without this check a missing separator would silently
                # produce a mangled read name and a bogus UMI.
                raise ValueError(
                    "Separator {!r} not found in read name {!r}: cannot "
                    "extract the UMI.".format(separator_char, name))
            if other_parts:
                return " ".join([new_id, other_parts]), umi
            return new_id, umi

        def annotate_umis(in_file, out_file, bam_tag="RX", separator_char = "_"):
            """
            Copy in_file to out_file while moving the UMI of every read from
            the read name to the bam_tag tag.

            :param in_file: Path to the input BAM file.
            :param out_file: Path to the output BAM file. Its parent directory
                             is created when it does not exist yet.
            :param bam_tag: The tag in which the UMI is stored.
            :param separator_char: The separator between read ID and UMI.
            """
            out_dir = os.path.dirname(out_file)
            # os.makedirs fails on an empty path, which is what dirname returns
            # for a bare filename such as the default "output.bam".
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            # Encode bam_tag as bytes. Otherwise pysam converts it to bytes anyway.
            encoded_bam_tag = bam_tag.encode('ascii')
            # The context managers guarantee that the output is flushed and
            # closed before it is indexed.
            with pysam.AlignmentFile(in_file, "rb") as in_bam, \
                    pysam.AlignmentFile(out_file, "wb", template=in_bam) as out_bam:
                for segment in in_bam:  # type: pysam.AlignedSegment
                    new_name, umi = split_umi_from_name(segment.query_name, separator_char)
                    segment.query_name = new_name
                    # Encode umi as ascii. Otherwise pysam encodes it to bytes anyway.
                    # Value type has to be a string though, otherwise pysam crashes.
                    segment.set_tag(encoded_bam_tag, umi.encode('ascii'), value_type="Z")
                    out_bam.write(segment)

        if __name__ == "__main__":
            annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}", "~{separatorChar}")
            pysam.index("~{outputPath}", "~{bamIndexPath}", b=True)
        CODE
    >>>

    output {
        File outputBam = outputPath
        File outputBamIndex = bamIndexPath
    }

    runtime {
        memory: memory
        time_minutes: timeMinutes
        docker: dockerImage
    }

    parameter_meta {
        # inputs
        inputBam: {description: "The input BAM file.", category: "required"}
        outputPath: {description: "Output directory path + output file.", category: "common"}
        umiTag: {description: "The tag used for UMIs in the output BAM file.", category: "common"}
        separatorChar: {description: "Character used to separate the UMIs from the read name.", category: "common"}

        memory: {description: "The amount of memory available to the job.", category: "advanced"}
        timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
        dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

        # outputs
        outputBam: {description: "Sorted BAM file."}
        outputBamIndex: {description: "Sorted BAM file index."}
    }
}