Skip to content
Snippets Groups Projects
Commit 3961ab4e authored by Ruben Vorderman's avatar Ruben Vorderman
Browse files

Allow a custom separator char

parent 44781d17
No related branches found
No related tags found
No related merge requests found
......@@ -29,6 +29,7 @@ task BamReadNameToUmiTag {
File inputBam
String outputPath = "output.bam"
String umiTag = "RX"
String separatorChar = "_"
String memory = "2GiB"
Int timeMinutes = 1 + ceil(size([inputBam], "GiB") * 10)
......@@ -45,26 +46,26 @@ task BamReadNameToUmiTag {
from typing import Tuple
def split_umi_from_name(name, separator_char="_") -> Tuple[str, str]:
    """Split the trailing UMI off the id part of a read name.

    The read name is first split on its first run of whitespace. The part
    before the whitespace is the read id; anything after it is kept as-is.
    The UMI is everything after the LAST occurrence of ``separator_char``
    in the read id.

    Example: ``"read1_ACGT 1:N:0"`` -> ``("read1 1:N:0", "ACGT")``.

    :param name: The full read name, optionally followed by whitespace and
        extra fields.
    :param separator_char: The string separating the read id from the UMI.
        Defaults to ``"_"``. May be longer than one character.
    :return: A tuple of (read name without the UMI, UMI).
    :raises ValueError: If ``separator_char`` is empty or does not occur in
        the read id (this includes an empty read name).
    """
    id_and_rest = name.split(maxsplit=1)
    # An empty or whitespace-only name yields an empty list.
    read_id = id_and_rest[0] if id_and_rest else ""
    # If there was no whitespace id_and_rest will have length 1
    other_parts = id_and_rest[1] if len(id_and_rest) == 2 else ""
    # str.rfind returns -1 when the separator is absent. Slicing with that
    # value would silently drop the last character of the read id and
    # report the whole id as the UMI, so fail loudly instead.
    separator_index = read_id.rfind(separator_char) if separator_char else -1
    if separator_index == -1:
        raise ValueError(
            "Separator {!r} not found in read id of read name {!r}".format(
                separator_char, name))
    # Skip the full separator, not just one character, so separators
    # longer than one character do not leak into the UMI.
    umi = read_id[separator_index + len(separator_char):]
    new_id = read_id[:separator_index]
    if other_parts:
        return " ".join([new_id, other_parts]), umi
    return new_id, umi
def annotate_umis(in_file, out_file, bam_tag="RX"):
def annotate_umis(in_file, out_file, bam_tag="RX", separator_char = "_"):
in_bam = pysam.AlignmentFile(in_file, "rb")
os.makedirs(os.path.dirname(out_file), exist_ok=True)
out_bam = pysam.AlignmentFile(out_file, "wb", template=in_bam)
# Encode bam_tag as bytes. Otherwise pysam converts it to bytes anyway.
encoded_bam_tag = bam_tag.encode('ascii')
for segment in in_bam: # type: pysam.AlignedSegment
new_name, umi = split_umi_from_name(segment.query_name)
new_name, umi = split_umi_from_name(segment.query_name, separator_char)
segment.query_name = new_name
# Encode umi as ascii. Otherwise pysam encodes it to bytes anyway.
# Value type has to be a string though, otherwise pysam crashes.
......@@ -72,7 +73,7 @@ task BamReadNameToUmiTag {
out_bam.write(segment)
if __name__ == "__main__":
annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}")
annotate_umis("~{inputBam}", "~{outputPath}", "~{umiTag}", "~{separatorChar}")
pysam.index("~{outputPath}", "~{bamIndexPath}", b=True)
CODE
>>>
......@@ -93,6 +94,7 @@ task BamReadNameToUmiTag {
inputBam: {description: "The input BAM file.", category: "required"}
outputPath: {description: "Output directory path + output file.", category: "common"}
umiTag: {description: "The tag used for UMIs in the output BAM file.", category: "common"}
separatorChar: {description: "Character used to separate the UMIs from the read name.", category: "common"}
memory: {description: "The amount of memory available to the job.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment