From 8ef4a68a21e029f0dbdef5475b160d14ce54833a Mon Sep 17 00:00:00 2001 From: "J.F.J. Laros" <j.f.j.laros@lumc.nl> Date: Thu, 14 Aug 2014 15:01:15 +0200 Subject: [PATCH] Added weights for variants. --- mutalyzer/describe.py | 64 +++++++++++++++++++++++++++++-------------- mutalyzer/variant.py | 23 ++++++++++++++-- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py index 91d072f0..520322ba 100644 --- a/mutalyzer/describe.py +++ b/mutalyzer/describe.py @@ -181,16 +181,28 @@ def find_fs(peptide, alternative_peptide, fs): #find_fs -def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): +def var_to_rawvar(s1, s2, var, weight_position, seq_list=[], container=DNAVar): """ """ # Unknown. if s1 == '?' or s2 == '?': return [container(type="unknown")] + + ins_length = var.sample_end - var.sample_start + weight = 0 + + if seq_list: + inserted = seq_list + weight = seq_list.weight() + else: + inserted = ISeqList([ISeq( + sequence=s2[var.sample_start:var.sample_end], + weight=ins_length)]) + + # Insertion / Duplication. if var.reference_start == var.reference_end: - ins_length = var.sample_end - var.sample_start shift5, shift3 = roll(s2, var.sample_start + 1, var.sample_end) shift = shift5 + shift3 @@ -199,6 +211,7 @@ def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): var.sample_start += shift3 var.sample_end += shift3 + # FIXME: range can be a duplication. if not seq_list and (var.sample_start - ins_length >= 0 and s1[var.reference_start - ins_length:var.reference_start] == s2[var.sample_start:var.sample_end]): @@ -207,14 +220,16 @@ def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): end=var.reference_end, type="dup", shift=shift, sample_start=var.sample_start + 1, sample_end=var.sample_end, inserted=ISeqList([ISeq(sequence=s2[ - var.sample_start:var.sample_end])])) + var.sample_start:var.sample_end])]), weight=var.weight) #if + + weight += (2 * weight_position + extractor.WEIGHT_SEPARATOR + + extractor.WEIGHT_INSERTION) + return container(start=var.reference_start, - end=var.reference_start + 1, - inserted=seq_list or - ISeqList([ISeq(sequence=s2[var.sample_start:var.sample_end])]), - type="ins", shift=shift, sample_start=var.sample_start + 1, - sample_end=var.sample_end) + end=var.reference_start + 1, inserted=inserted, type="ins", + shift=shift, sample_start=var.sample_start + 1, + sample_end=var.sample_end, weight=weight) #if # Deletion. @@ -228,7 +243,7 @@ def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): return container(start=var.reference_start + 1, end=var.reference_end, type="del", shift=shift, sample_start=var.sample_start, sample_end=var.sample_end + 1, - deleted=ISeqList([ISeq(sequence=s1[ + weight=var.weight, deleted=ISeqList([ISeq(sequence=s1[ var.reference_start:var.reference_end])])) #if @@ -238,7 +253,7 @@ def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): return container(start=var.reference_start + 1, end=var.reference_end, sample_start=var.sample_start + 1, - sample_end=var.sample_end, type="subst", + sample_end=var.sample_end, type="subst", weight=var.weight, deleted=ISeqList([ISeq(sequence=s1[var.reference_start])]), inserted=ISeqList([ISeq(sequence=s2[var.sample_start])])) #if @@ -258,16 +273,19 @@ def var_to_rawvar(s1, s2, var, seq_list=[], container=DNAVar): deleted=ISeqList([ISeq(sequence=s1[ var.reference_start:var.reference_end])]), inserted=ISeqList([ISeq(sequence=s2[ - var.sample_start:var.reference_end])])) + var.sample_start:var.reference_end])]), weight=var.weight) #if # InDel. + weight += weight_position + extractor.WEIGHT_DELETION_INSERTION + if var.reference_start + 1 == var.reference_end: + weight += weight_position + extractor.WEIGHT_SEPARATOR + return container(start=var.reference_start + 1, end=var.reference_end, deleted=ISeqList([ISeq(sequence=s1[ - var.reference_start:var.reference_end])]), inserted=seq_list or - ISeqList([ISeq(sequence=s2[var.sample_start:var.sample_end])]), + var.reference_start:var.reference_end])]), inserted=seq_list, type="delins", sample_start=var.sample_start + 1, - sample_end=var.sample_end) + sample_end=var.sample_end, weight=var.weight) #var_to_rawvar def describe_dna(s1, s2): @@ -285,7 +303,8 @@ def describe_dna(s1, s2): description = Allele() in_transposition = 0 - for variant in extractor.extract(str(s1), len(s1), str(s2), len(s2), 0): + variant_extract = extractor.extract(str(s1), len(s1), str(s2), len(s2), 0) + for variant in variant_extract.variants: print (variant.type, variant.reference_start, variant.reference_end, variant.sample_start, variant.sample_end, variant.transposition_start, @@ -302,22 +321,27 @@ def describe_dna(s1, s2): if in_transposition: if variant.type & extractor.IDENTITY: seq_list.append(ISeq(start=variant.transposition_start + 1, - end=variant.transposition_end, reverse=False)) + end=variant.transposition_end, weight=variant.weight, + reverse=False)) elif variant.type & extractor.REVERSE_COMPLEMENT: seq_list.append(ISeq(start=variant.transposition_start + 1, - end=variant.transposition_end, reverse=True)) + end=variant.transposition_end, weight=variant.weight, + reverse=True)) else: seq_list.append(ISeq( - sequence=s2[variant.sample_start:variant.sample_end])) + sequence=s2[variant.sample_start:variant.sample_end], + weight=variant.weight)) #if elif not (variant.type & extractor.IDENTITY): - description.append(var_to_rawvar(s1, s2, variant)) + description.append(var_to_rawvar(s1, s2, variant, + variant_extract.weight_position)) if variant.type & extractor.TRANSPOSITION_CLOSE: in_transposition -= 1 if not in_transposition: - description.append(var_to_rawvar(s1, s2, variant, seq_list)) + description.append(var_to_rawvar(s1, s2, variant, + variant_extract.weight_position, seq_list)) #if #for diff --git a/mutalyzer/variant.py b/mutalyzer/variant.py index 5db00642..de8187d5 100644 --- a/mutalyzer/variant.py +++ b/mutalyzer/variant.py @@ -1,6 +1,7 @@ """ """ +from extractor.extractor import WEIGHT_SEPARATOR from mutalyzer import models class HGVSList(list): @@ -12,6 +13,13 @@ class HGVSList(list): return "[{}]".format(';'.join(map(str, self))) return str(self[0]) #__str__ + + def weight(self): + W = sum(map(lambda x: x.weight, self)) + + if len(self) > 1: + return W + (len(self) + 1) * WEIGHT_SEPARATOR + return W #HGVSList class Allele(HGVSList): @@ -24,7 +32,7 @@ class ISeq(object): """ Container for an inserted sequence. """ - def __init__(self, sequence="", start=0, end=0, reverse=False): + def __init__(self, sequence="", start=0, end=0, weight=0, reverse=False): """ :arg sequence: Literal inserted sequence. :type sequence: str @@ -32,12 +40,15 @@ class ISeq(object): :type start: int :arg end: End position for a transposed sequence. :type end: int + :arg weight: Weight of the variant (normalised length). + :type weight: int :arg reverse: Inverted transposed sequence. :type reverse: bool """ self.sequence = sequence self.start = start self.end = end + self.weight = weight self.reverse = reverse self.type = "trans" @@ -64,7 +75,7 @@ class DNAVar(models.DNAVar): def __init__(self, start=0, start_offset=0, end=0, end_offset=0, sample_start=0, sample_start_offset=0, sample_end=0, sample_end_offset=0, type="none", deleted=ISeqList([ISeq()]), - inserted=ISeqList([ISeq()]), shift=0): + inserted=ISeqList([ISeq()]), weight=0, shift=0): """ Initialise the class with the appropriate values. @@ -90,6 +101,8 @@ class DNAVar(models.DNAVar): :type deleted: str :arg inserted: Inserted part. :type inserted: object + :arg weight: Weight of the variant (normalised length). + :type weight: int :arg shift: Amount of freedom. :type shift: int """ @@ -106,6 +119,7 @@ class DNAVar(models.DNAVar): self.type = type self.deleted = deleted self.inserted = inserted + self.weight = weight self.shift = shift #__init__ @@ -144,7 +158,7 @@ class ProteinVar(models.ProteinVar): """ def __init__(self, start=0, end=0, sample_start=0, sample_end=0, type="none", deleted=ISeqList([ISeq()]), - inserted=ISeqList([ISeq()]), shift=0, term=0): + inserted=ISeqList([ISeq()]), weight=0, shift=0, term=0): """ Initialise the class with the appropriate values. @@ -162,6 +176,8 @@ class ProteinVar(models.ProteinVar): :type deleted: str :arg inserted: Inserted part. :type inserted: object + :arg weight: Weight of the variant (normalised length). + :type weight: int :arg shift: Amount of freedom. :type shift: int :arg term: @@ -174,6 +190,7 @@ class ProteinVar(models.ProteinVar): self.type = type self.deleted = deleted self.inserted = inserted + self.weight = weight self.shift = shift self.term = term #__init__ -- GitLab