From 939200521caed7a44a2398f4145ac48a17ec10f9 Mon Sep 17 00:00:00 2001
From: "J.F.J. Laros" <j.f.j.laros@lumc.nl>
Date: Sat, 18 Aug 2012 17:57:25 +0000
Subject: [PATCH] Added functionality for generating complex protein
 descriptions.

describe.py:
- Added functionality to detect frame shifts.
- Modified the RawVar class to support protein descriptions.
- Added the core protein description function.

models.py:
- Modified the RawVar class to support protein descriptions.


git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@595 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
---
 mutalyzer/describe.py | 344 ++++++++++++++++++++++++++++++++++++++++--
 mutalyzer/models.py   |   5 +
 mutalyzer/website.py  |   2 +-
 3 files changed, 341 insertions(+), 10 deletions(-)

diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py
index 48f084e0..644573a7 100644
--- a/mutalyzer/describe.py
+++ b/mutalyzer/describe.py
@@ -7,7 +7,11 @@ leading from one sequence to an other.
 @requires: Bio.Seq
 """
 
+import collections
 import Bio.Seq
+from Bio.SeqUtils import seq3
+from Bio.Data import CodonTable
+
 from mutalyzer.util import longest_common_prefix, longest_common_suffix
 from mutalyzer.util import palinsnoop, roll
 from mutalyzer import models
@@ -100,6 +104,108 @@ def LongestCommonSubstring(s1, s2):
     return x_longest, y_longest, longest
 #LongestCommonSubstring
 
+def makeFSTables(table_id):
+    """
+    For every pair of amino acids, calculate the set of possible amino acids in
+    a different reading frame. Do this for both alternative reading frames (+1
+    and +2).
+
+    @arg table_id: Coding table ID.
+    @type table_id: int
+    @returns: Two dictionaries for the two alternative reading frames.
+    @rtype: tuple(dict, dict)
+    """
+    # Make the forward translation table.
+    table = dict(CodonTable.unambiguous_dna_by_id[table_id].forward_table)
+    for i in CodonTable.unambiguous_dna_by_id[table_id].stop_codons:
+        table[i] = '*'
+
+    # Make the reverse translation table.
+    reverse_table = collections.defaultdict(list)
+    for i in table:
+        reverse_table[table[i]].append(i)
+
+    # Make the frame shift tables.
+    FS1 = collections.defaultdict(set)
+    FS2 = collections.defaultdict(set)
+    for AA_i in reverse_table:
+        for AA_j in reverse_table:
+            for codon_i in reverse_table[AA_i]:
+                for codon_j in reverse_table[AA_j]:
+                    FS1[AA_i + AA_j].add(table[(codon_i + codon_j)[1:4]]) # +1.
+                    FS2[AA_i + AA_j].add(table[(codon_i + codon_j)[2:5]]) # +2.
+                #for
+    return FS1, FS2
+#makeFSTables
+
+def __makeOverlaps(peptide):
+    """
+    Make a list of overlapping 2-mers of {peptide} in order of appearance.
+
+    @arg peptide: A peptide sequence.
+    @type peptide: str
+    @returns: All 2-mers of {peptide} in order of appearance.
+    @rtype: list(str)
+    """
+    return map(lambda x: peptide[x:x+2], range(len(peptide) - 1))
+#__makeOverlaps
+
+def __options(pList, peptidePrefix, FS, output):
+    """
+    Enumerate all peptides that could result from a frame shift.
+
+    @arg pList: List of overlapping 2-mers of a peptide.
+    @type pList: list(str)
+    @arg peptidePrefix: Prefix of a peptide in the alternative reading frame.
+    @type peptidePrefix: str
+    @arg FS: Frame shift table.
+    @type FS: dict
+    @arg output: List of peptides, should be empty initially.
+    @type output: list(str)
+    """
+    if not pList:
+        output.append(peptidePrefix)
+        return
+    #if
+    for i in FS[pList[0]]:
+        __options(pList[1:], peptidePrefix + i, FS, output)
+#__options
+
+def enumFS(peptide, FS):
+    """
+    Enumerate all peptides that could result from a frame shift.
+
+    @arg peptide: Original peptide sequence.
+    @type peptide: str
+    @arg FS: Frame shift table.
+    @type FS: dict
+    """
+    output = []
+
+    __options(__makeOverlaps(peptide), "", FS, output)
+    return output
+#enumFS
+
+def fitFS(peptide, altPeptide, FS):
+    """
+    Check whether peptide {altPeptide} is a possible frame shift of peptide
+    {peptide}.
+
+    @arg peptide: Original peptide sequence.
+    @type peptide: str
+    @arg altPeptide: Observed peptide sequence.
+    @type altPeptide: str
+    @arg FS: Frame shift table.
+    @type FS: dict
+    """
+    pList = __makeOverlaps(peptide)
+
+    for i in range(len(altPeptide)):
+        if not altPeptide[i] in FS[pList[i]]:
+            return False
+    return True
+#fitFS
+
 class RawVar(models.RawVar):
     """
     Container for a raw variant.
@@ -112,8 +218,9 @@ class RawVar(models.RawVar):
       retuned, resulting in a description like: 100_100A>T
     """
 
-    def __init__(self, start=0, start_offset=0, end=0, end_offset=0,
-        type="none", deleted="", inserted="", shift=0):
+    def __init__(self, DNA=True, start=0, start_offset=0, end=0, end_offset=0,
+        type="none", deleted="", inserted="", shift=0, startAA="", endAA="",
+        term=0):
         """
         Initialise the class with the appropriate values.
 
@@ -136,6 +243,7 @@ class RawVar(models.RawVar):
         """
         # TODO: Will this container be used for all variants, or only genomic?
         #       start_offset and end_offset may be never used.
+        self.DNA = DNA
         self.start = start
         self.start_offset = start_offset
         self.end = end
@@ -144,10 +252,14 @@ class RawVar(models.RawVar):
         self.deleted = deleted
         self.inserted = inserted
         self.shift = shift
+        self.startAA = startAA
+        self.endAA = endAA
+        self.term = term
         self.hgvs = self.description()
+        self.hgvsLength = self.descriptionLength()
     #__init__
 
-    def description(self):
+    def __DNADescription(self):
         """
         Give the HGVS description of the raw variant stored in this class.
 
@@ -174,9 +286,45 @@ class RawVar(models.RawVar):
         #if
 
         return descr + "%s>%s" % (self.deleted, self.inserted)
-    #description
+    #__DNADescription
 
-    def descriptionLength(self):
+    def __proteinDescription(self):
+        """
+        Give the HGVS description of the raw variant stored in this class.
+
+        Note that this function relies on the absence of values to make the
+        correct description. Also see the comment in the class definition.
+
+        @returns: The HGVS description of the raw variant stored in this class.
+        @rtype: str
+        """
+        if not self.start:
+            return "="
+
+        descr = ""
+        if not self.deleted:
+            if self.type == "ext":
+                descr += '*'
+            else:
+                descr += "%s" % seq3(self.startAA)
+        #if
+        else:
+            descr += "%s" % seq3(self.deleted)
+        descr += "%i" % self.start
+        if self.end:
+            descr += "_%s%i" % (seq3(self.endAA), self.end)
+        if self.type not in ["subst", "stop", "ext", "fs"]:
+            descr += self.type
+        if self.inserted:
+            descr += "%s" % seq3(self.inserted)
+        if self.type == "stop":
+            return descr + '*'
+        if self.term:
+            return descr + "%s*%i" % (self.type, self.term)
+        return descr
+    #__proteinDescription
+
+    def __DNADescriptionLength(self):
         """
         Give the standardised length of the HGVS description of the raw variant
         stored in this class.
@@ -191,7 +339,7 @@ class RawVar(models.RawVar):
         if not self.start : # =
             return 1
 
-        descrLen = 1 # Start position
+        descrLen = 1 # Start position.
 
         if self.end : # '_' and end position.
             descrLen += 2
@@ -205,6 +353,61 @@ class RawVar(models.RawVar):
         #if
 
         return 4 # Start position, '>' and end position.
+    #__DNAdescriptionLength
+
+    def __proteinDescriptionLength(self):
+        """
+        Give the standardised length of the HGVS description of the raw variant
+        stored in this class.
+
+        Note that this function relies on the absence of values to make the
+        correct description. Also see the comment in the class definition.
+
+        @returns: The standardised length of the HGVS description of the raw
+            variant stored in this class.
+        @rtype: int
+        """
+        if not self.start: # =
+            return 1
+
+        descrLen = 1      # Start position.
+        if not self.deleted and self.type == "ext":
+            descrLen += 1 # *
+        else:
+            descrLen += 3 # One amino acid.
+        if self.end:
+            descrLen += 5 # `_' + one amino acid + end position.
+        if self.type not in ["subst", "stop", "ext", "fs"]:
+            descrLen += len(self.type)
+        if self.inserted:
+            descrLen += 3 * len(self.inserted)
+        if self.type == "stop":
+            return descrLen + 1 # *
+        if self.term:
+            return descrLen + len(self.type) + 2 # `*' + length until stop.
+        return descrLen
+    #__proteinDescriptionLength
+
+    def description(self):
+        """
+        """
+        if self.DNA:
+            return self.__DNADescription()
+        return self.__proteinDescription()
+    #description
+
+    def descriptionLength(self):
+        """
+        Give the standardised length of the HGVS description of the raw variant
+        stored in this class.
+
+        @returns: The standardised length of the HGVS description of the raw
+            variant stored in this class.
+        @rtype: int
+        """
+        if self.DNA:
+            return self.__DNADescriptionLength()
+        return self.__proteinDescriptionLength()
     #descriptionLength
 #RawVar
 
@@ -233,7 +436,8 @@ def alleleDescriptionLength(allele):
     @returns: The standardised length of the HGVS description of {allele}.
     @rval: int
     """
-    return sum(map(lambda x : x.descriptionLength(), allele))
+    # NOTE: Do we need to count the ; and [] ?
+    return sum(map(lambda x : x.hgvsLength, allele))
 #alleleDescriptionLength
 
 def printpos(s, start, end, fill = 0):
@@ -400,7 +604,127 @@ def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end):
     return default
 #DNA_description
 
-def describeDNA(original, mutated):
+def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end):
+    """
+    Give an allele description of the change from {s1} to {s2} in the range
+    {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}.
+
+    arg s1: Sequence 1.
+    type s1: str
+    arg s2: Sequence 2.
+    type s2: str
+    arg s1_start: Start of the range on {s1}.
+    type s1_start: int
+    arg s1_end: End of the range on {s1}.
+    type s1_end: int
+    arg s2_start: Start of the range on {s2}.
+    type s2_start: int
+    arg s2_end: End of the range on {s2}.
+    type s2_end: int
+
+    @returns: A list of RawVar objects, representing the allele.
+    @rval: list(RawVar)
+    """
+    # Nothing happened.
+    if s1 == s2:
+        return [RawVar(DNA=False)]
+
+    # Substitution.
+    if s1_start + 1 == s1_end and  s2_start + 1 == s2_end:
+        return [RawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start],
+            inserted=s2[s2_start], type="subst")]
+
+    # Insertion / Duplication / Extention.
+    if s1_start == s1_end:
+        if len(s1) == s1_start + 1:
+            return [RawVar(DNA=False, start=s1_start + 1,
+                inserted=s2[s2_start], term=abs(len(s1) - len(s2)),
+                type="ext")]
+        ins_length = s2_end - s2_start
+        shift5, shift3 = roll(s2, s2_start + 1, s2_end)
+        shift = shift5 + shift3
+
+        s1_start += shift3
+        s1_end += shift3
+        s2_start += shift3
+        s2_end += shift3
+
+        if s2_start - ins_length >= 0 and \
+            s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end]:
+
+            if ins_length == 1:
+                return [RawVar(DNA=False, start=s1_start,
+                    startAA=s1[s1_start - 1], type="dup", shift=shift)]
+            return [RawVar(DNA=False, start=s1_start - ins_length + 1,
+                startAA=s1[s1_start - ins_length], end=s1_end,
+                endAA=s1[s1_end - 1], type="dup", shift=shift)]
+        #if
+        return [RawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1],
+            end=s1_start + 1, endAA=s1[s1_end], inserted=s2[s2_start:s2_end],
+            type="ins", shift=shift)]
+    #if
+
+    # Deletion / Inframe stop.
+    if s2_start == s2_end:
+        if len(s2) == s2_end + 1:
+            return [RawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start],
+                type="stop")]
+        shift5, shift3 = roll(s1, s1_start + 1, s1_end)
+        shift = shift5 + shift3
+
+        s1_start += shift3 + 1
+        s1_end += shift3
+
+        if s1_start == s1_end:
+            return [RawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1],
+                 type="del", shift=shift)]
+        return [RawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1],
+            end=s1_end, endAA=s1[s1_end - 1], type="del", shift=shift)]
+    #if
+
+    # Simple InDel.
+    if s1_start + 1 == s1_end:
+        return [RawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start],
+            inserted=s2[s2_start:s2_end], type="delins")]
+
+    # Frameshift.
+    if len(s1) == s1_end + 1 and len(s2) == s2_end + 1:
+        # TODO: the FS tables should be made for all coding tables in advance.
+        FS1, FS2 = makeFSTables(1) # Standard coding table.
+
+        if (fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS1) or
+            fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS2)):
+            return [RawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start],
+                inserted=s2[s2_start], term=len(s2) - s2_start, type="fs")]
+    #if
+
+    # At this point, we have an indel.
+    s1_end_f, s2_end_f, lcs_f_len = LongestCommonSubstring(s1[s1_start:s1_end],
+        s2[s2_start:s2_end])
+
+    # InDel.
+    default = [RawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start],
+        end=s1_end, endAA=s1[s1_end], inserted=s2[s2_start:s2_end],
+        type="delins")]
+
+    if not lcs_f_len : # Optimisation, not really needed.
+        return default
+
+    r1_len = s1_end_f - lcs_f_len
+    r2_len = s1_end - s1_start - s1_end_f
+    m1_len = s2_end_f - lcs_f_len
+    m2_len = s2_end - s2_start - s2_end_f
+
+    partial = protein_description(M, s1, s2, s1_start, s1_start + r1_len,
+        s2_start, s2_start + m1_len) + protein_description(M, s1, s2,
+        s1_end - r2_len, s1_end, s2_end - m2_len, s2_end)
+
+    if alleleDescriptionLength(partial) <= alleleDescriptionLength(default):
+        return partial
+    return default
+#protein_description
+
+def describe(original, mutated, DNA=True):
     """
     Convenience function for DNA_description().
 
@@ -421,5 +745,7 @@ def describeDNA(original, mutated):
 
     M = LCSMatrix(s1, s2)
 
+    if not DNA:
+        return protein_description(M, s1, s2, lcp, s1_end, lcp, s2_end)
     return DNA_description(M, s1, s2, lcp, s1_end, lcp, s2_end)
-#describeDNA
+#describe
diff --git a/mutalyzer/models.py b/mutalyzer/models.py
index fd847d30..9f5622a6 100644
--- a/mutalyzer/models.py
+++ b/mutalyzer/models.py
@@ -93,6 +93,7 @@ class RawVar(ComplexModel):
     """
     __namespace__ = SOAP_NAMESPACE
 
+    DNA = Mandatory.Boolean
     start = Mandatory.Integer
     start_offset = Mandatory.Integer
     end = Mandatory.Integer
@@ -101,7 +102,11 @@ class RawVar(ComplexModel):
     deleted = Mandatory.String
     inserted = Mandatory.String
     shift = Mandatory.Integer
+    startAA = Mandatory.String
+    endAA = Mandatory.String
+    term = Mandatory.Integer
     hgvs = Mandatory.String
+    hgvsLength = Mandatory.Integer
 #RawVar
 
 
diff --git a/mutalyzer/website.py b/mutalyzer/website.py
index 72929978..f7d706ce 100644
--- a/mutalyzer/website.py
+++ b/mutalyzer/website.py
@@ -919,7 +919,7 @@ class DescriptionExtractor:
             output.addMessage(__file__, 3, "ENODNA",
                 "Variant sequence is not DNA.")
 
-        result = describe.describeDNA(referenceSeq, variantSeq)
+        result = describe.describe(referenceSeq, variantSeq)
         description = describe.alleleDescription(result)
 
         errors, warnings, summary = output.Summary()
-- 
GitLab