From 851e71fef32e4f4861a7609077ff02c97b069622 Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Tue, 22 Sep 2015 09:16:24 +0200
Subject: [PATCH] Visualise protein change, also with alternative start

In the case of an alternative start codon (in the reference CDS),
protein changes were not visualised. This is fixed and a WALTSTART
warning is also issued.

Also, if a new non-reference start codon is created by the variant,
visualise this as such.
---
 mutalyzer/util.py            |   4 --
 mutalyzer/variantchecker.py  | 124 +++++++++++++++++++++++------------
 tests/test_variantchecker.py |  93 ++++++++++++++++++++++++--
 3 files changed, 172 insertions(+), 49 deletions(-)

diff --git a/mutalyzer/util.py b/mutalyzer/util.py
index 4f95f643..16667d58 100644
--- a/mutalyzer/util.py
+++ b/mutalyzer/util.py
@@ -600,10 +600,6 @@ def protein_description(cds_stop, s1, s2):
     else:
         description = in_frame_description(s1, s2)
 
-    if not s2 or s1[0] != s2[0]:
-        # Mutation in start codon.
-        return 'p.?', description[1], description[2], description[3]
-
     return description
 #protein_description
 
diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py
index ca35dd49..a3bc7a8d 100644
--- a/mutalyzer/variantchecker.py
+++ b/mutalyzer/variantchecker.py
@@ -1329,6 +1329,30 @@ def _add_transcript_info(mutator, transcript, output):
 
     # Add protein prediction to output.
     if transcript.translate:
+        # Data added to the output object:
+        # - origCDS: Original CDS.
+        # - newCDS: Variant CDS.
+        # - oldprotein: Original protein sequence, ending with '*'.
+        # - newprotein:
+        #     - If variant CDS could not be translated, this is '?'.
+        #     - If start codon was affected, this is '?'.
+        #     - If variant protein equals original protein, this is unset.
+        #     - Otherwise, this is the variant protein sequence, ending with
+        #       '*' if a stop codon was found.
+        # - altStart:
+        #     - If variant CDS could be translated and variant created a new
+        #       start codon, this is the new start codon.
+        #     - Unset otherwise.
+        # - altProtein:
+        #     - If variant CDS could be translated and variant created a new
+        #       start codon, and variant protein does not equal original
+        #       protein, this is the variant protein sequence, ending with '*'
+        #       if a stop codon was found.
+        # - oldProteinFancy, newProteinFancy, altProteinFancy: Versions of the
+        #     protein sequences formatted for HTML.
+        # - oldProteinFancyText, newProteinFancyText, altProteinFancyText:
+        #     Versions of the protein sequences formatted for plaintext.
+
         cds_original = util.splice(mutator.orig, transcript.CDS.positionList)
         cds_original.alphabet = IUPAC.unambiguous_dna
 
@@ -1343,8 +1367,6 @@ def _add_transcript_info(mutator, transcript, output):
                                      transcript.CM.orientation)
         cds_variant.alphabet = IUPAC.unambiguous_dna
 
-        #output.addOutput('origCDS', cds_original)
-
         if transcript.CM.orientation == -1:
             cds_original = cds_original.reverse_complement()
             cds_variant = cds_variant.reverse_complement()
@@ -1361,8 +1383,17 @@ def _add_transcript_info(mutator, transcript, output):
                               'In frame stop codon found.')
             return
 
+        if not protein_original.startswith('M'):
+            protein_original = 'M' + protein_original[1:]
+            output.addMessage(__file__, 2, 'WALTSTART',
+                              'Reference protein translated from alternative '
+                              'start codon %s.' % (unicode(cds_original[:3])))
+
         protein_variant = cds_variant.translate(table=transcript.txTable)
 
+        if protein_variant:
+            protein_variant = 'M' + protein_variant[1:]
+
         # Up to and including the first '*', or the entire string.
         try:
             stop = unicode(protein_variant).index('*')
@@ -1370,8 +1401,6 @@ def _add_transcript_info(mutator, transcript, output):
         except ValueError:
             pass
 
-        # Note: addOutput('origCDS', ...) was first before the possible
-        #       reverse complement operation above.
         output.addOutput('origCDS', unicode(cds_original))
         output.addOutput("newCDS", unicode(cds_variant[:len(protein_variant) * 3]))
 
@@ -1381,34 +1410,43 @@ def _add_transcript_info(mutator, transcript, output):
         # website.py.
         # I think it would also be nice to include the mutated list of splice
         # sites.
-        if not protein_variant or unicode(protein_variant[0]) != 'M':
-            # Todo: Protein differences are not color-coded,
-            # use something like below in protein_description().
+
+        if not protein_variant or unicode(cds_variant[:3]) != unicode(cds_original[:3]):
+            # Could not translate variant CDS or variant hits start codon. In
+            # that case we predict p.? and see if a non-reference start codon
+            # was created.
             util.print_protein_html(unicode(protein_original), 0, 0,
                                     output, 'oldProteinFancy')
             util.print_protein_html(unicode(protein_original), 0, 0,
                                     output, 'oldProteinFancyText', text=True)
-            if unicode(cds_variant[0:3]) in \
-                   CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons:
-                output.addOutput('newprotein', '?')
-                util.print_protein_html('?', 0, 0, output, 'newProteinFancy')
-                util.print_protein_html('?', 0, 0, output,
-                    'newProteinFancyText', text=True)
-                output.addOutput('altStart', unicode(cds_variant[0:3]))
-                if unicode(protein_original[1:]) != unicode(protein_variant[1:]):
-                    output.addOutput('altProtein',
-                                     'M' + unicode(protein_variant[1:]))
-                    util.print_protein_html('M' + unicode(protein_variant[1:]), 0,
-                        0, output, 'altProteinFancy')
-                    util.print_protein_html('M' + unicode(protein_variant[1:]), 0,
-                        0, output, 'altProteinFancyText', text=True)
-            else :
-                output.addOutput('newprotein', '?')
-                util.print_protein_html('?', 0, 0, output, 'newProteinFancy')
-                util.print_protein_html('?', 0, 0, output,
-                    'newProteinFancyText', text=True)
+            output.addOutput('newprotein', '?')
+            util.print_protein_html('?', 0, 0, output, 'newProteinFancy')
+            util.print_protein_html('?', 0, 0, output,
+                'newProteinFancyText', text=True)
+
+            if protein_variant:
+                # Variant CDS could be translated, but start codon was
+                # affected.
+                start_codons = CodonTable.unambiguous_dna_by_id[
+                    transcript.txTable].start_codons
+
+                if unicode(cds_variant[0:3]) in start_codons:
+                    # A non-reference start codon was created.
+                    output.addOutput('altStart', unicode(cds_variant[0:3]))
+
+                    if unicode(protein_original) != unicode(protein_variant):
+                        # The resulting protein is actually different, so
+                        # visualise the difference.
+                        # Todo: Protein differences are not color-coded,
+                        # use something like below in protein_description().
+                        output.addOutput('altProtein', unicode(protein_variant))
+                        util.print_protein_html(unicode(protein_variant), 0,
+                            0, output, 'altProteinFancy')
+                        util.print_protein_html(unicode(protein_variant), 0,
+                            0, output, 'altProteinFancyText', text=True)
 
         else:
+            # Variant CDS was translated and start codon is unchanged.
             cds_length = util.cds_length(
                 mutator.shift_sites(transcript.CDS.positionList))
             descr, first, last_original, last_variant = \
@@ -1840,23 +1878,27 @@ def check_variant(description, output):
                     # So we manually translate the first codon to M. But only
                     # if it was not affected by the variant.
                     protein_variant = cds_variant.translate(table=transcript.txTable)
-                    if unicode(cds_variant[:3]) == unicode(cds_original[:3]):
+                    if protein_variant and unicode(cds_variant[:3]) == unicode(cds_original[:3]):
                         protein_variant = protein_original[0] + protein_variant[1:]
 
-                    # Up to and including the first '*', or the entire string.
-                    try:
-                        stop = unicode(protein_variant).index('*')
-                        protein_variant = protein_variant[:stop + 1]
-                    except ValueError:
-                        pass
-
-                    try:
-                        cds_length = util.cds_length(
-                            mutator.shift_sites(transcript.CDS.positionList))
-                        transcript.proteinDescription = util.protein_description(
-                            cds_length, unicode(protein_original), unicode(protein_variant))[0]
-                    except IndexError:
-                        # Todo: Probably CDS start was hit by removal of exon..
+                        # Up to and including the first '*', or the entire string.
+                        try:
+                            stop = unicode(protein_variant).index('*')
+                            protein_variant = protein_variant[:stop + 1]
+                        except ValueError:
+                            pass
+
+                        try:
+                            cds_length = util.cds_length(
+                                mutator.shift_sites(transcript.CDS.positionList))
+                            transcript.proteinDescription = util.protein_description(
+                                cds_length, unicode(protein_original), unicode(protein_variant))[0]
+                        except IndexError:
+                            # Todo: Probably CDS start was hit by removal of exon..
+                            transcript.proteinDescription = 'p.?'
+
+                    else:
+                        # Mutation in start codon.
                         transcript.proteinDescription = 'p.?'
 
             else:
diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py
index 4c5509ef..f45fbb01 100644
--- a/tests/test_variantchecker.py
+++ b/tests/test_variantchecker.py
@@ -1357,6 +1357,7 @@ class TestVariantchecker(MutalyzerTest):
         """
         check_variant('AB026906.1:c.276C>T', self.output)
         assert 'AB026906.1(SDHD_i001):p.(=)' in self.output.getOutput('protDescriptions')
+        assert not self.output.getOutput('newProteinFancy')
 
     @fix(cache('NM_024426.4'))
     def test_synonymous_p_is_alt_start(self):
@@ -1366,17 +1367,27 @@ class TestVariantchecker(MutalyzerTest):
         """
         check_variant('NM_024426.4:c.1107A>G', self.output)
         assert 'NM_024426.4(WT1_i001):p.(=)' in self.output.getOutput('protDescriptions')
-        assert 'CTG' in self.output.getOutput('altStart')
+        assert not self.output.getOutput('newProteinFancy')
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 1
+        assert self.output.getOutput('oldprotein')[0].startswith('M')
+        assert not self.output.getOutput('newProtein')
+        assert not self.output.getOutput('altStart')
+        assert not self.output.getOutput('altProteinFancy')
 
     @fix(cache('AB026906.1'))
     def test_start_codon(self):
         """
         Mutation of start codon should yield a p.? description.
         """
-        check_variant('AB026906.1:c.1A>T', self.output)
+        check_variant('AB026906.1:c.1A>G', self.output)
         assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions')
-        west = self.output.getMessagesWithErrorCode('WSTART')
-        assert len(west) == 1
+        wstart = self.output.getMessagesWithErrorCode('WSTART')
+        assert len(wstart) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 0
+        assert not self.output.getOutput('altStart')
 
     @fix(cache('NM_024426.4'))
     def test_start_codon_alt_start(self):
@@ -1388,3 +1399,77 @@ class TestVariantchecker(MutalyzerTest):
         assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions')
         west = self.output.getMessagesWithErrorCode('WSTART')
         assert len(west) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 1
+        assert not self.output.getOutput('altStart')
+
+    @fix(cache('AB026906.1'))
+    def test_start_codon_yield_start_p_is(self):
+        """
+        Silent mutation creating new start codon should yield a p.?
+        description. The visualisation should also render the case for the new
+        start codon.
+        """
+        check_variant('AB026906.1:c.1A>T', self.output)  # yields TTG start codon
+        assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions')
+        wstart = self.output.getMessagesWithErrorCode('WSTART')
+        assert len(wstart) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 0
+        assert self.output.getOutput('oldprotein')[0].startswith('M')
+        assert 'TTG' in self.output.getOutput('altStart')
+        assert not self.output.getOutput('altProteinFancy')
+
+    @fix(cache('NM_024426.4'))
+    def test_start_codon_alt_start_yield_start_p_is(self):
+        """
+        Silent mutation creating new start codon should yield a p.?
+        description, also with an alternative start codon. The visualisation
+        should also render the case for the new start codon.
+        """
+        check_variant('NM_024426.4:c.1C>A', self.output)  # yields ATG start codon
+        assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions')
+        west = self.output.getMessagesWithErrorCode('WSTART')
+        assert len(west) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 1
+        assert self.output.getOutput('oldprotein')[0].startswith('M')
+        assert 'ATG' in self.output.getOutput('altStart')
+        assert not self.output.getOutput('altProteinFancy')
+
+    @fix(cache('AB026906.1'))
+    def test_start_codon_yield_start(self):
+        """
+        Mutation creating new start codon should yield a p.? description. The
+        visualisation should also render the case for the new start codon.
+        """
+        check_variant('AB026906.1:c.1_4delinsTTGA', self.output)  # yields TTG start codon
+        assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions')
+        wstart = self.output.getMessagesWithErrorCode('WSTART')
+        assert len(wstart) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 0
+        assert 'TTG' in self.output.getOutput('altStart')
+        assert self.output.getOutput('altProtein')[0].startswith('M')
+
+    @fix(cache('NM_024426.4'))
+    def test_start_codon_alt_start_yield_start(self):
+        """
+        Mutation creating new start codon should yield a p.? description, also
+        with an alternative start codon. The visualisation should also render
+        the new start codon.
+        """
+        check_variant('NM_024426.4:c.1_4delinsATGA', self.output)  # yields ATG start codon
+        assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions')
+        west = self.output.getMessagesWithErrorCode('WSTART')
+        assert len(west) == 1
+        assert self.output.getOutput('newprotein')[0] == '?'
+        waltstart = self.output.getMessagesWithErrorCode('WALTSTART')
+        assert len(waltstart) == 1
+        assert self.output.getOutput('oldprotein')[0].startswith('M')
+        assert 'ATG' in self.output.getOutput('altStart')
+        assert self.output.getOutput('altProtein')[0].startswith('M')
-- 
GitLab