From 851e71fef32e4f4861a7609077ff02c97b069622 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Tue, 22 Sep 2015 09:16:24 +0200 Subject: [PATCH] Visualise protein change, also with alternative start In the case of an alternative start codon (in the reference CDS), protein changes were not visualised. This is fixed and a WALTSTART warning is also issued. Also, if a new non-reference start codon is created by the variant, visualise this as such. --- mutalyzer/util.py | 4 -- mutalyzer/variantchecker.py | 124 +++++++++++++++++++++++------------ tests/test_variantchecker.py | 93 ++++++++++++++++++++++++-- 3 files changed, 172 insertions(+), 49 deletions(-) diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 4f95f643..16667d58 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -600,10 +600,6 @@ def protein_description(cds_stop, s1, s2): else: description = in_frame_description(s1, s2) - if not s2 or s1[0] != s2[0]: - # Mutation in start codon. - return 'p.?', description[1], description[2], description[3] - return description #protein_description diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index ca35dd49..a3bc7a8d 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -1329,6 +1329,30 @@ def _add_transcript_info(mutator, transcript, output): # Add protein prediction to output. if transcript.translate: + # Data added to the output object: + # - origCDS: Original CDS. + # - newCDS: Variant CDS. + # - oldprotein: Original protein sequence, ending with '*'. + # - newprotein: + # - If variant CDS could not be translated, this is '?'. + # - If start codon was affected, this is '?'. + # - If variant protein equals original protein, this is unset. + # - Otherwise, this is the variant protein sequence, ending with + # '*' if a stop codon was found. + # - altStart: + # - If variant CDS could be translated and variant created a new + # start codon, this is the new start codon. + # - Unset otherwise. + # - altProtein: + # - If variant CDS could be translated and variant created a new + # start codon, and variant protein does not equal original + # protein, this is the variant protein sequence, ending with '*' + # if a stop codon was found. + # - oldProteinFancy, newProteinFancy, altProteinFancy: Versions of the + # protein sequences formatted for HTML. + # - oldProteinFancyText, newProteinFancyText, altProteinFancyText: + # Versions of the protein sequences formatted for plaintext. + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) cds_original.alphabet = IUPAC.unambiguous_dna @@ -1343,8 +1367,6 @@ def _add_transcript_info(mutator, transcript, output): transcript.CM.orientation) cds_variant.alphabet = IUPAC.unambiguous_dna - #output.addOutput('origCDS', cds_original) - if transcript.CM.orientation == -1: cds_original = cds_original.reverse_complement() cds_variant = cds_variant.reverse_complement() @@ -1361,8 +1383,17 @@ def _add_transcript_info(mutator, transcript, output): 'In frame stop codon found.') return + if not protein_original.startswith('M'): + protein_original = 'M' + protein_original[1:] + output.addMessage(__file__, 2, 'WALTSTART', + 'Reference protein translated from alternative ' + 'start codon %s.' % (unicode(cds_original[:3]))) + protein_variant = cds_variant.translate(table=transcript.txTable) + if protein_variant: + protein_variant = 'M' + protein_variant[1:] + # Up to and including the first '*', or the entire string. try: stop = unicode(protein_variant).index('*') @@ -1370,8 +1401,6 @@ def _add_transcript_info(mutator, transcript, output): except ValueError: pass - # Note: addOutput('origCDS', ...) was first before the possible - # reverse complement operation above. output.addOutput('origCDS', unicode(cds_original)) output.addOutput("newCDS", unicode(cds_variant[:len(protein_variant) * 3])) @@ -1381,34 +1410,43 @@ def _add_transcript_info(mutator, transcript, output): # website.py. # I think it would also be nice to include the mutated list of splice # sites. - if not protein_variant or unicode(protein_variant[0]) != 'M': - # Todo: Protein differences are not color-coded, - # use something like below in protein_description(). + + if not protein_variant or unicode(cds_variant[:3]) != unicode(cds_original[:3]): + # Could not translate variant CDS or variant hits start codon. In + # that case we predict p.? and see if a non-reference start codon + # was created. util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancy') util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancyText', text=True) - if unicode(cds_variant[0:3]) in \ - CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: - output.addOutput('newprotein', '?') - util.print_protein_html('?', 0, 0, output, 'newProteinFancy') - util.print_protein_html('?', 0, 0, output, - 'newProteinFancyText', text=True) - output.addOutput('altStart', unicode(cds_variant[0:3])) - if unicode(protein_original[1:]) != unicode(protein_variant[1:]): - output.addOutput('altProtein', - 'M' + unicode(protein_variant[1:])) - util.print_protein_html('M' + unicode(protein_variant[1:]), 0, - 0, output, 'altProteinFancy') - util.print_protein_html('M' + unicode(protein_variant[1:]), 0, - 0, output, 'altProteinFancyText', text=True) - else : - output.addOutput('newprotein', '?') - util.print_protein_html('?', 0, 0, output, 'newProteinFancy') - util.print_protein_html('?', 0, 0, output, - 'newProteinFancyText', text=True) + output.addOutput('newprotein', '?') + util.print_protein_html('?', 0, 0, output, 'newProteinFancy') + util.print_protein_html('?', 0, 0, output, + 'newProteinFancyText', text=True) + + if protein_variant: + # Variant CDS could be translated, but start codon was + # affected. + start_codons = CodonTable.unambiguous_dna_by_id[ + transcript.txTable].start_codons + + if unicode(cds_variant[0:3]) in start_codons: + # A non-reference start codon was created. + output.addOutput('altStart', unicode(cds_variant[0:3])) + + if unicode(protein_original) != unicode(protein_variant): + # The resulting protein is actually different, so + # visualise the difference. + # Todo: Protein differences are not color-coded, + # use something like below in protein_description(). + output.addOutput('altProtein', unicode(protein_variant)) + util.print_protein_html(unicode(protein_variant), 0, + 0, output, 'altProteinFancy') + util.print_protein_html(unicode(protein_variant), 0, + 0, output, 'altProteinFancyText', text=True) else: + # Variant CDS was translated and start codon is unchanged. cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) descr, first, last_original, last_variant = \ @@ -1840,23 +1878,27 @@ def check_variant(description, output): # So we manually translate the first codon to M. But only # if it was not affected by the variant. protein_variant = cds_variant.translate(table=transcript.txTable) - if unicode(cds_variant[:3]) == unicode(cds_original[:3]): + if protein_variant and unicode(cds_variant[:3]) == unicode(cds_original[:3]): protein_variant = protein_original[0] + protein_variant[1:] - # Up to and including the first '*', or the entire string. - try: - stop = unicode(protein_variant).index('*') - protein_variant = protein_variant[:stop + 1] - except ValueError: - pass - - try: - cds_length = util.cds_length( - mutator.shift_sites(transcript.CDS.positionList)) - transcript.proteinDescription = util.protein_description( - cds_length, unicode(protein_original), unicode(protein_variant))[0] - except IndexError: - # Todo: Probably CDS start was hit by removal of exon.. + # Up to and including the first '*', or the entire string. + try: + stop = unicode(protein_variant).index('*') + protein_variant = protein_variant[:stop + 1] + except ValueError: + pass + + try: + cds_length = util.cds_length( + mutator.shift_sites(transcript.CDS.positionList)) + transcript.proteinDescription = util.protein_description( + cds_length, unicode(protein_original), unicode(protein_variant))[0] + except IndexError: + # Todo: Probably CDS start was hit by removal of exon.. + transcript.proteinDescription = 'p.?' + + else: + # Mutation in start codon. transcript.proteinDescription = 'p.?' else: diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 4c5509ef..f45fbb01 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -1357,6 +1357,7 @@ class TestVariantchecker(MutalyzerTest): """ check_variant('AB026906.1:c.276C>T', self.output) assert 'AB026906.1(SDHD_i001):p.(=)' in self.output.getOutput('protDescriptions') + assert not self.output.getOutput('newProteinFancy') @fix(cache('NM_024426.4')) def test_synonymous_p_is_alt_start(self): @@ -1366,17 +1367,27 @@ class TestVariantchecker(MutalyzerTest): """ check_variant('NM_024426.4:c.1107A>G', self.output) assert 'NM_024426.4(WT1_i001):p.(=)' in self.output.getOutput('protDescriptions') - assert 'CTG' in self.output.getOutput('altStart') + assert not self.output.getOutput('newProteinFancy') + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert not self.output.getOutput('newProtein') + assert not self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') @fix(cache('AB026906.1')) def test_start_codon(self): """ Mutation of start codon should yield a p.? description. """ - check_variant('AB026906.1:c.1A>T', self.output) + check_variant('AB026906.1:c.1A>G', self.output) assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') - west = self.output.getMessagesWithErrorCode('WSTART') - assert len(west) == 1 + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert not self.output.getOutput('altStart') @fix(cache('NM_024426.4')) def test_start_codon_alt_start(self): @@ -1388,3 +1399,77 @@ class TestVariantchecker(MutalyzerTest): assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') west = self.output.getMessagesWithErrorCode('WSTART') assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert not self.output.getOutput('altStart') + + @fix(cache('AB026906.1')) + def test_start_codon_yield_start_p_is(self): + """ + Silent mutation creating new start codon should yield a p.? + description. The visualisation should also render the case for the new + start codon. + """ + check_variant('AB026906.1:c.1A>T', self.output) # yields TTG start codon + assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'TTG' in self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') + + @fix(cache('NM_024426.4')) + def test_start_codon_alt_start_yield_start_p_is(self): + """ + Silent mutation creating new start codon should yield a p.? + description, also with an alternative start codon. The visualisation + should also render the case for the new start codon. + """ + check_variant('NM_024426.4:c.1C>A', self.output) # yields ATG start codon + assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') + west = self.output.getMessagesWithErrorCode('WSTART') + assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'ATG' in self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') + + @fix(cache('AB026906.1')) + def test_start_codon_yield_start(self): + """ + Mutation creating new start codon should yield a p.? description. The + visualisation should also render the case for the new start codon. + """ + check_variant('AB026906.1:c.1_4delinsTTGA', self.output) # yields TTG start codon + assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert 'TTG' in self.output.getOutput('altStart') + assert self.output.getOutput('altProtein')[0].startswith('M') + + @fix(cache('NM_024426.4')) + def test_start_codon_alt_start_yield_start(self): + """ + Mutation creating new start codon should yield a p.? description, also + with an alternative start codon. The visualisation should also render + the new start codon. + """ + check_variant('NM_024426.4:c.1_4delinsATGA', self.output) # yields ATG start codon + assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') + west = self.output.getMessagesWithErrorCode('WSTART') + assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'ATG' in self.output.getOutput('altStart') + assert self.output.getOutput('altProtein')[0].startswith('M') -- GitLab