diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 4f95f64382703114fb18a0c50868df6115ba1e06..16667d58612d8e493b3a6a135a9e5a0a32c4382a 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -600,10 +600,6 @@ def protein_description(cds_stop, s1, s2): else: description = in_frame_description(s1, s2) - if not s2 or s1[0] != s2[0]: - # Mutation in start codon. - return 'p.?', description[1], description[2], description[3] - return description #protein_description diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index e385c336492c074ca1053ef959fefbe0acbe5d90..a3bc7a8d36e6fe59515653cf5dfb8dce6d592ffa 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -1329,6 +1329,30 @@ def _add_transcript_info(mutator, transcript, output): # Add protein prediction to output. if transcript.translate: + # Data added to the output object: + # - origCDS: Original CDS. + # - newCDS: Variant CDS. + # - oldprotein: Original protein sequence, ending with '*'. + # - newprotein: + # - If variant CDS could not be translated, this is '?'. + # - If start codon was affected, this is '?'. + # - If variant protein equals original protein, this is unset. + # - Otherwise, this is the variant protein sequence, ending with + # '*' if a stop codon was found. + # - altStart: + # - If variant CDS could be translated and variant created a new + # start codon, this is the new start codon. + # - Unset otherwise. + # - altProtein: + # - If variant CDS could be translated and variant created a new + # start codon, and variant protein does not equal original + # protein, this is the variant protein sequence, ending with '*' + # if a stop codon was found. + # - oldProteinFancy, newProteinFancy, altProteinFancy: Versions of the + # protein sequences formatted for HTML. + # - oldProteinFancyText, newProteinFancyText, altProteinFancyText: + # Versions of the protein sequences formatted for plaintext. 
+ cds_original = util.splice(mutator.orig, transcript.CDS.positionList) cds_original.alphabet = IUPAC.unambiguous_dna @@ -1343,8 +1367,6 @@ def _add_transcript_info(mutator, transcript, output): transcript.CM.orientation) cds_variant.alphabet = IUPAC.unambiguous_dna - #output.addOutput('origCDS', cds_original) - if transcript.CM.orientation == -1: cds_original = cds_original.reverse_complement() cds_variant = cds_variant.reverse_complement() @@ -1361,8 +1383,17 @@ def _add_transcript_info(mutator, transcript, output): 'In frame stop codon found.') return + if not protein_original.startswith('M'): + protein_original = 'M' + protein_original[1:] + output.addMessage(__file__, 2, 'WALTSTART', + 'Reference protein translated from alternative ' + 'start codon %s.' % (unicode(cds_original[:3]))) + protein_variant = cds_variant.translate(table=transcript.txTable) + if protein_variant: + protein_variant = 'M' + protein_variant[1:] + # Up to and including the first '*', or the entire string. try: stop = unicode(protein_variant).index('*') @@ -1370,8 +1401,6 @@ def _add_transcript_info(mutator, transcript, output): except ValueError: pass - # Note: addOutput('origCDS', ...) was first before the possible - # reverse complement operation above. output.addOutput('origCDS', unicode(cds_original)) output.addOutput("newCDS", unicode(cds_variant[:len(protein_variant) * 3])) @@ -1381,34 +1410,43 @@ def _add_transcript_info(mutator, transcript, output): # website.py. # I think it would also be nice to include the mutated list of splice # sites. - if not protein_variant or unicode(protein_variant[0]) != 'M': - # Todo: Protein differences are not color-coded, - # use something like below in protein_description(). + + if not protein_variant or unicode(cds_variant[:3]) != unicode(cds_original[:3]): + # Could not translate variant CDS or variant hits start codon. In + # that case we predict p.? and see if a non-reference start codon + # was created. 
util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancy') util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancyText', text=True) - if unicode(cds_variant[0:3]) in \ - CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: - output.addOutput('newprotein', '?') - util.print_protein_html('?', 0, 0, output, 'newProteinFancy') - util.print_protein_html('?', 0, 0, output, - 'newProteinFancyText', text=True) - output.addOutput('altStart', unicode(cds_variant[0:3])) - if unicode(protein_original[1:]) != unicode(protein_variant[1:]): - output.addOutput('altProtein', - 'M' + unicode(protein_variant[1:])) - util.print_protein_html('M' + unicode(protein_variant[1:]), 0, - 0, output, 'altProteinFancy') - util.print_protein_html('M' + unicode(protein_variant[1:]), 0, - 0, output, 'altProteinFancyText', text=True) - else : - output.addOutput('newprotein', '?') - util.print_protein_html('?', 0, 0, output, 'newProteinFancy') - util.print_protein_html('?', 0, 0, output, - 'newProteinFancyText', text=True) + output.addOutput('newprotein', '?') + util.print_protein_html('?', 0, 0, output, 'newProteinFancy') + util.print_protein_html('?', 0, 0, output, + 'newProteinFancyText', text=True) + + if protein_variant: + # Variant CDS could be translated, but start codon was + # affected. + start_codons = CodonTable.unambiguous_dna_by_id[ + transcript.txTable].start_codons + + if unicode(cds_variant[0:3]) in start_codons: + # A non-reference start codon was created. + output.addOutput('altStart', unicode(cds_variant[0:3])) + + if unicode(protein_original) != unicode(protein_variant): + # The resulting protein is actually different, so + # visualise the difference. + # Todo: Protein differences are not color-coded, + # use something like below in protein_description(). 
+ output.addOutput('altProtein', unicode(protein_variant)) + util.print_protein_html(unicode(protein_variant), 0, + 0, output, 'altProteinFancy') + util.print_protein_html(unicode(protein_variant), 0, + 0, output, 'altProteinFancyText', text=True) else: + # Variant CDS was translated and start codon is unchanged. cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) descr, first, last_original, last_variant = \ @@ -1832,20 +1870,35 @@ def check_variant(description, output): % (gene.name, transcript.name)) transcript.proteinDescription = 'p.?' else: + # Because `cds_variant` might contain additional sequence + # after the actual CDS, we cannot use `cds=True` here. + # However, we do know that the first codon is a start codon + # and hence should translate to M. Which is what happens + # with `cds=True`, but not otherwise. + # So we manually translate the first codon to M. But only + # if it was not affected by the variant. protein_variant = cds_variant.translate(table=transcript.txTable) - # Up to and including the first '*', or the entire string. - try: - stop = unicode(protein_variant).index('*') - protein_variant = protein_variant[:stop + 1] - except ValueError: - pass - try: - cds_length = util.cds_length( - mutator.shift_sites(transcript.CDS.positionList)) - transcript.proteinDescription = util.protein_description( - cds_length, unicode(protein_original), unicode(protein_variant))[0] - except IndexError: - # Todo: Probably CDS start was hit by removal of exon.. + if protein_variant and unicode(cds_variant[:3]) == unicode(cds_original[:3]): + protein_variant = protein_original[0] + protein_variant[1:] + + # Up to and including the first '*', or the entire string. 
+ try: + stop = unicode(protein_variant).index('*') + protein_variant = protein_variant[:stop + 1] + except ValueError: + pass + + try: + cds_length = util.cds_length( + mutator.shift_sites(transcript.CDS.positionList)) + transcript.proteinDescription = util.protein_description( + cds_length, unicode(protein_original), unicode(protein_variant))[0] + except IndexError: + # Todo: Probably CDS start was hit by removal of exon.. + transcript.proteinDescription = 'p.?' + + else: + # Mutation in start codon. transcript.proteinDescription = 'p.?' else: diff --git a/tests/data/NM_024426.4.gb.bz2 b/tests/data/NM_024426.4.gb.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..41c4dc2f096eb4adc4c79da86cde1f1341cbf40d Binary files /dev/null and b/tests/data/NM_024426.4.gb.bz2 differ diff --git a/tests/fixtures.py b/tests/fixtures.py index f579bcd92e684065f5205e22996ed96d8b5ccab1..c71f315f0238b7447a70c6de03d082e31c7d6909 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -60,6 +60,9 @@ REFERENCES = { 'NM_000193.2': {'filename': 'NM_000193.2.gb.bz2', 'checksum': '86d03e1cf38c1387d90116539ea0678f', 'geninfo_id': '21071042'}, + 'NM_024426.4': {'filename': 'NM_024426.4.gb.bz2', + 'checksum': '830a3beb9b7af3c6ba3e8a15b1bd0f54', + 'geninfo_id': '309951095'}, 'NP_064445.1': {'filename': 'NP_064445.1.gb.bz2', 'checksum': '33ea9315882b4a9d8c33018a201be2fa', 'geninfo_id': '9910526'}, diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 24053b70d8e63ade20c34a040a89b6a84eee649c..f45fbb01d851f10e7b79438c7ea450280c1ae8f8 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -1357,3 +1357,119 @@ class TestVariantchecker(MutalyzerTest): """ check_variant('AB026906.1:c.276C>T', self.output) assert 'AB026906.1(SDHD_i001):p.(=)' in self.output.getOutput('protDescriptions') + assert not self.output.getOutput('newProteinFancy') + + @fix(cache('NM_024426.4')) + def test_synonymous_p_is_alt_start(self): + """ + Synonymous 
mutation should yield a p.(=) description, also with an + alternative start codon. + """ + check_variant('NM_024426.4:c.1107A>G', self.output) + assert 'NM_024426.4(WT1_i001):p.(=)' in self.output.getOutput('protDescriptions') + assert not self.output.getOutput('newProteinFancy') + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert not self.output.getOutput('newprotein') + assert not self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') + + @fix(cache('AB026906.1')) + def test_start_codon(self): + """ + Mutation of start codon should yield a p.? description. + """ + check_variant('AB026906.1:c.1A>G', self.output) + assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert not self.output.getOutput('altStart') + + @fix(cache('NM_024426.4')) + def test_start_codon_alt_start(self): + """ + Mutation of start codon should yield a p.? description, also with an + alternative start codon. + """ + check_variant('NM_024426.4:c.1C>G', self.output) + assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') + west = self.output.getMessagesWithErrorCode('WSTART') + assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert not self.output.getOutput('altStart') + + @fix(cache('AB026906.1')) + def test_start_codon_yield_start_p_is(self): + """ + Silent mutation creating new start codon should yield a p.? + description. The visualisation should also render the case for the new + start codon. 
+ """ + check_variant('AB026906.1:c.1A>T', self.output) # yields TTG start codon + assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'TTG' in self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') + + @fix(cache('NM_024426.4')) + def test_start_codon_alt_start_yield_start_p_is(self): + """ + Silent mutation creating new start codon should yield a p.? + description, also with an alternative start codon. The visualisation + should also render the case for the new start codon. + """ + check_variant('NM_024426.4:c.1C>A', self.output) # yields ATG start codon + assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') + west = self.output.getMessagesWithErrorCode('WSTART') + assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'ATG' in self.output.getOutput('altStart') + assert not self.output.getOutput('altProteinFancy') + + @fix(cache('AB026906.1')) + def test_start_codon_yield_start(self): + """ + Mutation creating new start codon should yield a p.? description. The + visualisation should also render the case for the new start codon. + """ + check_variant('AB026906.1:c.1_4delinsTTGA', self.output) # yields TTG start codon + assert 'AB026906.1(SDHD_i001):p.?' in self.output.getOutput('protDescriptions') + wstart = self.output.getMessagesWithErrorCode('WSTART') + assert len(wstart) == 1 + assert self.output.getOutput('newprotein')[0] == '?' 
+ waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 0 + assert 'TTG' in self.output.getOutput('altStart') + assert self.output.getOutput('altProtein')[0].startswith('M') + + @fix(cache('NM_024426.4')) + def test_start_codon_alt_start_yield_start(self): + """ + Mutation creating new start codon should yield a p.? description, also + with an alternative start codon. The visualisation should also render + the new start codon. + """ + check_variant('NM_024426.4:c.1_4delinsATGA', self.output) # yields ATG start codon + assert 'NM_024426.4(WT1_i001):p.?' in self.output.getOutput('protDescriptions') + west = self.output.getMessagesWithErrorCode('WSTART') + assert len(west) == 1 + assert self.output.getOutput('newprotein')[0] == '?' + waltstart = self.output.getMessagesWithErrorCode('WALTSTART') + assert len(waltstart) == 1 + assert self.output.getOutput('oldprotein')[0].startswith('M') + assert 'ATG' in self.output.getOutput('altStart') + assert self.output.getOutput('altProtein')[0].startswith('M')