diff --git a/mutalyzer/util.py b/mutalyzer/util.py index ae709477bbb789b625014046f8c1f4c336981417..93d6de8f4be0bf687c00ddff375b39866ad45567 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -407,18 +407,22 @@ def read_dna(handle): return ''.join(x for x in unicode(handle.read()).upper() if x in 'ATCG') -def in_frame_description(s1, s2) : +def in_frame_description(s1, s2): """ Give a description of an inframe difference of two proteins. Also give the position at which the proteins start to differ and the positions at which they are the same again. - >>> in_frame_description('MTAPQQMT', 'MTAQQMT') + >>> in_frame_description('MTAPQQMT*', 'MTAQQMT*') ('p.(Pro4del)', 3, 4, 3) - >>> in_frame_description('MTAPQQMT', 'MTAQMT') + >>> in_frame_description('MTAPQQMT*', 'MTAQMT*') ('p.(Pro4_Gln5del)', 3, 5, 3) - >>> in_frame_description('MTAPQQT', 'MTAQQMT') + >>> in_frame_description('MTAPQQT*', 'MTAQQMT*') ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6) + >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ*') + ('p.(*9Metext*2)', 8, 9, 11) + >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ') + ('p.(*9Metext*?)', 8, 8, 10) @arg s1: The original protein. @type s1: unicode @@ -439,6 +443,10 @@ def in_frame_description(s1, s2) : # Nothing happened. return ('p.(=)', 0, 0, 0) + s2_stop = '*' in s2 + s1 = s1.rstrip('*') + s2 = s2.rstrip('*') + lcp = len(longest_common_prefix(s1, s2)) lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) s1_end = len(s1) - lcs @@ -447,9 +455,13 @@ def in_frame_description(s1, s2) : # Insertion / Duplication / Extention. if not s1_end - lcp: if len(s1) == lcp: - return ('p.(*%i%sext*%i)' % \ - (len(s1) + 1, seq3(s2[len(s1)]), abs(len(s1) - len(s2))), - len(s1), len(s1), len(s2)) + # http://www.hgvs.org/mutnomen/FAQ.html#nostop + stop = unicode(abs(len(s1) - len(s2))) if s2_stop else '?' + + return ('p.(*%i%sext*%s)' % \ + (len(s1) + 1, seq3(s2[len(s1)]), stop), + len(s1), len(s1) + 1, len(s2) + (1 if s2_stop else 0)) + ins_length = s2_end - lcp if lcp - ins_length >= 0 and s1[lcp - ins_length:lcp] == s2[lcp:s2_end]: @@ -472,7 +484,7 @@ def in_frame_description(s1, s2) : if not s2_end - lcp: if len(s2) == lcp: return ('p.(%s%i*)' % (seq3(s1[len(s2)]), len(s2) + 1), - 0, 0, 0) + lcp, len(s1) + 1, len(s2) + 1) if lcp + 1 == s1_end: return ('p.(%s%idel)' % (seq3(s1[lcp]), lcp + 1), @@ -506,12 +518,14 @@ def out_of_frame_description(s1, s2): Also give the position at which the proteins start to differ and the end positions (to be compatible with the in_frame_description function). - >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT') - ('p.(Pro4Glnfs*5)', 3, 8, 7) - >>> out_of_frame_description('MTAPQQMT', 'MTAQMT') - ('p.(Pro4Glnfs*4)', 3, 8, 6) - >>> out_of_frame_description('MTAPQQT', 'MTAQQMT') - ('p.(Pro4Glnfs*5)', 3, 7, 7) + >>> out_of_frame_description('MTAPQQMT*', 'MTAQQMT*') + ('p.(Pro4Glnfs*5)', 3, 9, 8) + >>> out_of_frame_description('MTAPQQMT*', 'MTAQMT*') + ('p.(Pro4Glnfs*4)', 3, 9, 7) + >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT*') + ('p.(Pro4Glnfs*5)', 3, 8, 8) + >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT') + ('p.(Pro4Glnfs*?)', 3, 8, 7) @arg s1: The original protein. @type s1: unicode @@ -527,33 +541,44 @@ def out_of_frame_description(s1, s2): @todo: More intelligently handle longest_common_prefix(). """ - lcp = len(longest_common_prefix(s1, s2)) + s1_seq = s1.rstrip('*') + s2_seq = s2.rstrip('*') + lcp = len(longest_common_prefix(s1_seq, s2_seq)) - if lcp == len(s2): # NonSense mutation. - if lcp == len(s1): # Is this correct? + if lcp == len(s2_seq): # NonSense mutation. + if lcp == len(s1_seq): # Is this correct? return ('p.(=)', 0, 0, 0) return ('p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1), lcp, len(s1), lcp) - if lcp == len(s1) : - return ('p.(*%i%sext*%i)' % \ - (len(s1) + 1, seq3(s2[len(s1)]), abs(len(s1) - len(s2))), - len(s1), len(s1), len(s2)) - return ('p.(%s%i%sfs*%i)' % \ - (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), len(s2) - lcp + 1), + if lcp == len(s1_seq): + # http://www.hgvs.org/mutnomen/FAQ.html#nostop + stop = unicode(abs(len(s1_seq) - len(s2_seq))) if '*' in s2 else '?' + + return ('p.(*%i%sext*%s)' % \ + (len(s1_seq) + 1, seq3(s2[len(s1_seq)]), stop), + len(s1_seq), len(s1), len(s2)) + + # http://www.hgvs.org/mutnomen/FAQ.html#nostop + stop = unicode(len(s2_seq) - lcp + 1) if '*' in s2 else '?' + + return ('p.(%s%i%sfs*%s)' % \ + (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), stop), lcp, len(s1), len(s2)) #out_of_frame_description -def protein_description(cds_stop, s1, s2) : +def protein_description(cds_stop, s1, s2): """ Wrapper function for the in_frame_description() and out_of_frame_description() functions. It uses the value cds_stop to decide which one to call. - >>> protein_description(34, 'MTAPQQMT', 'MTAQQMT') - ('p.(Pro4Glnfs*5)', 3, 8, 7) - >>> protein_description(33, 'MTAPQQMT', 'MTAQQMT') + >>> protein_description(34, 'MTAPQQMT*', 'MTAQQMT*') + ('p.(Pro4Glnfs*5)', 3, 9, 8) + >>> protein_description(34, 'MTAPQQMT*', 'MTAQQMT') + ('p.(Pro4Glnfs*?)', 3, 9, 7) + >>> protein_description(33, 'MTAPQQMT*', 'MTAQQMT*') ('p.(Pro4del)', 3, 4, 3) - >>> protein_description(33, 'MTAPQQMT', 'TTAQQMT') + >>> protein_description(33, 'MTAPQQMT*', 'TTAQQMT*') ('p.?', 0, 4, 3) @arg cds_stop: Position of the stop codon in c. notation (CDS length). @@ -639,10 +664,14 @@ def _insert_tag(s, pos1, pos2, tag1, tag2): if 0 <= pos1 < block: # Insert tag1. output = output[:pos1] + tag1 + output[pos1:] - if 0 <= pos2 < block: + if 0 < pos2 < block: # Insert tag2. output = output[:-(block - pos2)] + tag2 \ + output[-(block - pos2):] + if pos2 == block: + # Insert tag2. Special case, since s[:-0] would yield the empty + # string. + output = output + tag2 return output #_insert_tag diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 3f0ee4220d8d38451ab7cf4067f287c31ae29383..e385c336492c074ca1053ef959fefbe0acbe5d90 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -1349,22 +1349,33 @@ def _add_transcript_info(mutator, transcript, output): cds_original = cds_original.reverse_complement() cds_variant = cds_variant.reverse_complement() - if '*' in cds_original.translate(table=transcript.txTable)[:-1]: + protein_original = cds_original.translate(table=transcript.txTable) + + if not protein_original.endswith('*'): + output.addMessage(__file__, 3, 'ESTOP', + 'No stop codon found.') + return + + if '*' in protein_original[:-1]: output.addMessage(__file__, 3, 'ESTOP', 'In frame stop codon found.') return - protein_original = cds_original.translate(table=transcript.txTable, - to_stop=True) - protein_variant = cds_variant.translate(table=transcript.txTable, - to_stop=True) + protein_variant = cds_variant.translate(table=transcript.txTable) + + # Up to and including the first '*', or the entire string. + try: + stop = unicode(protein_variant).index('*') + protein_variant = protein_variant[:stop + 1] + except ValueError: + pass # Note: addOutput('origCDS', ...) was first before the possible # reverse complement operation above. output.addOutput('origCDS', unicode(cds_original)) - output.addOutput("newCDS", unicode(cds_variant[:(len(protein_variant) + 1) * 3])) + output.addOutput("newCDS", unicode(cds_variant[:len(protein_variant) * 3])) - output.addOutput('oldprotein', unicode(protein_original) + '*') + output.addOutput('oldprotein', unicode(protein_original)) # Todo: Don't generate the fancy HTML protein views here, do this in # website.py. @@ -1373,9 +1384,9 @@ def _add_transcript_info(mutator, transcript, output): if not protein_variant or unicode(protein_variant[0]) != 'M': # Todo: Protein differences are not color-coded, # use something like below in protein_description(). - util.print_protein_html(unicode(protein_original) + '*', 0, 0, + util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancy') - util.print_protein_html(unicode(protein_original) + '*', 0, 0, + util.print_protein_html(unicode(protein_original), 0, 0, output, 'oldProteinFancyText', text=True) if unicode(cds_variant[0:3]) in \ CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: @@ -1386,10 +1397,10 @@ def _add_transcript_info(mutator, transcript, output): output.addOutput('altStart', unicode(cds_variant[0:3])) if unicode(protein_original[1:]) != unicode(protein_variant[1:]): output.addOutput('altProtein', - 'M' + unicode(protein_variant[1:]) + '*') - util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, + 'M' + unicode(protein_variant[1:])) + util.print_protein_html('M' + unicode(protein_variant[1:]), 0, 0, output, 'altProteinFancy') - util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, + util.print_protein_html('M' + unicode(protein_variant[1:]), 0, 0, output, 'altProteinFancyText', text=True) else : output.addOutput('newprotein', '?') @@ -1405,18 +1416,15 @@ def _add_transcript_info(mutator, transcript, output): unicode(protein_original), unicode(protein_variant)) - # This is never used. - output.addOutput('myProteinDescription', descr) - - util.print_protein_html(unicode(protein_original) + '*', first, + util.print_protein_html(unicode(protein_original), first, last_original, output, 'oldProteinFancy') - util.print_protein_html(unicode(protein_original) + '*', first, + util.print_protein_html(unicode(protein_original), first, last_original, output, 'oldProteinFancyText', text=True) if unicode(protein_original) != unicode(protein_variant): - output.addOutput('newprotein', unicode(protein_variant) + '*') - util.print_protein_html(unicode(protein_variant) + '*', first, + output.addOutput('newprotein', unicode(protein_variant)) + util.print_protein_html(unicode(protein_variant), first, last_variant, output, 'newProteinFancy') - util.print_protein_html(unicode(protein_variant) + '*', first, + util.print_protein_html(unicode(protein_variant), first, last_variant, output, 'newProteinFancyText', text=True) #_add_transcript_info @@ -1808,8 +1816,7 @@ def check_variant(description, output): if not len(cds_original) % 3: try: # FIXME this is a bit of a rancid fix. - protein_original = cds_original.translate( - table=transcript.txTable, cds=True, to_stop=True) + protein_original = cds_original.translate(table=transcript.txTable, cds=True) except CodonTable.TranslationError: if transcript.current: output.addMessage( @@ -1825,8 +1832,13 @@ def check_variant(description, output): % (gene.name, transcript.name)) transcript.proteinDescription = 'p.?' else: - protein_variant = cds_variant.translate( - table=transcript.txTable, to_stop=True) + protein_variant = cds_variant.translate(table=transcript.txTable) + # Up to and including the first '*', or the entire string. + try: + stop = unicode(protein_variant).index('*') + protein_variant = protein_variant[:stop + 1] + except ValueError: + pass try: cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) diff --git a/tests/data/NM_001199.3.gb.bz2 b/tests/data/NM_001199.3.gb.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..a7dd692a2d5eac5e71aa433be19ae21ecbc639d0 Binary files /dev/null and b/tests/data/NM_001199.3.gb.bz2 differ diff --git a/tests/fixtures.py b/tests/fixtures.py index 652d2f86f4a1a593601958b756aa0c485fa98939..f579bcd92e684065f5205e22996ed96d8b5ccab1 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -51,6 +51,8 @@ REFERENCES = { 'NM_002001.2': {'filename': 'NM_002001.2.gb.bz2', 'checksum': '7fd5aa4fe864fd5193f224fca8cea70d', 'geninfo_id': '31317229'}, + 'NM_001199.3': {'filename': 'NM_001199.3.gb.bz2', + 'checksum': 'e750b6dcead66b8bb953ce445bcd3093'}, 'NG_008939.1': {'filename': 'NG_008939.1.gb.bz2', 'checksum': '114a03e16ad2f63531d796c2fb0d7039', 'geninfo_id': '211938431', diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 735ff27b7881fc6371aca89ee4036697e80d8678..8f0eea3c94265ae28c808f07ae67a8cccc61417d 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -1316,3 +1316,36 @@ class TestVariantchecker(MutalyzerTest): check_variant('NM_003002.2:c.1del', self.output) w_exon_annotation = self.output.getMessagesWithErrorCode('WEXON_ANNOTATION') assert len(w_exon_annotation) == 0 + + @fix(cache('NM_001199.3')) + def test_fs_no_stop(self): + """ + Frame shift yielding no stop codon should be described with + uncertainty of the stop codon. + + http://www.hgvs.org/mutnomen/FAQ.html#nostop + """ + check_variant('NM_001199.3(BMP1):c.2188dup', self.output) + assert 'NM_001199.3(BMP1_i001):p.(Gln730Profs*?)' in self.output.getOutput('protDescriptions') + + @fix(cache('NM_000193.2')) + def test_ext_no_stop(self): + """ + Extension yielding no stop codon should be described with + uncertainty of the stop codon. + + http://www.hgvs.org/mutnomen/FAQ.html#nostop + """ + check_variant('NM_000193.2:c.1388G>C', self.output) + assert 'NM_000193.2(SHH_i001):p.(*463Serext*?)' in self.output.getOutput('protDescriptions') + + @fix(cache('NM_000193.2')) + def test_fs_ext_no_stop(self): + """ + Extension yielding no stop codon should be described with + uncertainty of the stop codon. + + http://www.hgvs.org/mutnomen/FAQ.html#nostop + """ + check_variant('NM_000193.2:c.1388_1389insC', self.output) + assert 'NM_000193.2(SHH_i001):p.(*463Cysext*?)' in self.output.getOutput('protDescriptions')