diff --git a/src/Modules/Mutator.py b/src/Modules/Mutator.py index a913397ae094269556202566304553c9121018a7..f9af2303c2659bca45c07f3862565de9eb7955e6 100644 --- a/src/Modules/Mutator.py +++ b/src/Modules/Mutator.py @@ -22,7 +22,7 @@ The original as well as the mutated string are stored here. # - Mutator ; Mutate a string and register all shift points. -from itertools import izip_longest +from itertools import ifilter, izip_longest from Bio import Restriction from Bio.Seq import Seq from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA @@ -43,6 +43,8 @@ class Mutator() : where the modifications in length are stored. Each first element of the tuples in this list is unique, each second element is non-zero. + - __removed_sites ; Set of splice sites to ignore in mutated + string. - __restrictionBatch ; Public variables: @@ -106,6 +108,7 @@ class Mutator() : self.__config = config self.__output = output self.__shift = [] + self.__removed_sites = set() self.__restrictionBatch = Restriction.RestrictionBatch([], ['N']) self.orig = orig @@ -363,6 +366,19 @@ class Mutator() : return ret #shiftpos + def add_removed_sites(self, sites): + """ + Add sites to the set of splice sites to ignore in the mutated string. + + @arg sites: A list of splice sites to ignore. + @type sites: list of int + + @todo: The resulting number of ignored sites should always be even. + """ + for site in sites: + self.__removed_sites.add(site) + #add_removed_sites + def newSplice(self, sites) : """ Generate a list of new splice sites. @@ -374,7 +390,7 @@ class Mutator() : @rtype: list of int - Example 1 (DNA): NG_012772.1 + Example 1 (DNA): NG_012772.1(BRCA2_v001) ...---------[=========]----------... 
^ ^ @@ -430,8 +446,9 @@ class Mutator() : new_sites = [] - prev_donor = sites[0] - 1 - sites_iter = iter(sites) + prev_donor = None + sites_iter = ifilter(lambda s: s not in self.__removed_sites, sites) + for acceptor, donor in izip_longest(sites_iter, sites_iter): # We don't want to do the -1+1 dance if @@ -446,7 +463,8 @@ class Mutator() : # Condition 3) makes sure we don't include insertions directly # in front of CDS start in the CDS. It also affects translation # start, but this should be no problem. - if prev_donor == acceptor - 1 or self.shift_minus_at(acceptor): + if not prev_donor or prev_donor == acceptor - 1 or \ + self.shift_minus_at(acceptor): new_sites.append(self.shiftpos(acceptor)) else: new_sites.append(self.shiftpos(acceptor - 1) + 1) diff --git a/src/Mutalyzer.py b/src/Mutalyzer.py index 3a3edeaa22db37995152afd7c1797fb1fef888f0..6a7a24d5485828e01285e00e7f0880cf7100315c 100644 --- a/src/Mutalyzer.py +++ b/src/Mutalyzer.py @@ -536,7 +536,7 @@ def __overSplice(pos1, pos2, sites) : @arg pos1: The first coordinate of the range in g. notation. @type pos1: integer - @arg pos2: The first coordinate of the range in g. notation. + @arg pos2: The second coordinate of the range in g. notation. @type pos2: integer @arg sites: A list of splice sites in g. notation. 
@type sites: list(integer) @@ -1043,6 +1043,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) : Arg1, start_g, start_g + 1, MUU.mutated[newStart + shift:newStop + shift], newStart + shift, newStart + shift + 1)) + #if if shift != roll[1]: O.addMessage(__file__, 1, "IROLLBACK", "Insertion of %s at position %i_%i was not corrected to an " \ @@ -1055,6 +1056,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) : GenRecordInstance.name(start_g, start_g + 1, "ins", MUU.mutated[newStart + shift:newStop + shift] , "", (roll[0], shift)) + #else #checkInsertion def __ivs2g(location, transcript) : @@ -1149,7 +1151,10 @@ def __normal2g(RawVar, transcript) : def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) : """ + Process one raw variant. + @todo: documentation + @todo: parts argument is not used """ # FIXME check this @@ -1229,10 +1234,54 @@ def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) : Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1) Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2) + splice_abort = False + + # If we hit a splice site, issue a warning. Later on we decide if we + # can still process this variant in any way (e.g. if it deletes an + # entire exon). if transcript and __overSplice(start_g, end_g, transcript.CM.RNA) : + splice_abort = True O.addMessage(__file__, 2, "WOVERSPLICE", "Variant hits one or more splice sites.") + # If we have a deletion, and it covers exactly an even number of splice + # sites, remove these splice sites. + # Todo: Special cases for first/last exon? Upstream/downstream exons? + # Note, this is not the same as __overSplice(). Here we collect + # sites where the deletion borders the exon/intron boundary. 
+ if transcript and RawVar.MutationType == 'del': + removed_sites = [] + sites = iter(transcript.CM.RNA) + for acceptor, donor in izip_longest(sites, sites): + if start_g <= acceptor <= end_g + 1: + removed_sites.append(acceptor) + if start_g - 1 <= donor <= end_g: + removed_sites.append(donor) + + if len(removed_sites) and not len(removed_sites) % 2: + # An even number of splice sites was removed. We can deal with + # this, but issue a warning. + splice_abort = False + MUU.add_removed_sites(removed_sites) + O.addMessage(__file__, 1, "IDELSPLICE", "Removed %i splice " \ + "sites from transcript." % len(removed_sites)) + + # If splice_abort is set, this basically means WOVERSPLICE was called and + # IDELSPLICE was not called. + # I guess in that case we do want to generate the visualisation, the + # genomic description, and affected transcripts. But NOT the predicted + # protein. + # The following solution is a bit of a hack. By setting the .translate + # field of the transcript to False, we force that no protein is predicted. + if splice_abort: + #return + transcript.translate = False + # The affected protein description for this transcript will now be + # a question mark, e.g. "NG_012772.1(BRCA2_i001):?". But protein + # descriptions for other transcripts (where splice sites are also + # crippled) are still shown. I think we ideally would not want this. + # However, some transcripts might be unaffected and should be shown. 
+ if RawVar.MutationType in ["del", "dup", "subst", "delins"] : __checkOptArg(MUU.orig, start_g, end_g, Arg1, O) diff --git a/src/tests/test_mutalyzer.py b/src/tests/test_mutalyzer.py index d6420ec8fbf05f1ca31d820d0855df401055a330..a1996b39e519e9da89fdf67a673351faa374163e 100755 --- a/src/tests/test_mutalyzer.py +++ b/src/tests/test_mutalyzer.py @@ -84,6 +84,7 @@ class TestMutalyzer(unittest.TestCase): """ Mutalyzer.process('NM_000143.3:c.-1_1insCAT', self.config, self.output) self.assertEqual(self.output.getIndexedOutput("newprotein", 0), None) + # Todo: is this a good test? def test_ins_cds_start_after(self): """ @@ -91,6 +92,100 @@ class TestMutalyzer(unittest.TestCase): """ Mutalyzer.process('NM_000143.3:c.1_2insCAT', self.config, self.output) self.assertEqual(self.output.getIndexedOutput("newprotein", 0), '?') + # Todo: is this a good test? + + def test_del_splice_site(self): + """ + Deletion hitting one splice site should not be possible. + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_670del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) == 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. + self.assertFalse(self.output.getOutput('newprotein')) + + def test_del_exon(self): + """ + Deletion of an entire exon should be possible. + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_681+7del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) > 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. 
+ self.assertTrue(self.output.getOutput('newprotein')) + + def test_del_exon_in_frame(self): + """ + Deletion of an entire exon with a length that is a multiple of three should give a + protein product with just this deletion (and possibly substitutions + directly before and after). + + NG_012772.1(BRCA2_v001):c.68-7_316+7del is such a variant, since + positions 68 through 316 are exactly one exon and (316-68+1)/3 = 83. + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.68-7_316+7del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) > 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. + self.assertTrue(self.output.getOutput('newprotein')) + # Todo: assert that protein products indeed have only this difference. + + def test_del_exons(self): + """ + Deletion of two entire exons should be possible. + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_793+7del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) > 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. + self.assertTrue(self.output.getOutput('newprotein')) + + def test_del_intron(self): + """ + Deletion of an entire intron should be possible (fusion of remaining + exonic parts). + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.622_674del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) > 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. 
+ self.assertTrue(self.output.getOutput('newprotein')) + + def test_del_intron_in_frame(self): + """ + Deletion of an entire intron should be possible (fusion of remaining + exonic parts). + """ + Mutalyzer.process('NG_012772.1(BRCA2_v001):c.622_672del', + self.config, self.output) + woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE') + self.assertTrue(len(woversplice) > 0) + idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE') + self.assertTrue(len(idelsplice) > 0) + # Todo: For now, the following is how to check if no proteins + # prediction is done. + self.assertTrue(self.output.getOutput('newprotein')) + # Todo: assert that protein products indeed have only this difference. if __name__ == '__main__': diff --git a/src/tests/test_mutator.py b/src/tests/test_mutator.py index fd6af9fa2430998e98b310142d36f8d7aed2634e..7f942ccf93ff93f9e8c61f29d2a9c1744a6a4805 100755 --- a/src/tests/test_mutator.py +++ b/src/tests/test_mutator.py @@ -668,6 +668,30 @@ class TestMutator(unittest.TestCase): m.insM(18, 'AT') # g.18_19insAT self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 29]) + def test_newSplice_removed_sites(self): + """ + After removing splice sites, newSplice() should filter them. + """ + l = 40 + sites = [4, 9, 14, 19, 25, 27, 32, 38] + m = self._mutator(_seq(l)) + m.add_removed_sites([19, 25]) + self.assertEqual(m.newSplice(sites), [4, 9, 14, 27, 32, 38]) + m.add_removed_sites([27, 32]) + self.assertEqual(m.newSplice(sites), [4, 9, 14, 38]) + m.insM(13, 'A') # g.13_14insA + self.assertEqual(m.newSplice(sites), [4, 9, 14, 39]) + + def test_sites_even_invariant(self): + """ + The number of splice sites should always be even. Modifying the list + of splice sites must always prevent the result of an odd number of + splice sites. + + Todo: this test. + """ + pass + if __name__ == '__main__': # Usage: