Skip to content
Snippets Groups Projects
Commit e301a743 authored by Vermaat's avatar Vermaat
Browse files

Try to handle whole-exon deletion and exon-fusion.

src/Modules/Mutator.py:
- Added method (add_removed_sites) to give splice sites that should be
  ignored. Now newSplice() does not process these splice sites.

src/Mutalyzer.py:
- Handle whole-exon deletions and exon-fusions by removing splice sites
  where we can. A notice is printed in this case. Otherwise, if splice
  sites are hit in some other way, no protein product is predicted. This
  fixes Trac issues #35 and #36.



git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@195 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
parent 6da9de0a
No related branches found
No related tags found
No related merge requests found
...@@ -22,7 +22,7 @@ The original as well as the mutated string are stored here. ...@@ -22,7 +22,7 @@ The original as well as the mutated string are stored here.
# - Mutator ; Mutate a string and register all shift points. # - Mutator ; Mutate a string and register all shift points.
from itertools import izip_longest from itertools import ifilter, izip_longest
from Bio import Restriction from Bio import Restriction
from Bio.Seq import Seq from Bio.Seq import Seq
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
...@@ -43,6 +43,8 @@ class Mutator() : ...@@ -43,6 +43,8 @@ class Mutator() :
where the modifications in length are stored. where the modifications in length are stored.
Each first element of the tuples in this list Each first element of the tuples in this list
is unique, each second element is non-zero. is unique, each second element is non-zero.
- __removed_sites ; Set of splice sites to ignore in mutated
string.
- __restrictionBatch ; - __restrictionBatch ;
Public variables: Public variables:
...@@ -106,6 +108,7 @@ class Mutator() : ...@@ -106,6 +108,7 @@ class Mutator() :
self.__config = config self.__config = config
self.__output = output self.__output = output
self.__shift = [] self.__shift = []
self.__removed_sites = set()
self.__restrictionBatch = Restriction.RestrictionBatch([], ['N']) self.__restrictionBatch = Restriction.RestrictionBatch([], ['N'])
self.orig = orig self.orig = orig
...@@ -363,6 +366,19 @@ class Mutator() : ...@@ -363,6 +366,19 @@ class Mutator() :
return ret return ret
#shiftpos #shiftpos
def add_removed_sites(self, sites):
    """
    Add sites to the set of splice sites to ignore in the mutated string.

    The sites are stored in a set, so sites registered by earlier calls are
    kept and adding the same site twice has no further effect.

    @arg sites: A list of splice sites to ignore.
    @type sites: list of int

    @todo: The resulting number of ignored sites should always be even.
    """
    self.__removed_sites.update(sites)
#add_removed_sites
def newSplice(self, sites) : def newSplice(self, sites) :
""" """
Generate a list of new splice sites. Generate a list of new splice sites.
...@@ -374,7 +390,7 @@ class Mutator() : ...@@ -374,7 +390,7 @@ class Mutator() :
@rtype: list of int @rtype: list of int
Example 1 (DNA): NG_012772.1 Example 1 (DNA): NG_012772.1(BRCA2_v001)
...---------[=========]----------... ...---------[=========]----------...
^ ^ ^ ^
...@@ -430,8 +446,9 @@ class Mutator() : ...@@ -430,8 +446,9 @@ class Mutator() :
new_sites = [] new_sites = []
prev_donor = sites[0] - 1 prev_donor = None
sites_iter = iter(sites) sites_iter = ifilter(lambda s: s not in self.__removed_sites, sites)
for acceptor, donor in izip_longest(sites_iter, sites_iter): for acceptor, donor in izip_longest(sites_iter, sites_iter):
# We don't want to do the -1+1 dance if # We don't want to do the -1+1 dance if
...@@ -446,7 +463,8 @@ class Mutator() : ...@@ -446,7 +463,8 @@ class Mutator() :
# Condition 3) makes sure we don't include insertions directly # Condition 3) makes sure we don't include insertions directly
# in front of CDS start in the CDS. It also affects translation # in front of CDS start in the CDS. It also affects translation
# start, but this should be no problem. # start, but this should be no problem.
if prev_donor == acceptor - 1 or self.shift_minus_at(acceptor): if not prev_donor or prev_donor == acceptor - 1 or \
self.shift_minus_at(acceptor):
new_sites.append(self.shiftpos(acceptor)) new_sites.append(self.shiftpos(acceptor))
else: else:
new_sites.append(self.shiftpos(acceptor - 1) + 1) new_sites.append(self.shiftpos(acceptor - 1) + 1)
......
...@@ -536,7 +536,7 @@ def __overSplice(pos1, pos2, sites) : ...@@ -536,7 +536,7 @@ def __overSplice(pos1, pos2, sites) :
@arg pos1: The first coordinate of the range in g. notation. @arg pos1: The first coordinate of the range in g. notation.
@type pos1: integer @type pos1: integer
@arg pos2: The first coordinate of the range in g. notation. @arg pos2: The second coordinate of the range in g. notation.
@type pos2: integer @type pos2: integer
@arg sites: A list of splice sites in g. notation. @arg sites: A list of splice sites in g. notation.
@type sites: list(integer) @type sites: list(integer)
...@@ -1043,6 +1043,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) : ...@@ -1043,6 +1043,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) :
Arg1, start_g, start_g + 1, Arg1, start_g, start_g + 1,
MUU.mutated[newStart + shift:newStop + shift], MUU.mutated[newStart + shift:newStop + shift],
newStart + shift, newStart + shift + 1)) newStart + shift, newStart + shift + 1))
#if
if shift != roll[1]: if shift != roll[1]:
O.addMessage(__file__, 1, "IROLLBACK", O.addMessage(__file__, 1, "IROLLBACK",
"Insertion of %s at position %i_%i was not corrected to an " \ "Insertion of %s at position %i_%i was not corrected to an " \
...@@ -1055,6 +1056,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) : ...@@ -1055,6 +1056,7 @@ def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) :
GenRecordInstance.name(start_g, start_g + 1, "ins", GenRecordInstance.name(start_g, start_g + 1, "ins",
MUU.mutated[newStart + shift:newStop + shift] , "", MUU.mutated[newStart + shift:newStop + shift] , "",
(roll[0], shift)) (roll[0], shift))
#else
#checkInsertion #checkInsertion
def __ivs2g(location, transcript) : def __ivs2g(location, transcript) :
...@@ -1149,7 +1151,10 @@ def __normal2g(RawVar, transcript) : ...@@ -1149,7 +1151,10 @@ def __normal2g(RawVar, transcript) :
def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) : def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) :
""" """
Process one raw variant.
@todo: documentation @todo: documentation
@todo: parts argument is not used
""" """
# FIXME check this # FIXME check this
...@@ -1229,10 +1234,54 @@ def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) : ...@@ -1229,10 +1234,54 @@ def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) :
Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1) Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1)
Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2) Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2)
splice_abort = False
# If we hit a splice site, issue a warning. Later on we decide if we
# can still process this variant in any way (e.g. if it deletes an
# entire exon).
if transcript and __overSplice(start_g, end_g, transcript.CM.RNA) : if transcript and __overSplice(start_g, end_g, transcript.CM.RNA) :
splice_abort = True
O.addMessage(__file__, 2, "WOVERSPLICE", O.addMessage(__file__, 2, "WOVERSPLICE",
"Variant hits one or more splice sites.") "Variant hits one or more splice sites.")
# If we have a deletion, and it covers exactly an even number of splice
# sites, remove these splice sites.
# Todo: Special cases for first/last exon? Upstream/downstream exons?
# Note, this is not the same as __overSplice(). Here we collect
# sites where the deletion borders the exon/intron boundary.
if transcript and RawVar.MutationType == 'del':
removed_sites = []
sites = iter(transcript.CM.RNA)
for acceptor, donor in izip_longest(sites, sites):
if start_g <= acceptor <= end_g + 1:
removed_sites.append(acceptor)
if start_g - 1 <= donor <= end_g:
removed_sites.append(donor)
if len(removed_sites) and not len(removed_sites) % 2:
# An even number of splice sites was removed. We can deal with
# this, but issue a warning.
splice_abort = False
MUU.add_removed_sites(removed_sites)
O.addMessage(__file__, 1, "IDELSPLICE", "Removed %i splice " \
"sites from transcript." % len(removed_sites))
# If splice_abort is set, this basically means WOVERSPLICE was called and
# IDELSPLICE was not called.
# I guess in that case we do want to generate the visualisation, the
# genomic description, and affected transcripts. But NOT the predicted
# protein.
# The following solution is a bit of a hack. By setting the .translate
# field of the transcript to False, we force that no protein is predicted.
if splice_abort:
#return
transcript.translate = False
# The affected protein description for this transcript will now be
# a question mark, e.g. "NG_012772.1(BRCA2_i001):?". But protein
# descriptions for other transcripts (where splice sites are also
# crippled) are still shown. I think we ideally would not want this.
# However, some transcripts might be unaffected and should be shown.
if RawVar.MutationType in ["del", "dup", "subst", "delins"] : if RawVar.MutationType in ["del", "dup", "subst", "delins"] :
__checkOptArg(MUU.orig, start_g, end_g, Arg1, O) __checkOptArg(MUU.orig, start_g, end_g, Arg1, O)
......
...@@ -84,6 +84,7 @@ class TestMutalyzer(unittest.TestCase): ...@@ -84,6 +84,7 @@ class TestMutalyzer(unittest.TestCase):
""" """
Mutalyzer.process('NM_000143.3:c.-1_1insCAT', self.config, self.output) Mutalyzer.process('NM_000143.3:c.-1_1insCAT', self.config, self.output)
self.assertEqual(self.output.getIndexedOutput("newprotein", 0), None) self.assertEqual(self.output.getIndexedOutput("newprotein", 0), None)
# Todo: is this a good test?
def test_ins_cds_start_after(self): def test_ins_cds_start_after(self):
""" """
...@@ -91,6 +92,100 @@ class TestMutalyzer(unittest.TestCase): ...@@ -91,6 +92,100 @@ class TestMutalyzer(unittest.TestCase):
""" """
Mutalyzer.process('NM_000143.3:c.1_2insCAT', self.config, self.output) Mutalyzer.process('NM_000143.3:c.1_2insCAT', self.config, self.output)
self.assertEqual(self.output.getIndexedOutput("newprotein", 0), '?') self.assertEqual(self.output.getIndexedOutput("newprotein", 0), '?')
# Todo: is this a good test?
def test_del_splice_site(self):
    """
    Deletion hitting one splice site should not be possible.

    The variant must trigger the WOVERSPLICE warning, must not trigger the
    IDELSPLICE notice, and no protein prediction may be produced.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_670del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertEqual(len(idelsplice), 0)
    # Todo: For now, the following is how to check that no protein
    # prediction is done.
    self.assertFalse(self.output.getOutput('newprotein'))
def test_del_exon(self):
    """
    Deletion of an entire exon should be possible.

    Both the WOVERSPLICE warning and the IDELSPLICE notice are expected,
    and a protein prediction must still be produced.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_681+7del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertTrue(len(idelsplice) > 0)
    # Todo: For now, the following is how to check that a protein
    # prediction is done.
    self.assertTrue(self.output.getOutput('newprotein'))
def test_del_exon_in_frame(self):
    """
    Deletion of an entire exon whose length is a multiple of three should
    give a protein product with just this deletion (and possibly
    substitutions directly before and after).

    NG_012772.1(BRCA2_v001):c.68-7_316+7del is such a variant, since
    positions 68 through 316 are exactly one exon and (316-68+1)/3 = 83.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.68-7_316+7del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertTrue(len(idelsplice) > 0)
    # Todo: For now, the following is how to check that a protein
    # prediction is done.
    self.assertTrue(self.output.getOutput('newprotein'))
    # Todo: assert that protein products indeed have only this difference.
def test_del_exons(self):
    """
    Deletion of two entire exons should be possible.

    Both the WOVERSPLICE warning and the IDELSPLICE notice are expected,
    and a protein prediction must still be produced.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.632-5_793+7del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertTrue(len(idelsplice) > 0)
    # Todo: For now, the following is how to check that a protein
    # prediction is done.
    self.assertTrue(self.output.getOutput('newprotein'))
def test_del_intron(self):
    """
    Deletion of an entire intron should be possible (fusion of remaining
    exonic parts).

    Both the WOVERSPLICE warning and the IDELSPLICE notice are expected,
    and a protein prediction must still be produced.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.622_674del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertTrue(len(idelsplice) > 0)
    # Todo: For now, the following is how to check that a protein
    # prediction is done.
    self.assertTrue(self.output.getOutput('newprotein'))
def test_del_intron_in_frame(self):
    """
    Deletion of an entire intron should be possible (fusion of remaining
    exonic parts), here with the deleted coding positions keeping the
    reading frame.

    NG_012772.1(BRCA2_v001):c.622_672del spans 672-622+1 = 51 coding
    positions, and 51/3 = 17.
    """
    Mutalyzer.process('NG_012772.1(BRCA2_v001):c.622_672del',
                      self.config, self.output)
    woversplice = self.output.getMessagesWithErrorCode('WOVERSPLICE')
    self.assertTrue(len(woversplice) > 0)
    idelsplice = self.output.getMessagesWithErrorCode('IDELSPLICE')
    self.assertTrue(len(idelsplice) > 0)
    # Todo: For now, the following is how to check that a protein
    # prediction is done.
    self.assertTrue(self.output.getOutput('newprotein'))
    # Todo: assert that protein products indeed have only this difference.
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -668,6 +668,30 @@ class TestMutator(unittest.TestCase): ...@@ -668,6 +668,30 @@ class TestMutator(unittest.TestCase):
m.insM(18, 'AT') # g.18_19insAT m.insM(18, 'AT') # g.18_19insAT
self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 29]) self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 29])
def test_newSplice_removed_sites(self):
    """
    After removing splice sites, newSplice() should filter them.

    Removed sites accumulate over successive add_removed_sites() calls,
    and the remaining sites are still shifted by later mutations.
    """
    length = 40
    sites = [4, 9, 14, 19, 25, 27, 32, 38]
    m = self._mutator(_seq(length))

    # Drop one pair of sites; the other six must be reported unchanged.
    m.add_removed_sites([19, 25])
    self.assertEqual(m.newSplice(sites), [4, 9, 14, 27, 32, 38])

    # A second call adds to the sites that were removed before.
    m.add_removed_sites([27, 32])
    self.assertEqual(m.newSplice(sites), [4, 9, 14, 38])

    # An insertion only moves the remaining sites that lie after it.
    m.insM(13, 'A')   # g.13_14insA
    self.assertEqual(m.newSplice(sites), [4, 9, 14, 39])
def test_sites_even_invariant(self):
"""
The number of splice sites should always be even. Modifying the list
of splice sites must always prevent the result of an odd number of
splice sites.
Todo: this test.
"""
pass
if __name__ == '__main__': if __name__ == '__main__':
# Usage: # Usage:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment