From a6434a924bd52a69a6d99e507e0bb01bc7747de1 Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Tue, 16 Aug 2011 10:56:39 +0000
Subject: [PATCH] Fix: don't remove flanking splice sites on deletion of exons
 on a transcript reference.

git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/refactor-mutalyzer-branch@322 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
---
 README                                        |  2 +-
 mutalyzer/variantchecker.py                   | 29 ++++++-
 ...st_mutalyzer.py => test_variantchecker.py} | 79 ++++++++++++++-----
 tests/test_website.py                         |  4 +-
 4 files changed, 90 insertions(+), 24 deletions(-)
 rename tests/{test_mutalyzer.py => test_variantchecker.py} (78%)

diff --git a/README b/README
index 616fbac1..0774f980 100644
--- a/README
+++ b/README
@@ -73,7 +73,7 @@ Todo list:
 - Check for os.path.join vulnerabilities.
 - Use web.config.debug=False on production server and perhaps put this in
   the configuration file.
-- Add database indices to extras/post-install.sh script.
+- Solution for database schema migration on version updates.
 
 Code style guide:
 - Follow PEP 8 (code) and PEP 257 (docstrings).
diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py
index 99896625..f717c3ca 100644
--- a/mutalyzer/variantchecker.py
+++ b/mutalyzer/variantchecker.py
@@ -978,9 +978,32 @@ def process_raw_variant(mutator, variant, record, transcript, output):
     if transcript and variant.MutationType == 'del':
         removed_sites = []
         for acceptor, donor in util.grouper(transcript.CM.RNA):
-            if first <= acceptor <= last + 1:
+
+            # If we have introns, we match splice sites in a fuzzy way. This
+            # means that in the case of
+            #
+            #               a            b
+            #     ===========------------=============
+            #
+            # with splice sites a and b, a deletion a+1_b-1 of the entire
+            # intron gets treated as a deletion of both splice sites.
+            #
+            # We don't want this behaviour on e.g. RNA, where we only have
+            # exons. In the case of
+            #
+            #              a b           c d
+            #     ========== ============= ===========
+            #
+            # with splice sites a b c d, a deletion b_c of the middle exon
+            # should only remove splice sites b and c, not a and d.
+            if record.record.molType == 'g':
+                fuzzy = 1
+            else:
+                fuzzy = 0
+
+            if first <= acceptor <= last + fuzzy:
                 removed_sites.append(acceptor)
-            if first - 1 <= donor <= last:
+            if first - fuzzy <= donor <= last:
                 removed_sites.append(donor)
 
         if len(removed_sites) and not len(removed_sites) % 2:
@@ -1004,6 +1027,8 @@ def process_raw_variant(mutator, variant, record, transcript, output):
                 output.addMessage(__file__, 1, 'IDELSPLICE',
                                   'Removed %i splice sites from selected ' \
                                   'transcript.' % len(removed_sites))
+                # This is primarily for use in unit tests.
+                output.addOutput('removedSpliceSites', len(removed_sites))
 
     # If splice_abort is set, this basically means WOVERSPLICE was called and
     # IDELSPLICE was not called.
diff --git a/tests/test_mutalyzer.py b/tests/test_variantchecker.py
similarity index 78%
rename from tests/test_mutalyzer.py
rename to tests/test_variantchecker.py
index ddbaf0c3..1fba607f 100644
--- a/tests/test_mutalyzer.py
+++ b/tests/test_variantchecker.py
@@ -1,5 +1,5 @@
 """
-Tests for the Mutalyzer module.
+Tests for the variantchecker module.
 """
 
 
@@ -16,14 +16,13 @@ from mutalyzer.output import Output
 from mutalyzer.variantchecker import check_variant
 
 
-class TestMutalyzer():
+class TestVariantchecker():
     """
-    Test the Mutalyzer module.
+    Test the variantchecker module.
     """
-
     def setUp(self):
         """
-        Initialize test Mutalyzer module.
+        Initialize test variantchecker module.
         """
         self.config = Config()
         self.output = Output(__file__, self.config.Output)
@@ -42,7 +41,7 @@ class TestMutalyzer():
         """
         check_variant('NM_003002.2:c.274del', self.config, self.output)
         wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD')
-        assert len(wroll) == 0
+        assert_equal(len(wroll), 0)
 
     def test_no_roll_splice(self):
         """
@@ -52,7 +51,7 @@ class TestMutalyzer():
         wrollback = self.output.getMessagesWithErrorCode('IROLLBACK')
         assert len(wrollback) > 0
         wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD')
-        assert len(wroll) == 0
+        assert_equal(len(wroll), 0)
 
     def test_partial_roll_splice(self):
         """
@@ -104,7 +103,7 @@ class TestMutalyzer():
         check_variant('AL449423.14:g.65470_65471insTAC', self.config, self.output)
         assert 'AL449423.14(CDKN2A_v001):c.99_100insTAG' in self.output.getOutput('descriptions')
         assert_equal ('AL449423.14:g.65471_65472insACT', self.output.getIndexedOutput('genomicDescription', 0, ''))
-        assert len(self.output.getMessagesWithErrorCode('WROLLFORWARD')) == 1
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 1)
 
     def test_roll_reverse_ins(self):
         """
@@ -114,7 +113,7 @@ class TestMutalyzer():
         check_variant('AL449423.14:g.65471_65472insACT', self.config, self.output)
         assert 'AL449423.14(CDKN2A_v001):c.99_100insTAG' in self.output.getOutput('descriptions')
         assert_equal ('AL449423.14:g.65471_65472insACT', self.output.getIndexedOutput('genomicDescription', 0, ''))
-        assert len(self.output.getMessagesWithErrorCode('WROLLFORWARD')) == 0
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 0)
 
     def test_roll_message_forward(self):
         """
@@ -122,8 +121,8 @@ class TestMutalyzer():
         strand (forward).
         """
         check_variant('AL449423.14:g.65470_65471insTAC', self.config, self.output)
-        assert len(self.output.getMessagesWithErrorCode('WROLLFORWARD')) == 1
-        assert len(self.output.getMessagesWithErrorCode('WROLLREVERSE')) == 0
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 1)
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLREVERSE')), 0)
 
     def test_roll_message_reverse(self):
         """
@@ -131,8 +130,8 @@ class TestMutalyzer():
         strand (reverse).
         """
         check_variant('AL449423.14(CDKN2A_v001):c.98_99insGTA', self.config, self.output)
-        assert len(self.output.getMessagesWithErrorCode('WROLLFORWARD')) == 0
-        assert len(self.output.getMessagesWithErrorCode('WROLLREVERSE')) == 1
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 0)
+        assert_equal(len(self.output.getMessagesWithErrorCode('WROLLREVERSE')), 1)
 
     def test_ins_cds_start(self):
         """
@@ -157,7 +156,7 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.632-5_670del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) == 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [])
         # Todo: For now, the following is how to check if no protein
         # prediction is done.
         assert not self.output.getOutput('newprotein')
@@ -169,7 +168,19 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.632-5_681+7del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
+        # Todo: For now, the following is how to check if protein
+        # prediction is done.
+        assert self.output.getOutput('newprotein')
+
+    def test_del_exon_exact(self):
+        """
+        Deletion of exactly an exon should be possible.
+        """
+        check_variant('NG_012772.1(BRCA2_v001):c.632_681del',
+                      self.config, self.output)
+        assert_equal(len(self.output.getMessagesWithErrorCode('WOVERSPLICE')), 0)
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
         # Todo: For now, the following is how to check if protein
         # prediction is done.
         assert self.output.getOutput('newprotein')
@@ -186,7 +197,7 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.68-7_316+7del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
         # Todo: For now, the following is how to check if protein
         # prediction is done.
         assert self.output.getOutput('newprotein')
@@ -199,7 +210,7 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.632-5_793+7del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [4])
         # Todo: For now, the following is how to check if protein
         # prediction is done.
         assert self.output.getOutput('newprotein')
@@ -212,11 +223,27 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.622_674del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
         # Todo: For now, the following is how to check if protein
         # prediction is done.
         assert self.output.getOutput('newprotein')
 
+    def test_del_intron_exact(self):
+        """
+        Deletion of exactly an intron should be possible (fusion of flanking
+        exons).
+        """
+        check_variant('NG_012772.1(BRCA2_v001):c.681+1_682-1del',
+                      self.config, self.output)
+        assert_equal(self.output.getMessagesWithErrorCode('WOVERSPLICE'), [])
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
+        # Note: The protein prediction is done, but 'newprotein' is not set
+        # because we have no change. So to check if the prediction is done, we
+        # check if 'oldprotein' is set and to check if the prediction is
+        # correct, we check if 'newprotein' is not set.
+        assert self.output.getOutput('oldprotein')
+        assert not self.output.getOutput('newprotein')
+
     def test_del_intron_in_frame(self):
         """
         Deletion of an entire intron should be possible (fusion of remaining
@@ -225,7 +252,7 @@ class TestMutalyzer():
         check_variant('NG_012772.1(BRCA2_v001):c.622_672del',
                       self.config, self.output)
         assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0
-        assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
         # Todo: For now, the following is how to check if protein
         # prediction is done.
         assert self.output.getOutput('newprotein')
@@ -296,3 +323,17 @@ class TestMutalyzer():
                in self.output.getOutput('descriptions')
         # Todo: .c notation should still be c.632-?_681+?del, but what about
         # other transcripts?
+
+    def test_del_exon_transcript_reference(self):
+        """
+        Deletion of entire exon on a transcript reference should remove the
+        expected splice sites (only those of the deleted exon), and not those
+        of the flanking exons (as would happen using the mechanism for genomic
+        references).
+        """
+        check_variant('NM_018723.3:c.758_890del', self.config, self.output)
+        assert_equal(len(self.output.getMessagesWithErrorCode('WOVERSPLICE')), 0)
+        assert_equal(self.output.getOutput('removedSpliceSites'), [2])
+        # Todo: For now, the following is how to check if protein
+        # prediction is done.
+        assert self.output.getOutput('newprotein')
diff --git a/tests/test_website.py b/tests/test_website.py
index e740edd4..de31f2c7 100644
--- a/tests/test_website.py
+++ b/tests/test_website.py
@@ -520,8 +520,8 @@ facilisi."""
         r.mustcontain('0 Errors',
                       '0 Warnings',
                       'Raw variant 1: substitution at 7055')
-        assert r.body.find('go to bottom') == -1
-        assert r.body.find('<input') == -1
+        assert_equal(r.body.find('go to bottom'), -1)
+        assert_equal(r.body.find('<input'), -1)
 
     def test_variantinfo_g2c(self):
         """
-- 
GitLab