From 1dae4cf50c86bcd54330af62f39d3344f0eaaf48 Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Wed, 7 Dec 2011 21:52:25 +0000
Subject: [PATCH] Complete fix r431 and add a unit test

git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@433 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
---
 mutalyzer/parsers/genbank.py  | 92 +++++++++++++++++++++++------------
 tests/test_parsers_genbank.py | 36 ++++++++++++++
 2 files changed, 97 insertions(+), 31 deletions(-)
 create mode 100644 tests/test_parsers_genbank.py

diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py
index 69e3fd3d..e4728d45 100644
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -5,6 +5,8 @@ mutalyzer GenRecord. Record populated with data from a GenBank file.
 
 
 import bz2
+from itertools import izip_longest
+
 from Bio import SeqIO, Entrez
 from Bio.Alphabet import ProteinAlphabet
 
@@ -161,38 +163,63 @@ class GBparser():
         return proteinAcc
     #__transcriptToProtein
 
-    def __findMismatch(self, productList, direction):
+    def _find_mismatch(self, sentences):
         """
-        Find the index of the first or last word that distinguishes one
-        sentence from an other.
+        Find the indices of the first and last words that distinguishes one
+        sentence from another. The index of the last word is counted backwards.
 
-        If direction equals 1, search for the first word.
-        If direction equals -1, search for the last word.
+        @arg sentences: A list of sentences.
+        @type sentences: list of strings
 
-        @arg productList: A list of sentences
-        @type productList: list of strings
-        @arg direction: The direction in which to search
-        @type direction: integer (1 or -1)
+        @return: The indices of the words where sentences start to differ,
+            both are -1 when no mismatches are found.
+        @rtype: tuple(int, int)
 
-        @return: The index of the word where sentences start to differ
-        @rtype: integer
+        Example usage:
+
+            >>> parser._find_mismatch(['a b c d e' , 'a B c d e', 'a b C d e'])
+            (1, 2)
+            >>> parser._find_mismatch(['a b c d e' , 'a b c d e', 'a b C D e'])
+            (2, 1)
+            >>> parser._find_mismatch(['a b c' , 'a b c', 'a b c'])
+            (-1, -1)
+
+        Note: The result can be used to slice the mismatching part from the
+            sentences where you take the negative value of the second returned
+            index. For the second example above:
+
+                >>> 'a b c d e'.split()[1:-2]
+                ['b', 'c']
+
+            But be careful since the second index may be 0, but slicing syntax
+            does not permit taking the -0 index from the right:
+
+                >>> 'a b c d e'.split()[2:0] == ['c', 'd', 'e']
+                False
+
+            Although less elegant, first check the second index for 0 and in
+            that case leave it out:
+
+                >>> 'a b c d e'.split()[2:] == ['c', 'd', 'e']
+                True
+
+            The case where no mismatch is found just works, since slicing with
+            [-1:1] yields the empty list.
         """
+        # Create lists of words
+        lists = map(str.split, sentences)
 
-        i = 0
-        while i < productList[0].count(' ') + 1 :
-            for j in range(1, len(productList)) :
-                if i <= productList[j].count(' ') :
-                    if productList[0][::direction].split(' ')[i] != \
-                       productList[j][::direction].split(' ')[i] :
-                        if direction == 1 :
-                            return i
-                        else :
-                            return productList[0].count(' ') - i + 1
-                    #if
-            i += 1
-        #while
-        return 0
-    #__findMismatch
+        try:
+            forward, reverse = [next(i for i, v in
+                                     enumerate(izip_longest(*lists))
+                                     if not len(set(v)) <= 1)
+                                for lists in (lists, map(reversed, lists))]
+        except StopIteration:
+            # No mismatch found
+            forward = reverse = -1
+
+        return forward, reverse
+    #_find_mismatch
 
     def __tagByDict(self, locus, key):
         """
@@ -256,13 +283,16 @@ class GBparser():
 
         if productList :
             # Find the defining words in the product list.
-            a = self.__findMismatch(productList, 1)
-            b = self.__findMismatch(productList, -1)
+            a, b = self._find_mismatch(productList)
 
             # Add the defining words to the locus.
-            for i in range(len(locusList)) :
-                locusList[i].productTag = \
-                    ' '.join(productList[i].split(' ')[a:b])
+            for i in range(len(locusList)):
+                if b == 0:
+                    locusList[i].productTag = \
+                        ' '.join(productList[i].split()[a:])
+                else:
+                    locusList[i].productTag = \
+                        ' '.join(productList[i].split()[a:-b])
         #if
     #__tagLocus
 
diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py
new file mode 100644
index 00000000..c7da5791
--- /dev/null
+++ b/tests/test_parsers_genbank.py
@@ -0,0 +1,36 @@
+"""
+Tests for the mutalyzer.parsers.genbank module.
+"""
+
+
+#import logging; logging.basicConfig()
+from nose.tools import *
+
+from mutalyzer.parsers import genbank
+
+
+class TestMutator():
+    """
+    Test the mutator module.
+    """
+    def setUp(self):
+        """
+        Initialize test mutator module.
+        """
+        self.gb_parser = genbank.GBparser()
+
+    def test_product_lists_mismatch(self):
+        """
+        Test finding mismatches in some product lists.
+        """
+        tests = [(['a b c d e', 'a b C D e', 'a b c d e'], (2, 1)),
+                 (['a b c d e', 'a b C d e', 'a B c d e'], (1, 2)),
+                 (['a c d a', 'a b a', 'a a', 'a'], (1, 1)),
+                 ([''], (-1, -1)),
+                 (['', ''], (-1, -1)),
+                 (['a', 'a'], (-1, -1)),
+                 (['a', 'b'], (0, 0)),
+                 (['a b c', 'a b c'], (-1, -1)),
+                 (['a b c d a b', 'a b'], (2, 2))]
+        for test in tests:
+            assert_equal(self.gb_parser._find_mismatch(test[0]), test[1])
-- 
GitLab