Use chardet instead of cchardet

Issue #50 showed a problem in our file encoding detection, caused by our cut-off for the confidence as reported by the cchardet [1] library:

    >>> import cchardet
    >>> s = u'NM_000052.4:c.2407\u20132A>G'
    >>> b = s.encode('WINDOWS-1252')
    >>> cchardet.detect(b)
    {'confidence': 0.5, 'encoding': u'WINDOWS-1252'}

We require a confidence strictly greater than 0.5 and default to UTF-8 otherwise.

If, however, we try the same thing using the chardet [2] library, we get a higher confidence for the same string:

    >>> import chardet
    >>> chardet.detect(b)
    {'confidence': 0.73, 'encoding': 'windows-1252'}

So the two obvious ways to solve this are:

1. Lower the confidence threshold.
2. Use chardet instead of cchardet.

We implement the second solution here, since it also removes a C library dependency and we are not worried by performance.

Of course the detected encoding remains a guess which can still be wrong!

[1] https://github.com/PyYoshi/cChardet
[2] https://github.com/chardet/chardet

Fixes #50

Use chardet instead of cchardet
dedad241 · Vermaat · e0490337 · dedad241 · dedad241 · dedad241
Commit dedad241 authored 9 years ago by Vermaat
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -23,8 +23,7 @@ class Mock(MagicMock):
    def __getattr__(cls, name):
        return Mock()

-MOCK_MODULES = ['MySQLdb', 'cchardet', 'lxml', 'lxml.builder', 'lxml.etree',
-                'magic']
+MOCK_MODULES = ['MySQLdb', 'lxml', 'lxml.builder', 'lxml.etree', 'magic']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

 # If extensions (or modules to document with autodoc) are in another directory,

--- a/mutalyzer/File.py
+++ b/mutalyzer/File.py
@@ -25,7 +25,7 @@ import csv             # Sniffer(), reader(), Error
 import xlrd            # open_workbook()
 import zipfile         # ZipFile()
 import xml.dom.minidom # parseString()
-import cchardet as chardet
+import chardet

 from mutalyzer.config import settings


--- a/mutalyzer/Retriever.py
+++ b/mutalyzer/Retriever.py
@@ -27,7 +27,7 @@ from xml.dom import DOMException, minidom
 from xml.parsers import expat
 from httplib import HTTPException, IncompleteRead
 from sqlalchemy.orm.exc import NoResultFound
-import cchardet as chardet
+import chardet

 from mutalyzer import util
 from mutalyzer.config import settings

--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ Sphinx==1.2.3
 Werkzeug==0.9.6
 alembic==0.6.7
 biopython==1.64
-cchardet==0.3.5
+chardet==2.3.0
 cssselect==0.9.1
 lxml==3.4.0
 mock==1.0.1

--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -350,3 +350,21 @@ class TestScheduler(MutalyzerTest):
                    ['\u2026AL449423.14(CDKN2A_v002):c.5_400del',
                     '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)']]
        self._batch_job_plain_text(variants, expected, 'syntax-checker')
+
+    def test_windows_1252_input(self):
+        """
+        Simple input encoded as WINDOWS-1252.
+        """
+        variants = ['AB026906.1:c.274G>T',
+                    # Encoded as WINDOWS-1252, the following is not valid UTF8.
+                    'NM_000052.4:c.2407\u20132A>G',
+                    'AL449423.14(CDKN2A_v002):c.5_400del']
+        batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('WINDOWS-1252'))
+        expected = [['AB026906.1:c.274G>T',
+                     'OK'],
+                    ['NM_000052.4:c.2407\u20132A>G',
+                     '(grammar): Expected W:(acgt...) (at char 18), (line:1, col:19)'],
+                    ['AL449423.14(CDKN2A_v002):c.5_400del',
+                     'OK']]
+
+        self._batch_job(batch_file, expected, 'syntax-checker')