From dedad241ad2722d7dc66db7829c356fc2697d48d Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Fri, 3 Jul 2015 13:48:47 +0200
Subject: [PATCH] Use chardet instead of cchardet

Issue #50 showed a problem in our file encoding detection, caused
by our cut-off for the confidence as reported by the cchardet [1]
library:

    >>> import cchardet
    >>> s = u'NM_000052.4:c.2407\u20132A>G'
    >>> b = s.encode('WINDOWS-1252')
    >>> cchardet.detect(b)
    {'confidence': 0.5, 'encoding': u'WINDOWS-1252'}

We require a confidence strictly greater than 0.5 and default to
UTF-8 otherwise.

If, however, we try the same thing using the chardet [2] library,
we get a higher confidence for the same string:

    >>> import chardet
    >>> chardet.detect(b)
    {'confidence': 0.73, 'encoding': 'windows-1252'}

So the two obvious ways to solve this are:

1. Lower the confidence threshold.
2. Use chardet instead of cchardet.

We implement the second solution here, since it also removes a C
library dependency and we are not worried about performance.

Of course the detected encoding remains a guess which can still
be wrong!

[1] https://github.com/PyYoshi/cChardet
[2] https://github.com/chardet/chardet

Fixes #50
---
 doc/conf.py             |  3 +--
 mutalyzer/File.py       |  2 +-
 mutalyzer/Retriever.py  |  2 +-
 requirements.txt        |  2 +-
 tests/test_scheduler.py | 18 ++++++++++++++++++
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index 424e8512..322ffec5 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -23,8 +23,7 @@ class Mock(MagicMock):
     def __getattr__(cls, name):
         return Mock()
 
-MOCK_MODULES = ['MySQLdb', 'cchardet', 'lxml', 'lxml.builder', 'lxml.etree',
-                'magic']
+MOCK_MODULES = ['MySQLdb', 'lxml', 'lxml.builder', 'lxml.etree', 'magic']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
 # If extensions (or modules to document with autodoc) are in another directory,
diff --git a/mutalyzer/File.py b/mutalyzer/File.py
index 5851e7a5..90641c3c 100644
--- a/mutalyzer/File.py
+++ b/mutalyzer/File.py
@@ -25,7 +25,7 @@ import csv             # Sniffer(), reader(), Error
 import xlrd            # open_workbook()
 import zipfile         # ZipFile()
 import xml.dom.minidom # parseString()
-import cchardet as chardet
+import chardet
 
 from mutalyzer.config import settings
 
diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py
index cc6f9193..286caf3e 100644
--- a/mutalyzer/Retriever.py
+++ b/mutalyzer/Retriever.py
@@ -27,7 +27,7 @@ from xml.dom import DOMException, minidom
 from xml.parsers import expat
 from httplib import HTTPException, IncompleteRead
 from sqlalchemy.orm.exc import NoResultFound
-import cchardet as chardet
+import chardet
 
 from mutalyzer import util
 from mutalyzer.config import settings
diff --git a/requirements.txt b/requirements.txt
index add9c97c..413abda0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ Sphinx==1.2.3
 Werkzeug==0.9.6
 alembic==0.6.7
 biopython==1.64
-cchardet==0.3.5
+chardet==2.3.0
 cssselect==0.9.1
 lxml==3.4.0
 mock==1.0.1
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index c118bfd7..7dd3a8c2 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -350,3 +350,21 @@ class TestScheduler(MutalyzerTest):
                     ['\u2026AL449423.14(CDKN2A_v002):c.5_400del',
                      '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)']]
         self._batch_job_plain_text(variants, expected, 'syntax-checker')
+
+    def test_windows_1252_input(self):
+        """
+        Simple input encoded as WINDOWS-1252.
+        """
+        variants = ['AB026906.1:c.274G>T',
+                    # Encoded as WINDOWS-1252, the following is not valid UTF8.
+                    'NM_000052.4:c.2407\u20132A>G',
+                    'AL449423.14(CDKN2A_v002):c.5_400del']
+        batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('WINDOWS-1252'))
+        expected = [['AB026906.1:c.274G>T',
+                     'OK'],
+                    ['NM_000052.4:c.2407\u20132A>G',
+                     '(grammar): Expected W:(acgt...) (at char 18), (line:1, col:19)'],
+                    ['AL449423.14(CDKN2A_v002):c.5_400del',
+                     'OK']]
+
+        self._batch_job(batch_file, expected, 'syntax-checker')
-- 
GitLab