From dedad241ad2722d7dc66db7829c356fc2697d48d Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Fri, 3 Jul 2015 13:48:47 +0200 Subject: [PATCH] Use chardet instead of cchardet Issue #50 showed a problem in our file encoding detection, caused by our cut-off for the confidence as reported by the cchardet [1] library: >>> import cchardet >>> s = u'NM_000052.4:c.2407\u20132A>G' >>> b = s.encode('WINDOWS-1252') >>> cchardet.detect(b) {'confidence': 0.5, 'encoding': u'WINDOWS-1252'} We require a confidence strictly greater than 0.5 and default to UTF8 otherwise. If, however, we try the same thing using the chardet [2] library, we get a higher confidence for the same string: >>> import chardet >>> chardet.detect(b) {'confidence': 0.73, 'encoding': 'windows-1252'} So the two obvious ways to solve this are: 1. Lower the confidence threshold. 2. Use chardet instead of cchardet. We implement the second solution here, since it also removes a C library dependency and we are not worried about performance. Of course the detected encoding remains a guess which can still be wrong! 
[1] https://github.com/PyYoshi/cChardet [2] https://github.com/chardet/chardet Fixes #50 --- doc/conf.py | 3 +-- mutalyzer/File.py | 2 +- mutalyzer/Retriever.py | 2 +- requirements.txt | 2 +- tests/test_scheduler.py | 18 ++++++++++++++++++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 424e8512..322ffec5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -23,8 +23,7 @@ class Mock(MagicMock): def __getattr__(cls, name): return Mock() -MOCK_MODULES = ['MySQLdb', 'cchardet', 'lxml', 'lxml.builder', 'lxml.etree', - 'magic'] +MOCK_MODULES = ['MySQLdb', 'lxml', 'lxml.builder', 'lxml.etree', 'magic'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) # If extensions (or modules to document with autodoc) are in another directory, diff --git a/mutalyzer/File.py b/mutalyzer/File.py index 5851e7a5..90641c3c 100644 --- a/mutalyzer/File.py +++ b/mutalyzer/File.py @@ -25,7 +25,7 @@ import csv # Sniffer(), reader(), Error import xlrd # open_workbook() import zipfile # ZipFile() import xml.dom.minidom # parseString() -import cchardet as chardet +import chardet from mutalyzer.config import settings diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index cc6f9193..286caf3e 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -27,7 +27,7 @@ from xml.dom import DOMException, minidom from xml.parsers import expat from httplib import HTTPException, IncompleteRead from sqlalchemy.orm.exc import NoResultFound -import cchardet as chardet +import chardet from mutalyzer import util from mutalyzer.config import settings diff --git a/requirements.txt b/requirements.txt index add9c97c..413abda0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ Sphinx==1.2.3 Werkzeug==0.9.6 alembic==0.6.7 biopython==1.64 -cchardet==0.3.5 +chardet==2.3.0 cssselect==0.9.1 lxml==3.4.0 mock==1.0.1 diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index c118bfd7..7dd3a8c2 100644 --- 
a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -350,3 +350,21 @@ class TestScheduler(MutalyzerTest): ['\u2026AL449423.14(CDKN2A_v002):c.5_400del', '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)']] self._batch_job_plain_text(variants, expected, 'syntax-checker') + + def test_windows_1252_input(self): + """ + Simple input encoded as WINDOWS-1252. + """ + variants = ['AB026906.1:c.274G>T', + # Encoded as WINDOWS-1252, the following is not valid UTF8. + 'NM_000052.4:c.2407\u20132A>G', + 'AL449423.14(CDKN2A_v002):c.5_400del'] + batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('WINDOWS-1252')) + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['NM_000052.4:c.2407\u20132A>G', + '(grammar): Expected W:(acgt...) (at char 18), (line:1, col:19)'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + self._batch_job(batch_file, expected, 'syntax-checker') -- GitLab