From 57ec1d749cf75866f3b4da0864c436becd0e31b6 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Fri, 3 Jul 2015 15:13:33 +0200 Subject: [PATCH] Don't trust encoding auto-detection when decoding --- mutalyzer/File.py | 18 +++++++++++++++--- mutalyzer/Retriever.py | 11 +++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/mutalyzer/File.py b/mutalyzer/File.py index add6e5d4..772d2002 100644 --- a/mutalyzer/File.py +++ b/mutalyzer/File.py @@ -170,7 +170,13 @@ class File() : handle = _UniversalNewlinesByteStreamIter(handle, encoding=encoding, buffer_size=BUFFER_SIZE) - buf = handle.read(BUFFER_SIZE) + try: + buf = handle.read(BUFFER_SIZE) + except UnicodeDecodeError: + self.__output.addMessage(__file__, 3, 'EBPARSE', + 'Could not decode file (using %s encoding).' + % encoding) + return None # Default dialect dialect = 'excel' @@ -196,8 +202,14 @@ class File() : reader = csv.reader(handle, dialect) ret = [] - for i in reader: - ret.append([c.decode('utf-8') for c in i]) + try: + for i in reader: + ret.append([c.decode('utf-8') for c in i]) + except UnicodeDecodeError: + self.__output.addMessage(__file__, 3, 'EBPARSE', + 'Could not decode file (using %s encoding).' + % encoding) + return None return ret #__parseCsvFile diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 4cfaf642..e514ab96 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -114,7 +114,13 @@ class Retriever(object) : encoding = 'utf-8' if not util.is_utf8_alias(encoding): - raw_data = raw_data.decode(encoding).encode('utf-8') + try: + raw_data = raw_data.decode(encoding).encode('utf-8') + except UnicodeDecodeError: + self._output.addMessage(__file__, 4, 'ENOPARSE', + 'Could not decode file (using %s encoding).' + % encoding) + return None # Compress the data to save disk space. comp = bz2.BZ2Compressor() @@ -368,7 +374,8 @@ class GenBankRetriever(Retriever): "number to reduce downloading overhead." % unicode(record.id)) #if - self._write(raw_data, outfile) + if not self._write(raw_data, outfile): + return None return outfile, GI #write -- GitLab