Skip to content
Snippets Groups Projects
Commit 7f009cbd authored by Vermaat
Browse files

Merge pull request #53 from mutalyzer/no-trust-chardet

Don't trust encoding auto-detection when decoding
parents 788159f9 57ec1d74
No related branches found
No related tags found
No related merge requests found
@@ -143,7 +143,7 @@ class File() :
 handle.seek(0)
 if result['confidence'] > 0.5:
-encoding = result['encoding']
+encoding = unicode(result['encoding'])
 else:
 encoding = 'utf-8'
@@ -170,7 +170,13 @@ class File() :
 handle = _UniversalNewlinesByteStreamIter(handle, encoding=encoding,
 buffer_size=BUFFER_SIZE)
-buf = handle.read(BUFFER_SIZE)
+try:
+buf = handle.read(BUFFER_SIZE)
+except UnicodeDecodeError:
+self.__output.addMessage(__file__, 3, 'EBPARSE',
+'Could not decode file (using %s encoding).'
+% encoding)
+return None
 # Default dialect
 dialect = 'excel'
@@ -196,8 +202,14 @@ class File() :
 reader = csv.reader(handle, dialect)
 ret = []
-for i in reader:
-ret.append([c.decode('utf-8') for c in i])
+try:
+for i in reader:
+ret.append([c.decode('utf-8') for c in i])
+except UnicodeDecodeError:
+self.__output.addMessage(__file__, 3, 'EBPARSE',
+'Could not decode file (using %s encoding).'
+% encoding)
+return None
 return ret
 #__parseCsvFile
......
@@ -109,12 +109,18 @@ class Retriever(object) :
 """
 result = chardet.detect(raw_data)
 if result['confidence'] > 0.5:
-encoding = result['encoding']
+encoding = unicode(result['encoding'])
 else:
 encoding = 'utf-8'
 if not util.is_utf8_alias(encoding):
-raw_data = raw_data.decode(encoding).encode('utf-8')
+try:
+raw_data = raw_data.decode(encoding).encode('utf-8')
+except UnicodeDecodeError:
+self._output.addMessage(__file__, 4, 'ENOPARSE',
+'Could not decode file (using %s encoding).'
+% encoding)
+return None
 # Compress the data to save disk space.
 comp = bz2.BZ2Compressor()
@@ -368,7 +374,8 @@ class GenBankRetriever(Retriever):
 "number to reduce downloading overhead." % unicode(record.id))
 #if
-self._write(raw_data, outfile)
+if not self._write(raw_data, outfile):
+return None
 return outfile, GI
 #write
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment