Commit 6f5c69bf authored by Vermaat

Correctly handle reference file encodings
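In outline: the downloaded reference data is kept as a byte string, its encoding is detected with cchardet, and anything that is not already UTF-8 is re-encoded before being bz2-compressed and written in binary mode. A minimal sketch of that flow, based on the diff below (the helper name `to_utf8_bz2` is illustrative, not part of this commit):

    import bz2
    import cchardet as chardet

    def to_utf8_bz2(raw_data):
        # raw_data is the downloaded reference file as a byte string.
        result = chardet.detect(raw_data)
        encoding = result['encoding'] if result['confidence'] > 0.5 else 'utf-8'
        if encoding.lower().replace('-', '_') not in ('utf_8', 'u8', 'utf', 'utf8'):
            # Re-encode anything that is not already UTF-8.
            raw_data = raw_data.decode(encoding).encode('utf-8')
        comp = bz2.BZ2Compressor()
        return comp.compress(raw_data) + comp.flush()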

parent 8acb0970
@@ -12,14 +12,13 @@ Public classes:
from __future__ import unicode_literals
import codecs
import io
import os # path.isfile(), link(), path.isdir(), path.mkdir(),
# walk(), path.getsize(), path.join(), stat(), remove()
import time
import bz2 # BZ2Compressor(), BZ2File()
import hashlib # md5(), update(), hexdigest()
import urllib2 # urlopen()
import StringIO # StringIO()
from Bio import SeqIO # read()
from Bio import Entrez # efetch(), read(), esearch(), esummary()
from Bio.Seq import UnknownSeq
@@ -28,6 +27,7 @@ from xml.dom import DOMException, minidom
from xml.parsers import expat
from httplib import HTTPException, IncompleteRead
from sqlalchemy.orm.exc import NoResultFound
import cchardet as chardet
from mutalyzer import util
from mutalyzer.config import settings
@@ -100,27 +100,33 @@ class Retriever(object) :
Write raw data to a compressed file.
@arg raw_data: The raw_data to be compressed and written
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the outfile
@type filename: unicode
@return: outfile ; The full path and name of the file written
@rtype: unicode
"""
# Todo: Should we write a utf-8 encoded genbank file? Not even sure
# what type `raw_data` is...
result = chardet.detect(raw_data)
if result['confidence'] > 0.5:
encoding = result['encoding']
else:
encoding = 'utf-8'
if not util.is_utf8_alias(encoding):
raw_data = raw_data.decode(encoding).encode('utf-8')
# Compress the data to save disk space.
comp = bz2.BZ2Compressor()
data = comp.compress(raw_data)
data += comp.flush()
out_handle = open(self._nametofile(filename), "w")
out_handle = open(self._nametofile(filename), "wb")
out_handle.write(data)
out_handle.close()
return out_handle.name # return the full path to the file
#_write
# Todo: check callers; argument should be a byte string
def _calcHash(self, content) :
"""
Calculate the md5sum of a piece of text.
@@ -241,7 +247,7 @@ class Retriever(object) :
'IncompleteRead: %s' % unicode(e))
return []
if response_text == '\n':
if response_text.strip() == b'':
# This is apparently what dbSNP returns for non-existing dbSNP id
self._output.addMessage(__file__, 4, 'EENTREZ',
'ID rs%s could not be found in dbSNP.' \
@@ -259,14 +265,14 @@ class Retriever(object) :
self._output.addMessage(__file__, -1, 'INFO',
'ExpatError: %s' % unicode(e))
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % response_text)
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
return []
except IndexError:
# The expected root element is not present.
self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
'error. Result XML was not as expected.')
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % response_text)
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
return []
snps = []
@@ -292,7 +298,6 @@ class GenBankRetriever(Retriever):
# Child specific init
#__init__
# todo: raw_data must always be a byte string
def write(self, raw_data, filename, extract) :
"""
Write raw data to a file. The data is parsed before writing, if a
@@ -305,7 +310,7 @@ class GenBankRetriever(Retriever):
database).
@arg raw_data: The data
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the file.
@type filename: unicode
@arg extract: Flag that indicates whether to extract the record ID and
@@ -320,26 +325,24 @@ class GenBankRetriever(Retriever):
@rtype: tuple (unicode, unicode)
"""
if raw_data == "\nNothing has been found\n" :
if raw_data.strip() == b'Nothing has been found':
self._output.addMessage(__file__, 4, "ENORECORD",
"The record could not be retrieved.")
return None
#if
fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a
fakehandle.write(raw_data) # file handle.
fakehandle = io.BytesIO() # Unfortunately, BioPython needs a
fakehandle.write(raw_data) # file handle.
fakehandle.seek(0)
try :
record = SeqIO.read(fakehandle, "genbank")
except (ValueError, AttributeError): # An error occurred while parsing.
self._output.addMessage(__file__, 4, "ENOPARSE",
"The file could not be parsed.")
fakehandle.close()
return None
#except
if type(record.seq) == UnknownSeq :
fakehandle.close()
self._output.addMessage(__file__, 4, "ENOSEQ",
"This record contains no sequence. Chromosomal or contig " \
"records should be uploaded with the GenBank uploader.")
@@ -349,12 +352,12 @@ class GenBankRetriever(Retriever):
outfile = filename
GI = None
if extract :
outfile = record.id
GI = record.annotations["gi"]
outfile = unicode(record.id)
GI = unicode(record.annotations["gi"])
if outfile != filename :
# Add the reference (incl version) to the reference output
# This differs if the original reference lacks a version
self._output.addOutput("reference", record.id)
self._output.addOutput("reference", unicode(record.id))
self._output.addOutput(
"BatchFlags", ("A1",(
filename,
@@ -362,9 +365,8 @@ class GenBankRetriever(Retriever):
filename+"." )))
self._output.addMessage(__file__, 2, "WNOVER",
"No version number is given, using %s. Please use this " \
"number to reduce downloading overhead." % record.id)
"number to reduce downloading overhead." % unicode(record.id))
#if
fakehandle.close()
self._write(raw_data, outfile)
@@ -390,7 +392,7 @@ class GenBankRetriever(Retriever):
'Could not retrieve %s.' % name)
return None
if raw_data == '\n' : # Check if the file is empty or not.
if raw_data.strip() == b'': # Check if the file is empty or not.
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
return None
@@ -398,10 +400,10 @@ class GenBankRetriever(Retriever):
# This is a hack to detect constructed references, the proper way to
# do this would be to check the data_file_division attribute of the
# parsed GenBank file (it would be 'CON').
if '\nCONTIG' in raw_data:
if b'\nCONTIG' in raw_data:
try:
# Get the length in base pairs
length = int(raw_data[:raw_data.index(' bp', 0, 500)].split()[-1])
length = int(raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1])
except (ValueError, IndexError):
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
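The comment above notes that the proper check for constructed references would use the data_file_division of the parsed record rather than scanning the raw bytes for a CONTIG line. A rough sketch of that alternative, assuming Biopython stores the division in record.annotations (the helper is illustrative, not part of this commit):

    from Bio import SeqIO

    def is_constructed(handle):
        # 'CON' is the GenBank division used for constructed (contig) records.
        record = SeqIO.read(handle, 'genbank')
        return record.annotations.get('data_file_division') == 'CON'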
@@ -583,24 +585,24 @@ class GenBankRetriever(Retriever):
'Could not get mapping information for gene %s.' % gene)
return None
if summary[0]["NomenclatureSymbol"].lower() == gene.lower() : # Found it.
if unicode(summary[0]["NomenclatureSymbol"]).lower() == gene.lower() : # Found it.
if not summary[0]["GenomicInfo"] :
self._output.addMessage(__file__, 4, "ENOMAPPING",
"No mapping information found for gene %s." % gene)
return None
#if
ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"]
ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"]
ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"]
ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"]
break;
ChrAccVer = unicode(summary[0]["GenomicInfo"][0]["ChrAccVer"])
ChrLoc = unicode(summary[0]["GenomicInfo"][0]["ChrLoc"])
ChrStart = unicode(summary[0]["GenomicInfo"][0]["ChrStart"])
ChrStop = unicode(summary[0]["GenomicInfo"][0]["ChrStop"])
break
#if
# Collect official symbols that have this gene as an alias, in case we
# cannot find anything.
if gene in summary[0]["OtherAliases"] and \
if gene in [unicode(a) for a in summary[0]["OtherAliases"]] and \
summary[0]["NomenclatureSymbol"] :
aliases.append(summary[0]["NomenclatureSymbol"]);
aliases.append(unicode(summary[0]["NomenclatureSymbol"]))
#for
if not ChrAccVer : # We did not find any genes.
@@ -643,6 +645,13 @@ class GenBankRetriever(Retriever):
@return: UD or None
@rtype: unicode
"""
if not (url.startswith('http://') or
url.startswith('https://') or
url.startswith('ftp://')):
self._output.addMessage(__file__, 4, "ERECPARSE",
"Only HTTP(S) or FTP locations are allowed.")
return None
handle = urllib2.urlopen(url)
info = handle.info()
if info["Content-Type"] == "text/plain" :
@@ -688,7 +697,7 @@ class GenBankRetriever(Retriever):
If the downloaded file is recognised by its hash, the old UD number
is used.
@arg raw_data: A GenBank record
@arg raw_data: A GenBank record.
@type raw_data: byte string
@return: Accession number for the uploaded file.
@@ -857,7 +866,6 @@ class LRGRetriever(Retriever):
# Now we have the file, so we can parse it.
file_handle = bz2.BZ2File(filename, "r")
file_handle = codecs.getreader('utf-8')(file_handle)
#create GenRecord.Record from LRG file
record = lrg.create_record(file_handle.read())
@@ -978,7 +986,7 @@ class LRGRetriever(Retriever):
if a parse error occurs, None is returned.
@arg raw_data: The data
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the file
@type filename: unicode
......
@@ -112,7 +112,7 @@ def create_record(data):
Create a GenRecord.Record from an LRG <xml> formatted string.
@arg data: Content of LRG file
@type data: string
@type data: byte string
@return: GenRecord.Record instance
@rtype: object
......
@@ -1058,6 +1058,12 @@ class MutalyzerService(ServiceBase):
output.addMessage(__file__, -1, 'INFO',
'Received request uploadGenBankLocalFile()')
# The Python type for `data` should be a sequence of `str` objects,
# but it seems we sometimes just get one `str` object. Perhaps only in
# the unit tests, but let's fix that anyway.
if isinstance(data, str):
data = [data]
# Note that the max file size check below might be bogus, since Spyne
# first checks the total request size, which by default has a maximum
# of 2 megabytes.
......
@@ -93,6 +93,14 @@ def reverse_complement(sequence):
return ''.join(reversed(sequence.translate(table)))
def is_utf8_alias(encoding):
"""
Returns `True` if the given encoding is recognized as UTF-8.
"""
aliases = ('utf_8', 'u8', 'utf', 'utf8')
return encoding.lower().replace('-', '_') in aliases
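A few illustrative calls (not part of the diff) showing what the new helper treats as UTF-8:

    # is_utf8_alias('UTF-8')    -> True
    # is_utf8_alias('utf8')     -> True
    # is_utf8_alias('u8')       -> True
    # is_utf8_alias('latin-1')  -> False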
def grouper(iterable, n=2, fillvalue=None):
"""
Make an iterator that takes {n} elements at a time from {iterable}, using
......