diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index deb645ae68b238def0db684d81ea3b3b87078a16..ac09bafa3a735b7af6cd445633baeb47dacaf8a2 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -12,14 +12,13 @@ Public classes: from __future__ import unicode_literals -import codecs +import io import os # path.isfile(), link() path.isdir(), path.mkdir(), # walk(), path.getsize(), path.join(), stat(), remove() import time import bz2 # BZ2Compressor(), BZ2File() import hashlib # md5(), update(), hexdigest() import urllib2 # urlopen() -import StringIO # StringIO() from Bio import SeqIO # read() from Bio import Entrez # efetch(), read(), esearch(), esummary() from Bio.Seq import UnknownSeq @@ -28,6 +27,7 @@ from xml.dom import DOMException, minidom from xml.parsers import expat from httplib import HTTPException, IncompleteRead from sqlalchemy.orm.exc import NoResultFound +import cchardet as chardet from mutalyzer import util from mutalyzer.config import settings @@ -100,27 +100,33 @@ class Retriever(object) : Write raw data to a compressed file. @arg raw_data: The raw_data to be compressed and written - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the outfile @type filename: unicode @return: outfile ; The full path and name of the file written @rtype: unicode """ - # Todo: Should we write a utf-8 encoded genbank file? Not even sure - # what type `raw_data` is... + result = chardet.detect(raw_data) + if result['confidence'] > 0.5: + encoding = result['encoding'] + else: + encoding = 'utf-8' + + if not util.is_utf8_alias(encoding): + raw_data = raw_data.decode(encoding).encode('utf-8') + # Compress the data to save disk space. 
comp = bz2.BZ2Compressor() data = comp.compress(raw_data) data += comp.flush() - out_handle = open(self._nametofile(filename), "w") + out_handle = open(self._nametofile(filename), "wb") out_handle.write(data) out_handle.close() return out_handle.name # return the full path to the file #_write - # Todo: check callers; argument should be a byte string def _calcHash(self, content) : """ Calculate the md5sum of a piece of text. @@ -241,7 +247,7 @@ class Retriever(object) : 'IncompleteRead: %s' % unicode(e)) return [] - if response_text == '\n': + if response_text.strip() == b'\n': # This is apparently what dbSNP returns for non-existing dbSNP id self._output.addMessage(__file__, 4, 'EENTREZ', 'ID rs%s could not be found in dbSNP.' \ @@ -259,14 +265,14 @@ class Retriever(object) : self._output.addMessage(__file__, -1, 'INFO', 'ExpatError: %s' % unicode(e)) self._output.addMessage(__file__, -1, 'INFO', - 'Result from dbSNP: %s' % response_text) + 'Result from dbSNP: %s' % unicode(response_text, 'utf-8')) return [] except IndexError: # The expected root element is not present. self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ 'error. Result XML was not as expected.') self._output.addMessage(__file__, -1, 'INFO', - 'Result from dbSNP: %s' % response_text) + 'Result from dbSNP: %s' % unicode(response_text, 'utf-8')) return [] snps = [] @@ -292,7 +298,6 @@ class GenBankRetriever(Retriever): # Child specific init #__init__ - # todo: raw_data must always be a byte string def write(self, raw_data, filename, extract) : """ Write raw data to a file. The data is parsed before writing, if a @@ -305,7 +310,7 @@ class GenBankRetriever(Retriever): database). @arg raw_data: The data - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the file. 
@type filename: unicode @arg extract: Flag that indicates whether to extract the record ID and @@ -320,26 +325,24 @@ class GenBankRetriever(Retriever): @rtype: tuple (unicode, unicode) """ - if raw_data == "\nNothing has been found\n" : + if raw_data.strip() == b'Nothing has been found': self._output.addMessage(__file__, 4, "ENORECORD", "The record could not be retrieved.") return None #if - fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a - fakehandle.write(raw_data) # file handle. + fakehandle = io.BytesIO() # Unfortunately, BioPython needs a + fakehandle.write(raw_data) # file handle. fakehandle.seek(0) try : record = SeqIO.read(fakehandle, "genbank") except (ValueError, AttributeError): # An error occured while parsing. self._output.addMessage(__file__, 4, "ENOPARSE", "The file could not be parsed.") - fakehandle.close() return None #except if type(record.seq) == UnknownSeq : - fakehandle.close() self._output.addMessage(__file__, 4, "ENOSEQ", "This record contains no sequence. Chromosomal or contig " \ "records should be uploaded with the GenBank uploader.") @@ -349,12 +352,12 @@ class GenBankRetriever(Retriever): outfile = filename GI = None if extract : - outfile = record.id - GI = record.annotations["gi"] + outfile = unicode(record.id) + GI = unicode(record.annotations["gi"]) if outfile != filename : # Add the reference (incl version) to the reference output # This differs if the original reference lacks a version - self._output.addOutput("reference", record.id) + self._output.addOutput("reference", unicode(record.id)) self._output.addOutput( "BatchFlags", ("A1",( filename, @@ -362,9 +365,8 @@ class GenBankRetriever(Retriever): filename+"." ))) self._output.addMessage(__file__, 2, "WNOVER", "No version number is given, using %s. Please use this " \ - "number to reduce downloading overhead." % record.id) + "number to reduce downloading overhead." 
% unicode(record.id)) #if - fakehandle.close() self._write(raw_data, outfile) @@ -390,7 +392,7 @@ class GenBankRetriever(Retriever): 'Could not retrieve %s.' % name) return None - if raw_data == '\n' : # Check if the file is empty or not. + if raw_data.strip() == b'': # Check if the file is empty or not. self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -398,10 +400,10 @@ class GenBankRetriever(Retriever): # This is a hack to detect constructed references, the proper way to # do this would be to check the data_file_division attribute of the # parsed GenBank file (it would be 'CON'). - if '\nCONTIG' in raw_data: + if b'\nCONTIG' in raw_data: try: # Get the length in base pairs - length = int(raw_data[:raw_data.index(' bp', 0, 500)].split()[-1]) + length = int(raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1]) except ValueError, IndexError: self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) @@ -583,24 +585,24 @@ class GenBankRetriever(Retriever): 'Could not get mapping information for gene %s.' % gene) return None - if summary[0]["NomenclatureSymbol"].lower() == gene.lower() : # Found it. + if unicode(summary[0]["NomenclatureSymbol"]).lower() == gene.lower() : # Found it. if not summary[0]["GenomicInfo"] : self._output.addMessage(__file__, 4, "ENOMAPPING", "No mapping information found for gene %s." % gene) return None #if - ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"] - ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"] - ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"] - ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"] - break; + ChrAccVer = unicode(summary[0]["GenomicInfo"][0]["ChrAccVer"]) + ChrLoc = unicode(summary[0]["GenomicInfo"][0]["ChrLoc"]) + ChrStart = unicode(summary[0]["GenomicInfo"][0]["ChrStart"]) + ChrStop = unicode(summary[0]["GenomicInfo"][0]["ChrStop"]) + break #if # Collect official symbols that has this gene as alias in case we # can not find anything. 
- if gene in summary[0]["OtherAliases"] and \ + if gene in [unicode(a) for a in summary[0]["OtherAliases"]] and \ summary[0]["NomenclatureSymbol"] : - aliases.append(summary[0]["NomenclatureSymbol"]); + aliases.append(unicode(summary[0]["NomenclatureSymbol"])) #for if not ChrAccVer : # We did not find any genes. @@ -643,6 +645,13 @@ class GenBankRetriever(Retriever): @return: UD or None @rtype: unicode """ + if not (url.startswith('http://') or + url.startswith('https://') or + url.startswith('ftp://')): + self._output.addMessage(__file__, 4, "ERECPARSE", + "Only HTTP(S) or FTP locations are allowed.") + return None + handle = urllib2.urlopen(url) info = handle.info() if info["Content-Type"] == "text/plain" : @@ -688,7 +697,7 @@ class GenBankRetriever(Retriever): If the downloaded file is recognised by its hash, the old UD number is used. - @arg raw_data: A GenBank record + @arg raw_data: A GenBank record. @type raw_data: byte string @return: Accession number for the uploaded file. @@ -857,7 +866,6 @@ class LRGRetriever(Retriever): # Now we have the file, so we can parse it. file_handle = bz2.BZ2File(filename, "r") - file_handle = codecs.getreader('utf-8')(file_handle) #create GenRecord.Record from LRG file record = lrg.create_record(file_handle.read()) @@ -978,7 +986,7 @@ class LRGRetriever(Retriever): if a parse error occurs None is returned. @arg raw_data: The data - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the file @type filename: unicode diff --git a/mutalyzer/parsers/lrg.py b/mutalyzer/parsers/lrg.py index 0336d1062589508447edc2891c19fd261a21dbe3..b22b7ce69f3e7bb1b58e0e72783432fe4d11c4a3 100644 --- a/mutalyzer/parsers/lrg.py +++ b/mutalyzer/parsers/lrg.py @@ -112,7 +112,7 @@ def create_record(data): Create a GenRecord.Record of a LRG <xml> formatted string. 
def is_utf8_alias(encoding):
    """
    Returns `True` if the given encoding is recognized as UTF-8.

    The name is resolved through Python's codec registry, so every spelling
    the interpreter itself accepts for UTF-8 ('utf-8', 'UTF8', 'U8', 'utf',
    'utf_8', ...) is recognized, instead of relying on a hand-maintained
    list of aliases.

    Related but distinct codecs (e.g. 'utf-8-sig', 'utf-16'), unknown
    encoding names, and missing values (`None` or the empty string, which
    is what encoding detectors report when they cannot make a guess) all
    yield `False`.

    @arg encoding: Name of an encoding, or None.
    @type encoding: unicode

    @return: Whether the name denotes the plain UTF-8 codec.
    @rtype: bool
    """
    # Imported locally so this helper does not depend on the module-level
    # import block.
    import codecs

    # Encoding detectors return None when detection fails; treat that as
    # "not UTF-8" rather than failing on a missing string method.
    if not encoding:
        return False

    try:
        # The registry normalizes case and separators and maps all known
        # aliases to one canonical codec name.
        return codecs.lookup(encoding).name == 'utf-8'
    except LookupError:
        # Not a codec known to this interpreter, hence not UTF-8.
        return False