Commit 6f5c69bf authored by Vermaat

Correctly handle reference file encodings
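In outline: the downloaded reference data is kept as a byte string, its encoding is detected with cchardet, and anything that is not already UTF-8 is re-encoded before being bz2-compressed and written in binary mode. A minimal sketch of that flow, based on the diff below (the helper name `to_utf8_bz2` is illustrative, not part of this commit):

    import bz2
    import cchardet as chardet

    def to_utf8_bz2(raw_data):
        # raw_data is the downloaded reference file as a byte string.
        result = chardet.detect(raw_data)
        encoding = result['encoding'] if result['confidence'] > 0.5 else 'utf-8'
        if encoding.lower().replace('-', '_') not in ('utf_8', 'u8', 'utf', 'utf8'):
            # Re-encode anything that is not already UTF-8.
            raw_data = raw_data.decode(encoding).encode('utf-8')
        comp = bz2.BZ2Compressor()
        return comp.compress(raw_data) + comp.flush()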

parent 8acb0970
@@ -12,14 +12,13 @@ Public classes:
from __future__ import unicode_literals
import codecs
import io
import os # path.isfile(), link(), path.isdir(), path.mkdir(),
# walk(), path.getsize(), path.join(), stat(), remove()
import time
import bz2 # BZ2Compressor(), BZ2File()
import hashlib # md5(), update(), hexdigest()
import urllib2 # urlopen()
import StringIO # StringIO()
from Bio import SeqIO # read()
from Bio import Entrez # efetch(), read(), esearch(), esummary()
from Bio.Seq import UnknownSeq
@@ -28,6 +27,7 @@ from xml.dom import DOMException, minidom
from xml.parsers import expat
from httplib import HTTPException, IncompleteRead
from sqlalchemy.orm.exc import NoResultFound
import cchardet as chardet
from mutalyzer import util
from mutalyzer.config import settings
@@ -100,27 +100,33 @@ class Retriever(object) :
Write raw data to a compressed file.
@arg raw_data: The raw_data to be compressed and written
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the outfile
@type filename: unicode
@return: outfile ; The full path and name of the file written
@rtype: unicode
"""
# Todo: Should we write a utf-8 encoded genbank file? Not even sure
# what type `raw_data` is...
result = chardet.detect(raw_data)
if result['confidence'] > 0.5:
encoding = result['encoding']
else:
encoding = 'utf-8'
if not util.is_utf8_alias(encoding):
raw_data = raw_data.decode(encoding).encode('utf-8')
# Compress the data to save disk space.
comp = bz2.BZ2Compressor()
data = comp.compress(raw_data)
data += comp.flush()
out_handle = open(self._nametofile(filename), "w")
out_handle = open(self._nametofile(filename), "wb")
out_handle.write(data)
out_handle.close()
return out_handle.name # return the full path to the file
#_write
# Todo: check callers; argument should be a byte string
def _calcHash(self, content) :
"""
Calculate the md5sum of a piece of text.
@@ -241,7 +247,7 @@ class Retriever(object) :
'IncompleteRead: %s' % unicode(e))
return []
if response_text == '\n':
if response_text.strip() == b'':
# This is apparently what dbSNP returns for non-existing dbSNP id
self._output.addMessage(__file__, 4, 'EENTREZ',
'ID rs%s could not be found in dbSNP.' \
@@ -259,14 +265,14 @@ class Retriever(object) :
self._output.addMessage(__file__, -1, 'INFO',
'ExpatError: %s' % unicode(e))
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % response_text)
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
return []
except IndexError:
# The expected root element is not present.
self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
'error. Result XML was not as expected.')
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % response_text)
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
return []
snps = []
@@ -292,7 +298,6 @@ class GenBankRetriever(Retriever):
# Child specific init
#__init__
# todo: raw_data must always be a byte string
def write(self, raw_data, filename, extract) :
"""
Write raw data to a file. The data is parsed before writing, if a
@@ -305,7 +310,7 @@ class GenBankRetriever(Retriever):
database).
@arg raw_data: The data
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the file.
@type filename: unicode
@arg extract: Flag that indicates whether to extract the record ID and
@@ -320,26 +325,24 @@ class GenBankRetriever(Retriever):
@rtype: tuple (unicode, unicode)
"""
if raw_data == "\nNothing has been found\n" :
if raw_data.strip() == b'Nothing has been found':
self._output.addMessage(__file__, 4, "ENORECORD",
"The record could not be retrieved.")
return None
#if
fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a
fakehandle.write(raw_data) # file handle.
fakehandle = io.BytesIO() # Unfortunately, BioPython needs a
fakehandle.write(raw_data) # file handle.
fakehandle.seek(0)
try :
record = SeqIO.read(fakehandle, "genbank")
except (ValueError, AttributeError): # An error occurred while parsing.
self._output.addMessage(__file__, 4, "ENOPARSE",
"The file could not be parsed.")
fakehandle.close()
return None
#except
if type(record.seq) == UnknownSeq :
fakehandle.close()
self._output.addMessage(__file__, 4, "ENOSEQ",
"This record contains no sequence. Chromosomal or contig " \
"records should be uploaded with the GenBank uploader.")
@@ -349,12 +352,12 @@ class GenBankRetriever(Retriever):
outfile = filename
GI = None
if extract :
outfile = record.id
GI = record.annotations["gi"]
outfile = unicode(record.id)
GI = unicode(record.annotations["gi"])
if outfile != filename :
# Add the reference (incl version) to the reference output
# This differs if the original reference lacks a version
self._output.addOutput("reference", record.id)
self._output.addOutput("reference", unicode(record.id))
self._output.addOutput(
"BatchFlags", ("A1",(
filename,
@@ -362,9 +365,8 @@ class GenBankRetriever(Retriever):
filename+"." )))
self._output.addMessage(__file__, 2, "WNOVER",
"No version number is given, using %s. Please use this " \
"number to reduce downloading overhead." % record.id)
"number to reduce downloading overhead." % unicode(record.id))
#if
fakehandle.close()
self._write(raw_data, outfile)
@@ -390,7 +392,7 @@ class GenBankRetriever(Retriever):
'Could not retrieve %s.' % name)
return None
if raw_data == '\n' : # Check if the file is empty or not.
if raw_data.strip() == b'': # Check if the file is empty or not.
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
return None
@@ -398,10 +400,10 @@ class GenBankRetriever(Retriever):
# This is a hack to detect constructed references, the proper way to
# do this would be to check the data_file_division attribute of the
# parsed GenBank file (it would be 'CON').
if '\nCONTIG' in raw_data:
if b'\nCONTIG' in raw_data:
try:
# Get the length in base pairs
length = int(raw_data[:raw_data.index(' bp', 0, 500)].split()[-1])
length = int(raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1])
except (ValueError, IndexError):
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
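The comment above notes that the proper check for constructed references would use the data_file_division of the parsed record rather than scanning the raw bytes for a CONTIG line. A rough sketch of that alternative, assuming Biopython stores the division in record.annotations (the helper is illustrative, not part of this commit):

    from Bio import SeqIO

    def is_constructed(handle):
        # 'CON' is the GenBank division used for constructed (contig) records.
        record = SeqIO.read(handle, 'genbank')
        return record.annotations.get('data_file_division') == 'CON'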
@@ -583,24 +585,24 @@ class GenBankRetriever(Retriever):
'Could not get mapping information for gene %s.' % gene)
return None
if summary[0]["NomenclatureSymbol"].lower() == gene.lower() : # Found it.
if unicode(summary[0]["NomenclatureSymbol"]).lower() == gene.lower() : # Found it.
if not summary[0]["GenomicInfo"] :
self._output.addMessage(__file__, 4, "ENOMAPPING",
"No mapping information found for gene %s." % gene)
return None
#if
ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"]
ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"]
ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"]
ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"]
break;
ChrAccVer = unicode(summary[0]["GenomicInfo"][0]["ChrAccVer"])
ChrLoc = unicode(summary[0]["GenomicInfo"][0]["ChrLoc"])
ChrStart = unicode(summary[0]["GenomicInfo"][0]["ChrStart"])
ChrStop = unicode(summary[0]["GenomicInfo"][0]["ChrStop"])
break
#if
# Collect official symbols that have this gene as an alias, in case we
# cannot find anything.
if gene in summary[0]["OtherAliases"] and \
if gene in [unicode(a) for a in summary[0]["OtherAliases"]] and \
summary[0]["NomenclatureSymbol"] :
aliases.append(summary[0]["NomenclatureSymbol"]);
aliases.append(unicode(summary[0]["NomenclatureSymbol"]))
#for
if not ChrAccVer : # We did not find any genes.
@@ -643,6 +645,13 @@ class GenBankRetriever(Retriever):
@return: UD or None
@rtype: unicode
"""
if not (url.startswith('http://') or
url.startswith('https://') or
url.startswith('ftp://')):
self._output.addMessage(__file__, 4, "ERECPARSE",
"Only HTTP(S) or FTP locations are allowed.")
return None
handle = urllib2.urlopen(url)
info = handle.info()
if info["Content-Type"] == "text/plain" :
@@ -688,7 +697,7 @@ class GenBankRetriever(Retriever):
If the downloaded file is recognised by its hash, the old UD number
is used.
@arg raw_data: A GenBank record
@arg raw_data: A GenBank record.
@type raw_data: byte string
@return: Accession number for the uploaded file.
@@ -857,7 +866,6 @@ class LRGRetriever(Retriever):
# Now we have the file, so we can parse it.
file_handle = bz2.BZ2File(filename, "r")
file_handle = codecs.getreader('utf-8')(file_handle)
#create GenRecord.Record from LRG file
record = lrg.create_record(file_handle.read())
@@ -978,7 +986,7 @@ class LRGRetriever(Retriever):
if a parse error occurs, None is returned.
@arg raw_data: The data
@type raw_data: string
@type raw_data: byte string
@arg filename: The intended name of the file
@type filename: unicode
......
@@ -112,7 +112,7 @@ def create_record(data):
Create a GenRecord.Record from an LRG <xml> formatted string.
@arg data: Content of LRG file
@type data: string
@type data: byte string
@return: GenRecord.Record instance
@rtype: object
......
@@ -1058,6 +1058,12 @@ class MutalyzerService(ServiceBase):
output.addMessage(__file__, -1, 'INFO',
'Received request uploadGenBankLocalFile()')
# The Python type for `data` should be a sequence of `str` objects,
# but it seems we sometimes just get one `str` object. Perhaps only in
# the unit tests, but let's fix that anyway.
if isinstance(data, str):
data = [data]
# Note that the max file size check below might be bogus, since Spyne
# first checks the total request size, which by default has a maximum
# of 2 megabytes.
......
@@ -93,6 +93,14 @@ def reverse_complement(sequence):
return ''.join(reversed(sequence.translate(table)))
def is_utf8_alias(encoding):
"""
Returns `True` if the given encoding is recognized as UTF-8.
"""
aliases = ('utf_8', 'u8', 'utf', 'utf8')
return encoding.lower().replace('-', '_') in aliases
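A few illustrative calls (not part of the diff) showing what the new helper treats as UTF-8:

    # is_utf8_alias('UTF-8')    -> True
    # is_utf8_alias('utf8')     -> True
    # is_utf8_alias('u8')       -> True
    # is_utf8_alias('latin-1')  -> False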
def grouper(iterable, n=2, fillvalue=None):
"""
Make an iterator that takes {n} elements at a time from {iterable}, using
......