diff --git a/doc/index.rst b/doc/index.rst index 58dd32c99c17fd2314c205b9b9d8e006fff9dd0d..e0da8e0dfb44a021e28a055ccf5ba6b3eb6d8f4f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -46,6 +46,7 @@ Additional notes development issues new-organism + strings changelog copyright diff --git a/doc/strings.rst b/doc/strings.rst new file mode 100644 index 0000000000000000000000000000000000000000..51e26ed9f4f04b907292729d1aea6a58b3df5590 --- /dev/null +++ b/doc/strings.rst @@ -0,0 +1,164 @@ +String representations +====================== + +We live in a global economy with many different languages and alphabets. Using +byte strings for text and just assuming everything is ASCII encoded is +suboptimal and *will* lead to bugs. These bugs may even be security issues. + +That's why Mutalyzer uses unicode strings wherever possible and tries to be +aware of encodings when dealing with input and output. Here we describe how we +do it. + + +String representations in Python +-------------------------------- + +Since Mutalyzer only runs on Python 2.7, we can ignore all older Python versions +and Python 3. So, the two main string types in Python are: + +1. `str`, byte strings +2. `unicode`, unicode strings + +Byte strings are the default string type in Python 2.7 and are for example the +type you get when writing a string literal:: + + >>> type('mutalyzer') + <type 'str'> + +Unicode string literals can be written using the ``u`` prefix:: + + >>> type(u'mutalyzer') + <type 'unicode'> + +Many modules from the Python standard library and also third party libraries +consume and produce byte strings by default and may or may not work correctly +with unicode strings. + + +Unicode strategy +---------------- + +Internally, all strings should be represented by unicode strings as much as +possible. The main exceptions are large reference sequence strings. These can +often better be BioPython sequence objects, since that is how we usually get +them in the first place. 
+ +Our main strategy is as follows: + +1. We use ``from __future__ import unicode_literals`` at the top of every + file. +2. All incoming strings are decoded to unicode (if necessary) as soon as + possible. +3. Outgoing strings are encoded to UTF8 (if necessary) as late as possible. +4. BioPython sequence objects can be based on byte strings as well as unicode + strings. +5. In the database, everything is UTF8. +6. We must be aware of the encoding of files supplied by the user or + downloaded from external sources. + +Point 1 ensures that `all string literals in our source code will be unicode +strings <http://python-future.org/unicode_literals.html>`_:: + + >>> from __future__ import unicode_literals + >>> type('mutalyzer') + <type 'unicode'> + +As for point 4, sometimes this may even change under our eyes (e.g., calling +``.reverse_complement()`` will change it to a byte string). We don't care as +long as they're BioPython objects, only when we get the sequence out we must +have it as unicode string. Their contents are always in the ASCII range +anyway. + +Although `Bio.Seq.reverse_complement` works fine on Python byte strings (and +we used to rely on that), it crashes on a Python unicode string. So we take +care to only use it on BioPython sequence objects and wrote our own reverse +complement function for unicode strings +(`mutalyzer.util.reverse_complement`). + + +Files +----- + +The Python builtin `open +<https://docs.python.org/2/library/functions.html#open>`_ cannot decode file +contents and just yields byte strings. Therefore, we typically use `io.open +<https://docs.python.org/2/library/io.html#io.open>`_ instead, which accepts +an `encoding` argument. + +Downloaded reference files are stored UTF8 encoded (and then bzipped). We can +assume UTF8 encoding when reading them back from disk. + +We try to detect the encoding of user uploaded text files (batch jobs, GenBank +files) and assume UTF8 if detection fails. 
+ + +Libraries +--------- + +SQLAlchemy, our database toolkit, transparently sends both byte strings and +unicode strings UTF8 encoded to the database and presents all strings as +unicode strings to us. + +The web framework Mutalyzer uses, Flask, is also fully `unicode based +<http://flask.pocoo.org/docs/0.10/unicode/>`_. + +The Mutalyzer webservices are based on Spyne. The Spyne documentation `has the +following to say <http://spyne.io/docs/2.10/manual/03_types.html#strings>`_ +about its `String` and `Unicode` types: + + There are two string types in Spyne: `spyne.model.primitive.Unicode` and + `spyne.model.primitive.String` whose native types are `unicode` and `str` + respectively. + + Unlike the Python `str`, the Spyne `String` is not for arbitrary byte + streams. You should not use it unless you are absolutely, positively sure + that you need to deal with text data with an unknown encoding. In all + other cases, you should just use the `Unicode` type. They actually look + the same from outside, this distinction is made just to properly deal with + the quirks surrounding Python-2's `unicode` type. + + Remember that you have the `ByteArray` and `File` types at your disposal + when you need to deal with arbitrary byte streams. + + The `String` type will be just an alias for `Unicode` once Spyne gets + ported to Python 3. It might even be deprecated and removed in the future, + so make sure you are using either `Unicode` or `ByteArray` in your + interface definitions. + +So let's not ignore that and never use `String` in our webservice interface. + +The pyparsing library is used for parsing HGVS variant descriptions. Overall +it can deal with unicode input and also yields unicode output in that +case. However, there are some exceptions where we explicitly have to decode +to a unicode string (for example, omitted optional parts yield the empty byte +string). + + +Python 3 +-------- + +The situation in Python 3 is very different from Python 2.7. 
The two main +string types in Python 3 are: + +1. `str`, unicode strings +2. `bytes`, byte strings + +Unicode strings are the default string type in Python 3 and are for example +the type you get when writing a string literal:: + + >>> type('mutalyzer') + <class 'str'> + +Byte string literals can be written using the ``b`` prefix:: + + >>> type(b'mutalyzer') + <class 'bytes'> + +Many modules from the Python standard library and also third party libraries +consume and produce unicode strings by default and may or may not work +correctly with byte strings. + +What does this mean for Mutalyzer? Actually, our current approach takes us +quite a bit closer to how things are generally done in Python 3. However, +Mutalyzer is very much not Python 3 compatible, even the unicode handling +parts are only valid in Python 2.7 on some points. diff --git a/extras/log-tools/find-crashes.py b/extras/log-tools/find-crashes.py index 0e6d791ef19995d3708a982573b40861822ca71f..cf6ba98600a0a4d4afcfb87f2e4ae136e9254096 100755 --- a/extras/log-tools/find-crashes.py +++ b/extras/log-tools/find-crashes.py @@ -9,6 +9,8 @@ crashed. """ +from __future__ import unicode_literals + import os from mutalyzer import config diff --git a/extras/monitor/mutalyzer-monitor.py b/extras/monitor/mutalyzer-monitor.py index b5ea49fdbfac865afec348dd163759d70905bd98..43e49abc2f1e502e9a7805efabc3090d06134853 100755 --- a/extras/monitor/mutalyzer-monitor.py +++ b/extras/monitor/mutalyzer-monitor.py @@ -15,6 +15,8 @@ Currently implemented checks: """ +from __future__ import unicode_literals + import argparse import logging import sys diff --git a/extras/soap-tools/batchjob.py b/extras/soap-tools/batchjob.py index 7558b98d8e284d0c0de8e7267c406145153bd8b1..de11bc2ac7d64c64efb00158e158fb7e5a9e19a3 100755 --- a/extras/soap-tools/batchjob.py +++ b/extras/soap-tools/batchjob.py @@ -17,6 +17,8 @@ to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/checkSyntax.py b/extras/soap-tools/checkSyntax.py index 78c63e5c902e25d0944b744dfc04691ef6053f40..a2bf32d780966a40f25fbc8846fbb41b61195bdf 100755 --- a/extras/soap-tools/checkSyntax.py +++ b/extras/soap-tools/checkSyntax.py @@ -12,6 +12,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/chromAccession.py b/extras/soap-tools/chromAccession.py index 4fb6e04f1b3baa844bcf50c31f6dd3f826c7ce73..457277d8e278093df25831c7e4de88f2b7d7cde6 100755 --- a/extras/soap-tools/chromAccession.py +++ b/extras/soap-tools/chromAccession.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/descriptionExtract.py b/extras/soap-tools/descriptionExtract.py index 7ca3b2eceefa27fa53b1f41794a8d92dd36b1bba..3889ca414ee9f1054d6350e6f1b87f4b4e909fe4 100755 --- a/extras/soap-tools/descriptionExtract.py +++ b/extras/soap-tools/descriptionExtract.py @@ -14,6 +14,8 @@ service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getCache.py b/extras/soap-tools/getCache.py index 2f9c7df218b3c831964671a622f6f44d14f4d039..07a86818946b31a0de8e2555d3b62a85af05c8a2 100755 --- a/extras/soap-tools/getCache.py +++ b/extras/soap-tools/getCache.py @@ -12,6 +12,8 @@ printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneAndTranscript.py b/extras/soap-tools/getGeneAndTranscript.py index 8946d59e71c8fb280b4e4e240acdd019f3fe24bd..e4ba939b0a335a34e43b85ff1135c3ada19d8aca 100755 --- a/extras/soap-tools/getGeneAndTranscript.py +++ b/extras/soap-tools/getGeneAndTranscript.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneName.py b/extras/soap-tools/getGeneName.py index e3b7dd01445c37602131ffa73e51f680255ee376..ad4ce8c4afe8ad25780a778f76a0d28eaa4f0990 100755 --- a/extras/soap-tools/getGeneName.py +++ b/extras/soap-tools/getGeneName.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscripts.py b/extras/soap-tools/getTranscripts.py index 51052fca68208719de8002af8b44418120180eb3..82af32191ee18635a07ba55472be91f64a8d830d 100755 --- a/extras/soap-tools/getTranscripts.py +++ b/extras/soap-tools/getTranscripts.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsAndInfo.py b/extras/soap-tools/getTranscriptsAndInfo.py index 86dc3ff446887e970cd6c521b998629848904943..12b94d86003fb96f3af035a9446a9788615c1bd7 100755 --- a/extras/soap-tools/getTranscriptsAndInfo.py +++ b/extras/soap-tools/getTranscriptsAndInfo.py @@ -14,6 +14,8 @@ and printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsByGeneName.py b/extras/soap-tools/getTranscriptsByGeneName.py index d7789a0acbe91b85aef602f9771f25dfd13068a6..f31ff6ba6e667794fdfe3cbaf95f76dcf222038f 100755 --- a/extras/soap-tools/getTranscriptsByGeneName.py +++ b/extras/soap-tools/getTranscriptsByGeneName.py @@ -12,6 +12,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsMapping.py b/extras/soap-tools/getTranscriptsMapping.py index 79683369ed86b478aabc89c20c9195634a65a3f2..891dfa75a11100689d7b3f6d3948e8d0abd5ecf8 100755 --- a/extras/soap-tools/getTranscriptsMapping.py +++ b/extras/soap-tools/getTranscriptsMapping.py @@ -16,6 +16,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getdbSNPDescriptions.py b/extras/soap-tools/getdbSNPDescriptions.py index f5745533067a6e675077d5b9756bd9b7fcd75160..5be99c735012d7cc176e24396af40ccd350c7b43 100755 --- a/extras/soap-tools/getdbSNPDescriptions.py +++ b/extras/soap-tools/getdbSNPDescriptions.py @@ -12,6 +12,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/info.py b/extras/soap-tools/info.py index eb3cd058044621745a59d464bcfd70ca57602a19..1a4ea6e43335330798767d9aee73a880833848b6 100755 --- a/extras/soap-tools/info.py +++ b/extras/soap-tools/info.py @@ -10,6 +10,8 @@ printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/mappingInfo.py b/extras/soap-tools/mappingInfo.py index 49fb4ac404df042d044ce9b6525e2084a8a992f0..7a473b1c9a6cfd86401e75bad22b55ca5f123f2e 100755 --- a/extras/soap-tools/mappingInfo.py +++ b/extras/soap-tools/mappingInfo.py @@ -14,6 +14,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/numberConversion.py b/extras/soap-tools/numberConversion.py index 977bbc719ce83dd34b2047add81ff55cdd978fa8..bd5262f4bb19d75d4d852593ac0ebfd116d627e0 100755 --- a/extras/soap-tools/numberConversion.py +++ b/extras/soap-tools/numberConversion.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/runMutalyzer.py b/extras/soap-tools/runMutalyzer.py index 0a2d1e7593db0eed2963cdd80606a015f1ec7a11..475cc6c18c8f6aab61bdb5a952c5d448ec99af2b 100755 --- a/extras/soap-tools/runMutalyzer.py +++ b/extras/soap-tools/runMutalyzer.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sliceChromosomeByGene.py b/extras/soap-tools/sliceChromosomeByGene.py index 8e24c54d9b9a9a16cdebcb75fb836e7e6a9b66bc..c4e0e4183d002d53b7e620ddef8cad700691d7ca 100755 --- a/extras/soap-tools/sliceChromosomeByGene.py +++ b/extras/soap-tools/sliceChromosomeByGene.py @@ -13,6 +13,8 @@ printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sp.py b/extras/soap-tools/sp.py index d395d1993195a7664d6daa3ac05c7e7f2c3476f6..a2fd0be498607c268b9ab31d5c8c60efbee6ef5c 100755 --- a/extras/soap-tools/sp.py +++ b/extras/soap-tools/sp.py @@ -11,6 +11,8 @@ # This code is in the public domain; it can be used for whatever purpose # with absolutely no restrictions. +from __future__ import unicode_literals + import sys from SOAPpy import WSDL diff --git a/extras/soap-tools/transcriptInfo.py b/extras/soap-tools/transcriptInfo.py index d25d361a94461572ebd600ac165b3513d8dea92e..bd9c14e8c5dcb0c3b3bca03e513b60f725d89566 100755 --- a/extras/soap-tools/transcriptInfo.py +++ b/extras/soap-tools/transcriptInfo.py @@ -12,6 +12,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/migrations/script.py.mako b/migrations/script.py.mako index 95702017ea341e6455933b35f8ef5bf45f2df728..56af6fd8e141a90a81a3cf64d4f1af10eb291cf7 100644 --- a/migrations/script.py.mako +++ b/migrations/script.py.mako @@ -6,6 +6,8 @@ Create Date: ${create_date} """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = ${repr(up_revision)} down_revision = ${repr(down_revision)} diff --git a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py index ca664e5629e625ce136b92963c91a637fd790ed5..10ed1f8be249bd96d42fd7c398cbbc3c034d87fd 100644 --- a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py +++ b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py @@ -6,6 +6,8 @@ Create Date: 2014-10-08 15:10:21.522551 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. 
revision = '402ff01b0d5d' down_revision = 'ea660b66f26' diff --git a/migrations/versions/ea660b66f26_initial_schema.py b/migrations/versions/ea660b66f26_initial_schema.py index d0d474ed4a532d1661b126aa3a83abc1170bcdd8..eec6ce6af5ee8767be03e99bda445305002394b1 100644 --- a/migrations/versions/ea660b66f26_initial_schema.py +++ b/migrations/versions/ea660b66f26_initial_schema.py @@ -6,6 +6,8 @@ Create Date: 2014-02-04 18:38:28.416032 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = 'ea660b66f26' down_revision = None diff --git a/mutalyzer/Crossmap.py b/mutalyzer/Crossmap.py index 0fb166dc9e2e0c42aef3473ba14015bf9624a726..0de7ce3aba863de574bc9d79e2278b084f417b8c 100644 --- a/mutalyzer/Crossmap.py +++ b/mutalyzer/Crossmap.py @@ -10,6 +10,8 @@ and stop and the orientation of a transcript. #Public classes: # - Crossmap ; Convert from g. to c. or n. notation or vice versa. +from __future__ import unicode_literals + class Crossmap() : """ Convert from I{g.} to I{c.} or I{n.} notation or vice versa. @@ -406,13 +408,13 @@ class Crossmap() : @type a: integer @return: The converted notation (may be unaltered) - @rtype: string + @rtype: unicode """ if a > self.__STOP : - return '*' + str(a - self.__STOP) + return '*' + unicode(a - self.__STOP) - return str(a) + return unicode(a) #int2main def main2int(self, s) : @@ -423,7 +425,7 @@ class Crossmap() : - __STOP ; CDS stop in I{c.} notation. @arg s: A string in '*' notation - @type s: string + @type s: unicode @return: The converted notation (may be unaltered) @rtype: integer @@ -447,20 +449,20 @@ class Crossmap() : @type fuzzy: bool @return: The offset in HGVS notation - @rtype: string + @rtype: unicode """ if t[1] > 0 : # The exon boundary is downstream. if fuzzy: return '+?' if t[0] >= self.__trans_end : # It is downstream of the last exon. 
- return "+d" + str(t[1]) - return '+' + str(t[1]) + return "+d" + unicode(t[1]) + return '+' + unicode(t[1]) #if if t[1] < 0 : # The exon boundary is uptream. if fuzzy: return '-?' if t[0] <= self.__trans_start : # It is upstream of the first exon. - return "-u" + str(-t[1]) - return str(t[1]) + return "-u" + unicode(-t[1]) + return unicode(t[1]) #if return '' # No offset was given. #int2offset @@ -472,7 +474,7 @@ class Crossmap() : sensible. @arg s: An offset in HGVS notation - @type s: string + @type s: unicode @return: The offset as an integer @rtype: integer @@ -505,12 +507,12 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ if t[0] >= self.__trans_end or t[0] <= self.__trans_start: - return str(self.int2main(self.__minus(t[0], -t[1]))) - return str(self.int2main(t[0])) + str(self.int2offset(t, fuzzy)) + return unicode(self.int2main(self.__minus(t[0], -t[1]))) + return unicode(self.int2main(t[0])) + unicode(self.int2offset(t, fuzzy)) #tuple2string def g2c(self, a, fuzzy=False) : @@ -525,7 +527,7 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ return self.tuple2string(self.g2x(a), fuzzy) #g2c diff --git a/mutalyzer/File.py b/mutalyzer/File.py index b95f03012205f4ec02832d610833f39797d43a15..1212d310de7b085e65a0fa25f5a5458e6e74137a 100644 --- a/mutalyzer/File.py +++ b/mutalyzer/File.py @@ -16,24 +16,82 @@ Module for parsing CSV files and spreadsheets. # - File ; Parse CSV files and spreadsheets. 
+from __future__ import unicode_literals + +import codecs +import re import magic # open(), MAGIC_MIME, MAGIC_NONE import csv # Sniffer(), reader(), Error import xlrd # open_workbook() import zipfile # ZipFile() import xml.dom.minidom # parseString() -import os # remove() -import tempfile -import types # UnicodeType -from cStringIO import StringIO +import cchardet as chardet -from mutalyzer import util from mutalyzer.config import settings -# Amount of bytes to be read for determining the file type. +# Amount of bytes to be read from a file at a time (this is also the amount +# read for determining the file type). BUFFER_SIZE = 32768 +class _UniversalNewlinesByteStreamIter(object): + """ + The codecs module doesn't provide universal newline support. This class is + used as a stream wrapper that provides this functionality. + + The wrapped stream must yield byte strings. We decode it using the given + encoding, normalise newlines, and yield UTF-8 encoded data (read method) + or lines (as iterator). 
+ + Adaptation from an old Cython version: + https://github.com/cython/cython/blob/076fac3/Cython/Utils.py + """ + normalise_newlines = re.compile('\r\n?|\n').sub + + def __init__(self, stream, encoding='utf-8', buffer_size=0x1000): + # let's assume .read() doesn't change + self.stream = codecs.getreader(encoding)(stream) + self._read = self.stream.read + self.buffer_size = buffer_size + + def _read_normalised(self, count=None): + count = count or self.buffer_size + data = self._read(count) + if '\r' not in data: + return data + if data.endswith('\r'): + # may be missing a '\n' + data += self._read(1) + return self.normalise_newlines('\n', data) + + def _readlines(self): + buffer = [] + data = self._read_normalised() + while data: + buffer.append(data) + lines = ''.join(buffer).splitlines(True) + for line in lines[:-1]: + yield line + buffer = [lines[-1]] + data = self._read_normalised() + + if buffer[0]: + yield buffer[0] + + def seek(self, pos): + if pos == 0: + self.stream.seek(0) + else: + raise NotImplementedError + + def read(self, count=-1): + return self._read_normalised(count).encode('utf-8') + + def __iter__(self): + return (line.encode('utf-8') for line in self._readlines()) + + class File() : """ Parse CSV files and spreadsheets. @@ -45,7 +103,6 @@ class File() : - __init__(config, output) ; Initialise the class. Private methods: - - __tempFileWrapper(func, handle) ; Call func() with a filename. - __parseCsvFile(handle) ; Parse a CSV file. - __parseXlsFile(handle) ; Parse an Excel file. - __parseOdsFile(handle) ; Parse an OpenDocument Spreadsheet file. @@ -71,56 +128,48 @@ class File() : self.__output = output #: The Output object #__init__ - def __tempFileWrapper(self, func, handle) : + def __parseCsvFile(self, handle) : """ - Make a temporary file, put the content of a stream in it and pass - the filename to a general function. Return whatever this function - returns. + Parse a CSV file. Does not reset the file handle to start. 
- @arg func: general function that needs a file name as argument - @type func: function - @arg handle: A stream - @type handle: stream - - @return: unknown; the output of func(). - @rtype: ? - """ - write_handle, filename = tempfile.mkstemp(text=True) - - # Dump the content of the stream pointed to by handle into the file. - handle.seek(0) - os.write(write_handle, handle.read()) - os.close(write_handle) - - # Open the file with func(). - ret = func(filename) - # Apperantly apache will remove this file even when opened by the - # function *func - os.remove(filename) - - return ret - #__tempFileWrapper - - def __parseCsvFile(self, handle_) : - """ - Parse a CSV file. - The stream is not rewinded after use. - - @arg handle: A handle to a stream - @type handle: stream + @arg handle: CSV file. Must be a seekable binary file object. + @type handle: file object @return: list of lists @rtype: list """ - # We wrap the file in a temporary file just to have universal newlines - # which is not always possible to have on incoming files (thinks web - # and rpc frontends). This transparently solves the problem of Unix - # versus Windows versus Mac style newlines. - handle = tempfile.TemporaryFile('rU+w') - for chunk in handle_: - handle.write(chunk) - + buf = handle.read(BUFFER_SIZE) + result = chardet.detect(buf) handle.seek(0) + + if result['confidence'] > 0.5: + encoding = result['encoding'] + else: + encoding = 'utf-8' + + # Python 2.7 makes it extraordinarily hard to do this correctly. We + # have a binary file object containing lines of text in a certain + # encoding with unknown style of line-endings. + # + # We want to correctly decode the file contents, accept any style of + # line-endings, parse the lines with the `csv` module, and return + # unicode strings. + # + # 1. `codecs.getreader` does not have a universal newlines mode. + # 2. 
`io.TextIOWrapper` cannot be wrapped around our file object, + # since it is required to be an `io.BufferedIOBase`, which it + # usually will not be. + # 3. The `csv` module cannot read unicode. + # + # Ugh. + # + # So, we use a stream wrapper that consumes byte strings, decodes to + # unicode, normalises newlines, and produces the result UTF-8 encoded. + # That's what we feed the `csv` module. We decode what it gives back + # to unicode strings. What a mess. + handle = _UniversalNewlinesByteStreamIter(handle, encoding=encoding, + buffer_size=BUFFER_SIZE) + buf = handle.read(BUFFER_SIZE) # Default dialect @@ -148,41 +197,38 @@ class File() : ret = [] for i in reader: - ret.append(i) + ret.append([c.decode('utf-8') for c in i]) - handle.close() return ret #__parseCsvFile def __parseXlsFile(self, handle) : """ - Parse an Excel file. - The stream is not rewinded after use. + Parse an Excel file. Does not reset the file handle to start. - @arg handle: A handle to a stream - @type handle: stream + @arg handle: Excel file. Must be a binary file object. + @type handle: file object @return: A list of lists @rtype: list """ - workBook = self.__tempFileWrapper(xlrd.open_workbook, handle) + try: + workBook = xlrd.open_workbook(file_contents=handle.read()) + except xlrd.XLRDError: + return None + sheet = workBook.sheet_by_index(0) ret = [] for i in range(sheet.nrows) : row = [] for j in sheet.row_values(i) : - if type(j) == types.UnicodeType : # Convert the data to strings. - row.append(j.encode("utf8")) - else : - row.append(str(j)) + row.append(j) #for ret.append(row) #for - del sheet, workBook - return ret #__parseXlsFile @@ -197,8 +243,8 @@ class File() : @return: A list of lists @rtype: list """ + # Todo: Use a library for this. 
- #zipFile = self.__tempFileWrapper(zipfile.ZipFile, handle) zipFile = zipfile.ZipFile(handle) doc = xml.dom.minidom.parseString(zipFile.read("content.xml")) zipFile.close() @@ -209,10 +255,11 @@ class File() : for j in i.getElementsByTagName("table:table-cell") : c = j.getElementsByTagName("text:p") if c : - row.append(c[0].lastChild.data.encode("utf8")) + row.append(c[0].lastChild.data) #if #for - ret.append(row) + if row: + ret.append(row) #for return ret @@ -343,23 +390,23 @@ class File() : Get the mime type of a stream by inspecting a fixed number of bytes. The stream is rewinded after use. - @arg handle: A handle to a stream - @type handle: stream + @arg handle: Stream to be inspected. Must be a seekable binary file + object. + @type handle: file object - @return: The mime type of a file - @rtype: string + @return: The mime type of a file and a textual description. + @rtype: unicode, unicode """ handle.seek(0) buf = handle.read(BUFFER_SIZE) MagicInstance = magic.open(magic.MAGIC_MIME) MagicInstance.load() - mimeType = MagicInstance.buffer(buf).split(';')[0] + mimeType = MagicInstance.buffer(buf).decode('utf-8').split(';')[0] MagicInstance.close() MagicInstance = magic.open(magic.MAGIC_NONE) MagicInstance.load() - description = MagicInstance.buffer(buf) - del MagicInstance + description = MagicInstance.buffer(buf).decode('utf-8') handle.seek(0) return mimeType, description @@ -368,22 +415,28 @@ class File() : def parseFileRaw(self, handle) : """ Check which format a stream has and parse it with the appropriate - parser if the stream is recognised. + parser if the stream is recognised. Does not reset the file handle to + start. - @arg handle: A handle to a stream - @type handle: stream + @arg handle: Input file to be parsed. Must be a seekable binary file + object. 
+ @type handle: file object @return: A list of lists, None if an error occured @rtype: list """ mimeType = self.getMimeType(handle) - if mimeType[0] == "text/plain" : + if mimeType[0] == "text/plain": return self.__parseCsvFile(handle) - if mimeType[0] == "application/vnd.ms-office" : + if mimeType[0] in ('application/vnd.ms-excel', + 'application/vnd.ms-office', + 'application/msword', + 'application/zip'): return self.__parseXlsFile(handle) - if mimeType == ("application/octet-stream", - "OpenDocument Spreadsheet") : + if (mimeType[0] == 'application/vnd.oasis.opendocument.spreadsheet' or + mimeType[1] in ('OpenDocument Spreadsheet', + 'OpenOffice.org 1.x Calc spreadsheet')): return self.__parseOdsFile(handle) return None @@ -392,10 +445,12 @@ class File() : def parseBatchFile(self, handle) : """ Check which format a stream has and parse it with the appropriate - parser if the stream is recognised. + parser if the stream is recognised. Does not reset the file handle to + start. - @arg handle: A handle to a stream - @type handle: stream + @arg handle: Batch job input file. Must be a seekable binary file + object. + @type handle: file object @return: A sanitised list of lists (without a header or empty lines) (or None if an error occured) and the number of columns. @@ -419,9 +474,9 @@ def makeList(l, maxlen=10): @arg maxlen: maximum length of the string you want to return @type maxlen: integer @return: a list converted to a string with comma's and spaces - @rtype: string + @rtype: unicode """ - ret = ", ".join(str(i) for i in l[:maxlen]) + ret = ", ".join(i for i in l[:maxlen]) if len(l)>maxlen: return ret+", ..." else: diff --git a/mutalyzer/GenRecord.py b/mutalyzer/GenRecord.py index b30ed80060bb3135f28650bb9bfae4bbf1f30b61..5a729f737270d0ef52d8acc08d9af16de5668589 100644 --- a/mutalyzer/GenRecord.py +++ b/mutalyzer/GenRecord.py @@ -15,7 +15,7 @@ search for them each time. # - GenRecord ; Convert a GenBank record to a nested dictionary. 
-import Bio +from __future__ import unicode_literals from mutalyzer import util from mutalyzer import Crossmap @@ -85,7 +85,7 @@ class Locus(object) : - CM ; A Crossmap object. @arg name: identifier of the locus - @type name: string + @type name: unicode """ self.name = name @@ -131,7 +131,7 @@ class Locus(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description: # Don't change anything if we already have an unknown value. @@ -170,7 +170,7 @@ class Gene(object) : - __locusTag ; @arg name: gene name - @type name: string + @type name: unicode """ self.name = name @@ -199,14 +199,14 @@ class Gene(object) : Find a transcript, given its name. @arg name: transcript variant number - @type name: string + @type name: unicode @return: transcript @rtype: object """ for i in self.transcriptList : - if i.name == name or i.name == str("%03i" % int(name)): + if i.name == name or i.name == "%03i" % int(name): return i return None #findLocus @@ -230,7 +230,7 @@ class Gene(object) : Look in the list of transcripts for a given protein accession number. @arg protAcc: protein accession number - @type protAcc: string + @type protAcc: unicode @return: transcript @rtype: object @@ -300,7 +300,7 @@ class Record(object) : Returns a Gene object, given its name. @arg name: Gene name - @type name: string + @type name: unicode @return: Gene object @rtype: object @@ -332,7 +332,7 @@ class Record(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description : @@ -469,18 +469,18 @@ class GenRecord() : @arg gene: Gene @type gene: object @arg string: DNA sequence - @type string: string + @type string: unicode @kwarg string_reverse: DNA sequence to use (if not None) for the reverse complement. 
@return: reverse-complement (if applicable), otherwise return the original. - @rtype: string + @rtype: unicode """ if gene.orientation == -1: if string_reverse: string = string_reverse - return Bio.Seq.reverse_complement(string) + return util.reverse_complement(string) return string #__maybeInvert @@ -639,15 +639,15 @@ class GenRecord() : @arg stop_g: stop position @type stop_g: integer @arg varType: variant type - @type varType: string + @type varType: unicode @arg arg1: argument 1 of a raw variant - @type arg1: string + @type arg1: unicode @arg arg2: argument 2 of a raw variant - @type arg2: string + @type arg2: unicode @arg roll: ??? @type roll: tuple (integer, integer) @kwarg arg1_reverse: argument 1 to be used on reverse strand - @type arg1_reverse: string + @type arg1_reverse: unicode @kwarg start_fuzzy: Indicates if start position of variant is fuzzy. @type start_fuzzy: bool @kwarg stop_fuzzy: Indicates if stop position of variant is fuzzy. @@ -666,8 +666,8 @@ class GenRecord() : else: chromStart = self.record.toChromPos(stop_g) chromStop = self.record.toChromPos(start_g) - chromArg1 = Bio.Seq.reverse_complement(arg1) - chromArg2 = Bio.Seq.reverse_complement(arg2) + chromArg1 = util.reverse_complement(arg1) + chromArg2 = util.reverse_complement(arg2) # Todo: Should we use arg1_reverse here? 
if roll : diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 5fa91eeb1940dcb14fd5ce7087495281a91cdfe9..ac09bafa3a735b7af6cd445633baeb47dacaf8a2 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -10,13 +10,15 @@ Public classes: """ +from __future__ import unicode_literals + +import io import os # path.isfile(), link() path.isdir(), path.mkdir(), # walk(), path.getsize(), path.join(), stat(), remove() import time import bz2 # BZ2Compressor(), BZ2File() import hashlib # md5(), update(), hexdigest() import urllib2 # urlopen() -import StringIO # StringIO() from Bio import SeqIO # read() from Bio import Entrez # efetch(), read(), esearch(), esummary() from Bio.Seq import UnknownSeq @@ -25,6 +27,7 @@ from xml.dom import DOMException, minidom from xml.parsers import expat from httplib import HTTPException, IncompleteRead from sqlalchemy.orm.exc import NoResultFound +import cchardet as chardet from mutalyzer import util from mutalyzer.config import settings @@ -84,10 +87,10 @@ class Retriever(object) : Convert an accession number to a filename. @arg name: The accession number - @type name: string + @type name: unicode @return: A filename - @rtype: string + @rtype: unicode """ return os.path.join(settings.CACHE_DIR, name + "." + self.fileType + ".bz2") #_nametofile @@ -97,18 +100,27 @@ class Retriever(object) : Write raw data to a compressed file. @arg raw_data: The raw_data to be compressed and written - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the outfile - @type filename: string + @type filename: unicode @return: outfile ; The full path and name of the file written - @rtype: string + @rtype: unicode """ + result = chardet.detect(raw_data) + if result['confidence'] > 0.5: + encoding = result['encoding'] + else: + encoding = 'utf-8' + + if not util.is_utf8_alias(encoding): + raw_data = raw_data.decode(encoding).encode('utf-8') + # Compress the data to save disk space. 
comp = bz2.BZ2Compressor() data = comp.compress(raw_data) data += comp.flush() - out_handle = open(self._nametofile(filename), "w") + out_handle = open(self._nametofile(filename), "wb") out_handle.write(data) out_handle.close() @@ -120,10 +132,10 @@ class Retriever(object) : Calculate the md5sum of a piece of text. @arg content: Arbitrary text - @type content: string + @type content: byte string @return: The md5sum of 'content' - @rtype: string + @rtype: unicode """ hashfunc = hashlib.md5() @@ -131,7 +143,7 @@ class Retriever(object) : md5sum = hashfunc.hexdigest() del hashfunc - return md5sum + return unicode(md5sum) #_calcHash def _newUD(self) : @@ -139,11 +151,11 @@ class Retriever(object) : Make a new UD number based on the current time (seconds since 1970). @return: A new UD number - @rtype: string + @rtype: unicode """ UD = util.generate_id() - return "UD_" + str(UD) + return "UD_" + unicode(UD) #_newUD def _updateDBmd5(self, raw_data, name, GI): @@ -159,7 +171,7 @@ class Retriever(object) : @type GI: @return: filename - @rtype: string + @rtype: unicode """ try: reference = Reference.query.filter_by(accession=name).one() @@ -191,10 +203,10 @@ class Retriever(object) : it. @arg rsId: The rsId of the SNP (example: 'rs9919552'). - @type rsId: string + @type rsId: unicode @return: A list of HGVS notations. - @rtype: list(string) + @rtype: list(unicode) """ # A simple input check. 
id = rs_id[2:] @@ -223,7 +235,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error connecting to dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IOError: %s' % str(e)) + 'IOError: %s' % unicode(e)) return [] try: @@ -232,10 +244,10 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error reading from dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IncompleteRead: %s' % str(e)) + 'IncompleteRead: %s' % unicode(e)) return [] - if response_text == '\n': + if response_text.strip() == b'\n': # This is apparently what dbSNP returns for non-existing dbSNP id self._output.addMessage(__file__, 4, 'EENTREZ', 'ID rs%s could not be found in dbSNP.' \ @@ -251,21 +263,21 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ 'error. Error parsing result XML.') self._output.addMessage(__file__, -1, 'INFO', - 'ExpatError: %s' % str(e)) + 'ExpatError: %s' % unicode(e)) self._output.addMessage(__file__, -1, 'INFO', - 'Result from dbSNP: %s' % response_text) + 'Result from dbSNP: %s' % unicode(response_text, 'utf-8')) return [] except IndexError: # The expected root element is not present. self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ 'error. Result XML was not as expected.') self._output.addMessage(__file__, -1, 'INFO', - 'Result from dbSNP: %s' % response_text) + 'Result from dbSNP: %s' % unicode(response_text, 'utf-8')) return [] snps = [] for i in rs.getElementsByTagName('hgvs'): - snps.append(i.lastChild.data.encode('utf8')) + snps.append(i.lastChild.data) return snps #snpConvert @@ -298,9 +310,9 @@ class GenBankRetriever(Retriever): database). @arg raw_data: The data - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the file. 
- @type filename: string + @type filename: unicode @arg extract: Flag that indicates whether to extract the record ID and GI number: - 0 ; Do not extract, use 'filename' @@ -310,29 +322,27 @@ class GenBankRetriever(Retriever): @return: tuple ; Depending on the value of 'extract': - 0 ; ('filename', None) - 1 ; (id, GI) - @rtype: tuple (string, string) + @rtype: tuple (unicode, unicode) """ - if raw_data == "\nNothing has been found\n" : + if raw_data.strip() == b'Nothing has been found': self._output.addMessage(__file__, 4, "ENORECORD", "The record could not be retrieved.") return None #if - fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a - fakehandle.write(raw_data) # file handle. + fakehandle = io.BytesIO() # Unfortunately, BioPython needs a + fakehandle.write(raw_data) # file handle. fakehandle.seek(0) try : record = SeqIO.read(fakehandle, "genbank") except (ValueError, AttributeError): # An error occured while parsing. self._output.addMessage(__file__, 4, "ENOPARSE", "The file could not be parsed.") - fakehandle.close() return None #except if type(record.seq) == UnknownSeq : - fakehandle.close() self._output.addMessage(__file__, 4, "ENOSEQ", "This record contains no sequence. Chromosomal or contig " \ "records should be uploaded with the GenBank uploader.") @@ -342,12 +352,12 @@ class GenBankRetriever(Retriever): outfile = filename GI = None if extract : - outfile = record.id - GI = record.annotations["gi"] + outfile = unicode(record.id) + GI = unicode(record.annotations["gi"]) if outfile != filename : # Add the reference (incl version) to the reference output # This differs if the original reference lacks a version - self._output.addOutput("reference", record.id) + self._output.addOutput("reference", unicode(record.id)) self._output.addOutput( "BatchFlags", ("A1",( filename, @@ -355,9 +365,8 @@ class GenBankRetriever(Retriever): filename+"." ))) self._output.addMessage(__file__, 2, "WNOVER", "No version number is given, using %s. 
Please use this " \ - "number to reduce downloading overhead." % record.id) + "number to reduce downloading overhead." % unicode(record.id)) #if - fakehandle.close() self._write(raw_data, outfile) @@ -378,12 +387,12 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None - if raw_data == '\n' : # Check if the file is empty or not. + if raw_data.strip() == b'': # Check if the file is empty or not. self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -391,10 +400,10 @@ class GenBankRetriever(Retriever): # This is a hack to detect constructed references, the proper way to # do this would be to check the data_file_division attribute of the # parsed GenBank file (it would be 'CON'). - if '\nCONTIG' in raw_data: + if b'\nCONTIG' in raw_data: try: # Get the length in base pairs - length = int(raw_data[:raw_data.index(' bp', 0, 500)].split()[-1]) + length = int(raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1]) except ValueError, IndexError: self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) @@ -409,7 +418,7 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -438,7 +447,7 @@ class GenBankRetriever(Retriever): as filename. 
@arg accno: The accession number of the chromosome - @type accno: string + @type accno: unicode @arg start: Start position of the slice @type start: integer @arg stop: End position of the slice. @@ -450,7 +459,7 @@ class GenBankRetriever(Retriever): @type orientation: integer @return: An UD number - @rtype: string + @rtype: unicode """ # Not a valid slice. @@ -483,7 +492,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve slice.') return None @@ -512,7 +521,7 @@ class GenBankRetriever(Retriever): #else if self.write(raw_data, reference.accession, 0): - return str(reference.accession) + return reference.accession #retrieveslice def retrievegene(self, gene, organism, upstream, downstream) : @@ -521,9 +530,9 @@ class GenBankRetriever(Retriever): slice if the gene can be found. @arg gene: Name of the gene - @type gene: string + @type gene: unicode @arg organism: The organism in which we search. - @type organism: string + @type organism: unicode @arg upstream: Number of upstream nucleotides for the slice. @type upstream: integer @arg downstream: Number of downstream nucleotides for the slice. @@ -549,7 +558,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esearch: %s' % str(e)) + 'Error connecting to Entrez esearch: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not search for gene %s.' 
% gene) return None @@ -571,29 +580,29 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esummary: %s' % str(e)) + 'Error connecting to Entrez esummary: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not get mapping information for gene %s.' % gene) return None - if summary[0]["NomenclatureSymbol"].lower() == gene.lower() : # Found it. + if unicode(summary[0]["NomenclatureSymbol"]).lower() == gene.lower() : # Found it. if not summary[0]["GenomicInfo"] : self._output.addMessage(__file__, 4, "ENOMAPPING", "No mapping information found for gene %s." % gene) return None #if - ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"] - ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"] - ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"] - ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"] - break; + ChrAccVer = unicode(summary[0]["GenomicInfo"][0]["ChrAccVer"]) + ChrLoc = unicode(summary[0]["GenomicInfo"][0]["ChrLoc"]) + ChrStart = unicode(summary[0]["GenomicInfo"][0]["ChrStart"]) + ChrStop = unicode(summary[0]["GenomicInfo"][0]["ChrStop"]) + break #if # Collect official symbols that has this gene as alias in case we # can not find anything. - if gene in summary[0]["OtherAliases"] and \ + if gene in [unicode(a) for a in summary[0]["OtherAliases"]] and \ summary[0]["NomenclatureSymbol"] : - aliases.append(summary[0]["NomenclatureSymbol"]); + aliases.append(unicode(summary[0]["NomenclatureSymbol"])) #for if not ChrAccVer : # We did not find any genes. @@ -631,11 +640,18 @@ class GenBankRetriever(Retriever): is used. 
@arg url: Location of a GenBank record - @type url: string + @type url: unicode @return: UD or None - @rtype: string + @rtype: unicode """ + if not (url.startswith('http://') or + url.startswith('https://') or + url.startswith('ftp://')): + self._output.addMessage(__file__, 4, "ERECPARSE", + "Only HTTP(S) or FTP locations are allowed.") + return None + handle = urllib2.urlopen(url) info = handle.info() if info["Content-Type"] == "text/plain" : @@ -651,14 +667,14 @@ class GenBankRetriever(Retriever): except NoResultFound: UD = self._newUD() if not os.path.isfile(self._nametofile(UD)): - UD = self.write(raw_data, UD, 0) and str(UD) + UD = self.write(raw_data, UD, 0) and UD if UD: #Parsing went OK, add to DB reference = Reference(UD, md5sum, download_url=url) session.add(reference) session.commit() else: if not os.path.isfile(self._nametofile(reference.accession)): - UD = self.write(raw_data, reference.accession, 0) and str(reference.accession) + UD = self.write(raw_data, reference.accession, 0) and reference.accession return UD #Returns the UD or None #if @@ -681,11 +697,11 @@ class GenBankRetriever(Retriever): If the downloaded file is recognised by its hash, the old UD number is used. - @arg raw_data: A GenBank record - @type raw_data: string + @arg raw_data: A GenBank record. + @type raw_data: byte string - @return: - @rtype: string????? + @return: Accession number for the uploaded file. + @rtype: unicode """ md5sum = self._calcHash(raw_data) @@ -702,7 +718,7 @@ class GenBankRetriever(Retriever): if os.path.isfile(self._nametofile(reference.accession)): return reference.accession else: - return self.write(raw_data, reference.accession, 0) and str(reference.accession) + return self.write(raw_data, reference.accession, 0) and reference.accession #uploadrecord def loadrecord(self, identifier): @@ -718,7 +734,7 @@ class GenBankRetriever(Retriever): 3. Fetched from the NCBI. :arg identifier: A RefSeq accession number or geninfo identifier (GI). 
- :type identifier: string + :type identifier: unicode :return: A parsed RefSeq record or `None` if no record could be found for the given identifier. @@ -830,7 +846,7 @@ class LRGRetriever(Retriever): Load and parse a LRG file based on the identifier @arg identifier: The name of the LRG file to read - @type identifier: string + @type identifier: unicode @return: record ; GenRecord.Record of LRG file None ; in case of failure @@ -870,10 +886,10 @@ class LRGRetriever(Retriever): from the pending section. @arg name: The name of the LRG file to fetch - @type name: string + @type name: unicode @return: the full path to the file; None in case of an error - @rtype: string + @rtype: unicode """ prefix = settings.LRG_PREFIX_URL @@ -901,12 +917,12 @@ class LRGRetriever(Retriever): Download an LRG record from an URL. @arg url: Location of the LRG record - @type url: string + @type url: unicode @return: - filename ; The full path to the file - None ; in case of failure - @rtype: string + @rtype: unicode """ lrgID = name or os.path.splitext(os.path.split(url)[1])[0] @@ -914,6 +930,8 @@ class LRGRetriever(Retriever): # return None filename = self._nametofile(lrgID) + # Todo: Properly read the file contents to a unicode string and write + # it utf-8 encoded. handle = urllib2.urlopen(url) info = handle.info() if info["Content-Type"] == "application/xml" and info.has_key("Content-length"): @@ -968,14 +986,14 @@ class LRGRetriever(Retriever): if a parse error occurs None is returned. @arg raw_data: The data - @type raw_data: string + @type raw_data: byte string @arg filename: The intended name of the file - @type filename: string + @type filename: unicode @return: - filename ; The full path and name of the file written - None ; In case of an error - @rtype: string + @rtype: unicode """ # Dirty way to test if a file is valid, # Parse the file to see if it's a real LRG file. 
diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index e6f102d3e2b23dcd8b60bc7c17a6c7f03a196ad0..400c81c54c3d0530d139ce7ae7a96f7ae4c7542d 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -15,13 +15,15 @@ Module used to add and manage the Batch Jobs. # - Batch Syntax Checker # - Batch Position Converter +from __future__ import unicode_literals + +import io import os # os.path.exists import smtplib # smtplib.STMP from email.mime.text import MIMEText # MIMEText from sqlalchemy import func from sqlalchemy.orm.exc import NoResultFound -import mutalyzer from mutalyzer.config import settings from mutalyzer.db import queries, session from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem @@ -88,9 +90,9 @@ class Scheduler() : @todo: Handle Connection errors in a try, except clause @arg mailTo: The batch job submitter - @type mailTo: string + @type mailTo: unicode @arg url: The url containing the results - @type url: string + @type url: unicode """ if settings.TESTING: return @@ -410,7 +412,7 @@ Mutalyzer batch scheduler""" % url) O.addMessage(__file__, 4, "EBATCHU", "Unexpected error occurred, dev-team notified") import traceback - O.addMessage(__file__, 4, "DEBUG", repr(traceback.format_exc())) + O.addMessage(__file__, 4, "DEBUG", unicode(repr(traceback.format_exc()))) #except finally : #check if we need to update the database @@ -448,11 +450,11 @@ Mutalyzer batch scheduler""" % url) 'Affected Proteins', 'Restriction Sites Created', 'Restriction Sites Deleted'] - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') handle.write("%s\n" % "\t".join(header)) #if else : - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') if flags and 'C' in flags: separator = '\t' @@ -507,11 +509,11 @@ Mutalyzer batch scheduler""" % url) # header above it. The header is read from the config file as # a list. We need a tab delimited string. 
header = ['Input', 'Status'] - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') handle.write("%s\n" % "\t".join(header)) #if else : - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') if flags and 'C' in flags: separator = '\t' @@ -535,11 +537,11 @@ Mutalyzer batch scheduler""" % url) - Output written to outputfile. @arg cmd: The Syntax Checker input - @type cmd: string + @type cmd: unicode @arg i: The JobID @type i: integer @arg build: The build to use for the converter - @type build: string + @type build: unicode @arg flags: Flags of the current entry @type flags: """ @@ -562,7 +564,7 @@ Mutalyzer batch scheduler""" % url) assembly = Assembly.by_name_or_alias(batch_job.argument) except NoResultFound: O.addMessage(__file__, 3, 'ENOASSEMBLY', - 'Not a valid assembly: ' + str(batch_job.argument)) + 'Not a valid assembly: ' + batch_job.argument) raise converter = Converter(assembly, O) @@ -619,11 +621,11 @@ Mutalyzer batch scheduler""" % url) 'Errors', 'Chromosomal Variant', 'Coding Variant(s)'] - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') handle.write("%s\n" % "\t".join(header)) #if else : - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') if flags and 'C' in flags: separator = '\t' @@ -681,11 +683,11 @@ Mutalyzer batch scheduler""" % url) header = ['Input Variant', 'HGVS description(s)', 'Errors and warnings'] - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') handle.write("%s\n" % "\t".join(header)) #if else : - handle = open(filename, 'a') + handle = io.open(filename, mode='a', encoding='utf-8') if flags and 'C' in flags: separator = '\t' @@ -704,7 +706,7 @@ Mutalyzer batch scheduler""" % url) Add a job to the Database and start the BatchChecker. 
@arg email: e-mail address of batch supplier - @type email: string + @type email: unicode @arg queue: A list of jobs @type queue: list @arg columns: The number of columns. diff --git a/mutalyzer/__init__.py b/mutalyzer/__init__.py index e3c80aa36a8691de7128fb8a7c482cf58699bb48..6968d5ff84fa0b23b07b8e49adbd449fd6cc61e5 100644 --- a/mutalyzer/__init__.py +++ b/mutalyzer/__init__.py @@ -3,6 +3,9 @@ HGVS variant nomenclature checker. """ +from __future__ import unicode_literals + + # We follow a versioning scheme compatible with setuptools [1] where the # package version is always that of the upcoming release (and not that of the # previous release), post-fixed with ``.dev``. Only in a release commit, the diff --git a/mutalyzer/announce.py b/mutalyzer/announce.py index d8acbe4de84757bde62b0d326b5a4c0a3fc7ee4d..9adbf79109eeb06e8894b74cfdbad7d929261502 100644 --- a/mutalyzer/announce.py +++ b/mutalyzer/announce.py @@ -7,6 +7,8 @@ fast, it can be done on every website pageview without problems. """ +from __future__ import unicode_literals + from mutalyzer.redisclient import client diff --git a/mutalyzer/config/__init__.py b/mutalyzer/config/__init__.py index def4630bc53ad26234896ab56165afa73bdc3c88..462a490e1bf21d18d7bca310e732ec95bd1f2e62 100644 --- a/mutalyzer/config/__init__.py +++ b/mutalyzer/config/__init__.py @@ -12,6 +12,8 @@ be used. """ +from __future__ import unicode_literals + import collections import os diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index 43009e09e1e0142fed8f99bc67a00076c0ab9327..00dc9b2e8070f55b18bd47d9a384bcc7e9fa98cf 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -4,6 +4,9 @@ pointed-to by the `MUTALYZER_SETTINGS` environment variable. """ +from __future__ import unicode_literals + + # Use Mutalyzer in debug mode. 
DEBUG = False diff --git a/mutalyzer/db/__init__.py b/mutalyzer/db/__init__.py index b2192186773b542c68d1dae3884124f012ccfff4..71e8eaf5cd4eeea706873fcb4b179168e34187d7 100644 --- a/mutalyzer/db/__init__.py +++ b/mutalyzer/db/__init__.py @@ -4,6 +4,8 @@ using SQLAlchemy. """ +from __future__ import unicode_literals + import sqlalchemy from sqlalchemy.engine.url import make_url from sqlalchemy.ext.declarative import declarative_base diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 4119fa99e178b7dfbdaaa91e3cbb1352836dd3c0..faa0754c519549f71d78fbadcceb4c10586d43bf 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -3,6 +3,8 @@ Models backed by SQL using SQLAlchemy. """ +from __future__ import unicode_literals + from datetime import datetime import sqlite3 import uuid @@ -50,7 +52,7 @@ class Positions(TypeDecorator): def process_bind_param(self, value, dialect): if value is not None: - value = ','.join(str(i) for i in value) + value = ','.join(unicode(i) for i in value) return value def process_result_value(self, value, dialect): @@ -98,7 +100,7 @@ class BatchJob(db.Base): self.email = email self.download_url = download_url self.argument = argument - self.result_id = str(uuid.uuid4()) + self.result_id = unicode(uuid.uuid4()) self.added = datetime.now() def __repr__(self): diff --git a/mutalyzer/db/queries.py b/mutalyzer/db/queries.py index afdd2a44152e105976edc94db793c4ce12b764d1..7c54d137fa19e5ff0b8459a3df305ec4241c9d2e 100644 --- a/mutalyzer/db/queries.py +++ b/mutalyzer/db/queries.py @@ -7,6 +7,8 @@ Queries on database models. # the models they work with. +from __future__ import unicode_literals + from datetime import datetime, timedelta from sqlalchemy import and_, or_ diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py index 37fb60c238990b3cd12bcdbd9098c48ecfb6724d..d81254c39aeed1febbb7b5545ae48b82e3dfc7cb 100644 --- a/mutalyzer/describe.py +++ b/mutalyzer/describe.py @@ -7,13 +7,14 @@ leading from one sequence to an other. 
@requires: Bio.Seq """ +from __future__ import unicode_literals + import collections -from Bio import Seq from Bio.SeqUtils import seq3 from Bio.Data import CodonTable from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll +from mutalyzer.util import palinsnoop, roll, reverse_complement from mutalyzer import models @@ -34,9 +35,9 @@ class LCS(object): Initialise the class. @arg s1: A string. - @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @arg lcp: The length of the longest common prefix of {s1} and {s2}. @type lcp: int @arg s1_end: End of the substring in {s1}. @@ -55,21 +56,21 @@ class LCS(object): self.__s2_rc = None self.__matrix_rc = None if DNA: - self.__s2_rc = Seq.reverse_complement(s2[self.__lcp:s2_end]) + self.__s2_rc = reverse_complement(s2[self.__lcp:s2_end]) self.__matrix_rc = self.LCSMatrix(self.__s1, self.__s2_rc) #if #__init__ - def __str__(self): + def __unicode__(self): """ Return a graphical representation of the LCS matrix, mainly for debugging. @returns: A graphical representation of the LCS matrix. - @rtype: str + @rtype: unicode """ return self.visMatrix((0, len(self.__s1)), (0, len(self.__s2))) - #__str__ + #__unicode__ def visMatrix(self, r1, r2, rc=False): """ @@ -77,7 +78,7 @@ class LCS(object): debugging. @returns: A graphical representation of the LCS matrix. - @rtype: str + @rtype: unicode """ nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp @@ -91,7 +92,7 @@ class LCS(object): out = self.__delim.join(self.__delim + '-' + s2[nr2[0]:nr2[1]]) + '\n' for i in range(nr1[0], nr1[1] + 1): out += (('-' + self.__s1)[i] + self.__delim + - self.__delim.join(map(lambda x: str(M[i][x]), + self.__delim.join(map(lambda x: unicode(M[i][x]), range(nr2[0], nr2[1] + 1))) + '\n') return out @@ -102,9 +103,9 @@ class LCS(object): Calculate the Longest Common Substring matrix. @arg s1: A string. 
- @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @returns: A matrix with the LCS of {s1}[i], {s2}[j] at position i, j. @rval: list[list[int]] @@ -201,9 +202,9 @@ def __makeOverlaps(peptide): Make a list of overlapping 2-mers of {peptide} in order of appearance. @arg peptide: A peptide sequence. - @type peptide: str + @type peptide: unicode @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) + @rtype: list(unicode) """ return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) #__makeOverlaps @@ -213,13 +214,13 @@ def __options(pList, peptidePrefix, FS, output): Enumerate all peptides that could result from a frame shift. @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) + @type pList: list(unicode) @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str + @type peptidePrefix: unicode @arg FS: Frame shift table. @type FS: dict @arg output: List of peptides, should be empty initially. - @type output: list(str) + @type output: list(unicode) """ if not pList: output.append(peptidePrefix) @@ -234,7 +235,7 @@ def enumFS(peptide, FS): Enumerate all peptides that could result from a frame shift. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg FS: Frame shift table. @type FS: dict """ @@ -250,9 +251,9 @@ def fitFS(peptide, altPeptide, FS): {peptide}. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg altPeptide: Observed peptide sequence. - @type altPeptide: str + @type altPeptide: unicode @arg FS: Frame shift table. @type FS: dict """ @@ -302,11 +303,11 @@ class DescribeRawVar(models.RawVar): @arg end_offset: @type end_offset: int @arg type: Variant type. - @type type: str + @type type: unicode @arg deleted: Deleted part of the reference sequence. - @type deleted: str + @type deleted: unicode @arg inserted: Inserted part. 
- @type inserted: str + @type inserted: unicode @arg shift: Amount of freedom. @type shift: int """ @@ -336,7 +337,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if not self.start: return "=" @@ -365,7 +366,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if self.type == "unknown": return "?" @@ -491,7 +492,7 @@ def alleleDescription(allele): @type allele: list(DescribeRawVar) @returns: The HGVS description of {allele}. - @rval: str + @rval: unicode """ if len(allele) > 1: return "[%s]" % ';'.join(map(lambda x : x.hgvs, allele)) @@ -530,9 +531,9 @@ def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -682,9 +683,9 @@ def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -810,15 +811,15 @@ def describe(original, mutated, DNA=True): Convenience function for DNA_description(). @arg original: - @type original: str + @type original: unicode @arg mutated: - @type mutated: str + @type mutated: unicode @returns: A list of DescribeRawVar objects, representing the allele. 
@rval: list(DescribeRawVar) """ - s1 = str(original) - s2 = str(mutated) + s1 = original + s2 = mutated lcp = len(longest_common_prefix(s1, s2)) lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) s1_end = len(s1) - lcs diff --git a/mutalyzer/describe_c.py b/mutalyzer/describe_c.py deleted file mode 100755 index 1da86f77293e015ba2a0f53a5a3f61a3fcaeca4d..0000000000000000000000000000000000000000 --- a/mutalyzer/describe_c.py +++ /dev/null @@ -1,587 +0,0 @@ -#!/usr/bin/python - -""" -Prototype of a module that can generate a HGVS description of the variant(s) -leading from one sequence to an other. - -@requires: Bio.Seq -""" -import collections -from Bio import Seq -from Bio.SeqUtils import seq3 -from Bio.Data import CodonTable - -from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll -from mutalyzer import models - -from extractor import extractor - -def makeFSTables(table_id): - """ - For every pair of amino acids, calculate the set of possible amino acids in - a different reading frame. Do this for both alternative reading frames (+1 - and +2). - - @arg table_id: Coding table ID. - @type table_id: int - @returns: Two dictionaries for the two alternative reading frames. - @rtype: tuple(dict, dict) - """ - # Make the forward translation table. - table = dict(CodonTable.unambiguous_dna_by_id[table_id].forward_table) - for i in CodonTable.unambiguous_dna_by_id[table_id].stop_codons: - table[i] = '*' - - # Make the reverse translation table. - reverse_table = collections.defaultdict(list) - for i in table: - reverse_table[table[i]].append(i) - - # Make the frame shift tables. - FS1 = collections.defaultdict(set) - FS2 = collections.defaultdict(set) - for AA_i in reverse_table: - for AA_j in reverse_table: - for codon_i in reverse_table[AA_i]: - for codon_j in reverse_table[AA_j]: - FS1[AA_i + AA_j].add(table[(codon_i + codon_j)[1:4]]) # +1. - FS2[AA_i + AA_j].add(table[(codon_i + codon_j)[2:5]]) # +2. 
- #for - return FS1, FS2 -#makeFSTables - -def __makeOverlaps(peptide): - """ - Make a list of overlapping 2-mers of {peptide} in order of appearance. - - @arg peptide: A peptide sequence. - @type peptide: str - @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) - """ - return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) -#__makeOverlaps - -def __options(pList, peptidePrefix, FS, output): - """ - Enumerate all peptides that could result from a frame shift. - - @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) - @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str - @arg FS: Frame shift table. - @type FS: dict - @arg output: List of peptides, should be empty initially. - @type output: list(str) - """ - if not pList: - output.append(peptidePrefix) - return - #if - for i in FS[pList[0]]: - __options(pList[1:], peptidePrefix + i, FS, output) -#__options - -def enumFS(peptide, FS): - """ - Enumerate all peptides that could result from a frame shift. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - output = [] - - __options(__makeOverlaps(peptide), "", FS, output) - return output -#enumFS - -def fitFS(peptide, altPeptide, FS): - """ - Check whether peptide {altPeptide} is a possible frame shift of peptide - {peptide}. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg altPeptide: Observed peptide sequence. - @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - # Todo: This is a temporary fix to prevent crashing on frameshift - # detection (I think bug #124). 
- return False - - if len(peptide) < len(altPeptide): - return False - - pList = __makeOverlaps(peptide) - - for i in range(len(altPeptide)): - if not altPeptide[i] in FS[pList[i]]: - return False - return True -#fitFS - -def findFS(peptide, altPeptide, FS): - """ - Find the longest part of {altPeptide} that fits in {peptide} in a certain - frame given by {FS}. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg altPeptide: Observed peptide sequence. - @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - - @returns: The length and the offset in {peptide} of the largest frameshift. - @rtype: tuple(int, int) - """ - pList = __makeOverlaps(peptide) - maxFS = 0 - fsStart = 0 - - for i in range(len(pList))[::-1]: - for j in range(min(i + 1, len(altPeptide))): - if not altPeptide[::-1][j] in FS[pList[i - j]]: - break - if j >= maxFS: - maxFS = j - fsStart = i - j + 2 - #if - #for - - return maxFS - 1, fsStart -#findFS - -class RawVar(models.RawVar): - """ - Container for a raw variant. - - To use this class correctly, do not supply more than the minimum amount of - data. The {description()} function may not work properly if too much - information is given. - - Example: if {end} is initialised for a substitution, a range will be - retuned, resulting in a description like: 100_100A>T - """ - - def __init__(self, DNA=True, start=0, start_offset=0, end=0, end_offset=0, - type="none", deleted="", inserted="", shift=0, startAA="", endAA="", - term=0): - """ - Initialise the class with the appropriate values. - - @arg start: Start position. - @type start: int - @arg start_offset: - @type start_offset: int - @arg end: End position. - @type end: int - @arg end_offset: - @type end_offset: int - @arg type: Variant type. - @type type: str - @arg deleted: Deleted part of the reference sequence. - @type deleted: str - @arg inserted: Inserted part. - @type inserted: str - @arg shift: Amount of freedom. 
- @type shift: int - """ - # TODO: Will this container be used for all variants, or only genomic? - # start_offset and end_offset may be never used. - self.DNA = DNA - self.start = start - self.start_offset = start_offset - self.end = end - self.end_offset = end_offset - self.type = type - self.deleted = deleted - self.inserted = inserted - self.shift = shift - self.startAA = startAA - self.endAA = endAA - self.term = term - self.update() - #self.hgvs = self.description() - #self.hgvsLength = self.descriptionLength() - #__init__ - - def __DNADescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if not self.start: - return "=" - - descr = "%i" % self.start - - if self.end: - descr += "_%i" % self.end - - if self.type != "subst": - descr += "%s" % self.type - - if self.inserted: - return descr + "%s" % self.inserted - return descr - #if - - return descr + "%s>%s" % (self.deleted, self.inserted) - #__DNADescription - - def __proteinDescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if self.type == "unknown": - return "?" 
- if not self.start: - return "=" - - descr = "" - if not self.deleted: - if self.type == "ext": - descr += '*' - else: - descr += "%s" % seq3(self.startAA) - #if - else: - descr += "%s" % seq3(self.deleted) - descr += "%i" % self.start - if self.end: - descr += "_%s%i" % (seq3(self.endAA), self.end) - if self.type not in ["subst", "stop", "ext", "fs"]: # fs is not a type - descr += self.type - if self.inserted: - descr += "%s" % seq3(self.inserted) - - if self.type == "stop": - return descr + '*' - if self.term: - return descr + "fs*%i" % self.term - return descr - #__proteinDescription - - def __DNADescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # `=' or `?' - return 1 - - descrLen = 1 # Start position. - - if self.end: # '_' and end position. - descrLen += 2 - - if self.type != "subst": - descrLen += len(self.type) - - if self.inserted: - return descrLen + len(self.inserted) - return descrLen - #if - - return 4 # Start position, '>' and end position. - #__DNAdescriptionLength - - def __proteinDescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # = - return 1 - - descrLen = 1 # Start position. - if not self.deleted and self.type == "ext": - descrLen += 1 # * - else: - descrLen += 3 # One amino acid. 
- if self.end: - descrLen += 5 # `_' + one amino acid + end position. - if self.type not in ["subst", "stop", "ext", "fs"]: - descrLen += len(self.type) - if self.inserted: - descrLen += 3 * len(self.inserted) - if self.type == "stop": - return descrLen + 1 # * - if self.term: - return descrLen + len(self.type) + 2 # `*' + length until stop. - return descrLen - #__proteinDescriptionLength - - def update(self): - """ - """ - self.hgvs = self.description() - self.hgvsLength = self.descriptionLength() - #update - - def description(self): - """ - """ - if self.DNA: - return self.__DNADescription() - return self.__proteinDescription() - #description - - def descriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if self.DNA: - return self.__DNADescriptionLength() - return self.__proteinDescriptionLength() - #descriptionLength -#RawVar - -def alleleDescription(allele): - """ - Convert a list of raw variants to an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The HGVS description of {allele}. - @rval: str - """ - if len(allele) > 1: - return "[%s]" % ';'.join(map(lambda x: x.hgvs, allele)) - return allele[0].hgvs -#alleleDescription - -def alleleDescriptionLength(allele): - """ - Calculate the standardised length of an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The standardised length of the HGVS description of {allele}. - @rval: int - """ - # NOTE: Do we need to count the ; and [] ? - return sum(map(lambda x: x.hgvsLength, allele)) -#alleleDescriptionLength - -def printpos(s, start, end, fill=0): - """ - For debugging purposes. 
- """ - # TODO: See if this can partially replace or be merged with the - # visualisation in the __mutate() function of mutator.py - fs = 10 # Flank size. - - return "%s %s%s %s" % (s[start - fs:start], s[start:end], '-' * fill, - s[end:end + fs]) -#printpos - -def var2RawVar(s1, s2, var, DNA=True): - """ - """ - # Unknown. - if s1 == '?' or s2 == '?': - return [RawVar(DNA=DNA, type="unknown")] - - # Insertion / Duplication. - if var.reference_start == var.reference_end: - ins_length = var.sample_end - var.sample_start - shift5, shift3 = roll(s2, var.sample_start + 1, var.sample_end) - shift = shift5 + shift3 - - var.reference_start += shift3 - var.reference_end += shift3 - var.sample_start += shift3 - var.sample_end += shift3 - - if (var.sample_start - ins_length >= 0 and - s1[var.reference_start - ins_length:var.reference_start] == - s2[var.sample_start:var.sample_end]): - - if ins_length == 1: - return RawVar(DNA=DNA, start=var.reference_start, type="dup", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start - ins_length + 1, - end=var.reference_end, type="dup", shift=shift) - #if - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="ins", - shift=shift) - #if - - # Deletion. - if var.sample_start == var.sample_end: - shift5, shift3 = roll(s1, var.reference_start + 1, var.reference_end) - shift = shift5 + shift3 - - var.reference_start += shift3 + 1 - var.reference_end += shift3 - - if var.reference_start == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start, type="del", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_end, type="del", shift=shift) - #if - - # Substitution. 
- if (var.reference_start + 1 == var.reference_end and - var.sample_start + 1 == var.sample_end): - - return RawVar(DNA=DNA, start=var.reference_start + 1, - deleted=s1[var.reference_start], inserted=s2[var.sample_start], - type="subst") - #if - - # Simple InDel. - if var.reference_start + 1 == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="delins") - - # Inversion. - if var.type == extractor.VARIANT_REVERSE_COMPLEMENT: - trim = palinsnoop(s1[var.reference_start:var.reference_end]) - - if trim > 0: # Partial palindrome. - var.reference_end -= trim - var.sample_end -= trim - #if - - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, type="inv") - #if - - # InDel. - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, inserted=s2[var.sample_start:var.sample_end], - type="delins") -#var2RawVar - -def description(s1, s2, DNA=True): - """ - Give an allele description of the change from {s1} to {s2}. - - arg s1: Sequence 1. - type s1: str - arg s2: Sequence 2. - type s2: str - - @returns: A list of RawVar objects, representing the allele. 
- @rval: list(RawVar) - """ - description = [] - - if not DNA: - FS1, FS2 = makeFSTables(1) - longestFSf = max(findFS(s1, s2, FS1), findFS(s1, s2, FS2)) - longestFSr = max(findFS(s2, s1, FS1), findFS(s2, s1, FS2)) - - if longestFSf > longestFSr: - print s1[:longestFSf[1]], s1[longestFSf[1]:] - print s2[:len(s2) - longestFSf[0]], s2[len(s2) - longestFSf[0]:] - s1_part = s1[:longestFSf[1]] - s2_part = s2[:len(s2) - longestFSf[0]] - term = longestFSf[0] - #if - else: - print s1[:len(s1) - longestFSr[0]], s1[len(s1) - longestFSr[0]:] - print s2[:longestFSr[1]], s2[longestFSr[1]:] - s1_part = s1[:len(s1) - longestFSr[0]] - s2_part = s2[:longestFSr[1]] - term = len(s2) - longestFSr[1] - #else - - s1_part = s1 - s2_part = s2 - for variant in extractor.extract(str(s1_part), len(s1_part), - str(s2_part), len(s2_part), 1): - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - if description: - description[-1].term = term + 2 - description[-1].update() - #if - #if - else: - for variant in extractor.extract(str(s1), len(s1), str(s2), len(s2), - 0): - if variant.type != extractor.VARIANT_IDENTITY: - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - # Nothing happened. 
- if not description: - return [RawVar(DNA=DNA)] - - return description -#description - -if __name__ == "__main__": - a = "ATAGATGATAGATAGATAGAT" - b = "ATAGATGATTGATAGATAGAT" - print alleleDescription(description(a, b, DNA=True)) - - a = "MAVLWRLSAVCGALGGRALLLRTPVVRPAH" - b = "MAVLWRLSAGCGALGGRALLLRTPVVRAH" - print alleleDescription(description(a, b, DNA=False)) - - a = "MDYSLAAALTLHGHWGLGQVVTDYVHGDALQKAAKAGLLALSALTFAGLCYFNYHDVGICKAVAMLWKL" - b = "MDYSLAAALTFMVTGALDKLLLTMFMGMPCRKLPRQGFWHFQL" - #print alleleDescription(description(a, b, DNA=False)) - #print alleleDescription(description(b, a, DNA=False)) - print "1" - extractor.extract(a, len(a), b, len(b), 1) - print "2" - extractor.extract(b, len(b), a, len(a), 1) - print "3" - - - a = "VVSVLLLGLLPAAYLNPCSAMYYSLAAALTLHGHWGLGQV" - b = "VVSVLLLGLLPAAYLNPCSAMDYSLAAALTLHGHWGLGQV" - print alleleDescription(description(a, b, DNA=False)) - print alleleDescription(description(b, a, DNA=False)) - - a = "ACGCTCGATCGCTTATAGCATGGGGGGGGGATCTAGCTCTCTCTATAAGATA" - b = "ACGCTCGATCGCTTATACCCCCCCCATGCGATCTAGCTCTCTCTATAAGATA" - print alleleDescription(description(a, b, DNA=True)) - -#if diff --git a/mutalyzer/entrypoints/__init__.py b/mutalyzer/entrypoints/__init__.py index 36b5ad16a25f2e75f11765e052dd8099697ddb13..7d95d01efa1575d1a4896bbc15e3a4f2e972b4cf 100644 --- a/mutalyzer/entrypoints/__init__.py +++ b/mutalyzer/entrypoints/__init__.py @@ -3,6 +3,12 @@ Entry points to Mutalyzer. """ +from __future__ import unicode_literals + +import locale +import sys + + class _ReverseProxied(object): """ Wrap the application in this middleware and configure the front-end server @@ -36,3 +42,15 @@ class _ReverseProxied(object): if scheme: environ['wsgi.url_scheme'] = scheme return self.app(environ, *args, **kwargs) + + +def _cli_string(argument): + """ + Decode a command line argument byte string to unicode using our best + guess for the encoding (noop on unicode strings). 
+ """ + encoding = sys.stdin.encoding or locale.getpreferredencoding() + + if isinstance(argument, unicode): + return argument + return unicode(argument, encoding=encoding) diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py index 42929e6bb31c974149f11b12a7ba39680eb9c5ae..e7c74178ea0560a133ce39e4d04c8ee1e647d58d 100644 --- a/mutalyzer/entrypoints/admin.py +++ b/mutalyzer/entrypoints/admin.py @@ -3,16 +3,20 @@ Command line interface to Mutalyzer administrative tools. """ +from __future__ import unicode_literals + import argparse +import codecs import json +import locale import os import alembic.command import alembic.config from alembic.migration import MigrationContext -from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import NoResultFound +from . import _cli_string from .. import announce from .. import db from ..db import session @@ -26,10 +30,12 @@ class UserError(Exception): pass -def add_assembly(assembly_file): +def add_assembly(assembly_file, encoding): """ Add genome assembly definition from a JSON file. """ + assembly_file = codecs.getreader(encoding)(assembly_file) + try: definition = json.load(assembly_file) except ValueError: @@ -84,10 +90,13 @@ def list_assemblies(): assembly.taxonomy_id) -def import_mapview(assembly_name_or_alias, mapview_file, group_label): +def import_mapview(assembly_name_or_alias, mapview_file, encoding, + group_label): """ Import transcript mappings from an NCBI mapview file. 
""" + mapview_file = codecs.getreader(encoding)(mapview_file) + try: assembly = Assembly.by_name_or_alias(assembly_name_or_alias) except NoResultFound: @@ -96,7 +105,7 @@ def import_mapview(assembly_name_or_alias, mapview_file, group_label): try: mapping.import_from_mapview_file(assembly, mapview_file, group_label) except mapping.MapviewSortError as e: - raise UserError(str(e)) + raise UserError(unicode(e)) def import_gene(assembly_name_or_alias, gene): @@ -182,10 +191,13 @@ def main(): """ Command-line interface to Mutalyzer administrative tools. """ + default_encoding = locale.getpreferredencoding() + assembly_parser = argparse.ArgumentParser(add_help=False) assembly_parser.add_argument( - '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias', - default='hg19', help='assembly to import to (default: hg19)') + '-a', '--assembly', metavar='ASSEMBLY', type=_cli_string, + dest='assembly_name_or_alias', default='hg19', + help='assembly to import to (default: hg19)') parser = argparse.ArgumentParser( description='Mutalyzer administrative tools.') @@ -210,9 +222,13 @@ def main(): description=add_assembly.__doc__.split('\n\n')[0]) p.set_defaults(func=add_assembly) p.add_argument( - 'assembly_file', metavar='FILE', type=argparse.FileType('r'), + 'assembly_file', metavar='FILE', type=argparse.FileType('rb'), help='genome assembly definition JSON file (example: ' 'extras/assemblies/GRCh37.json)') + p.add_argument( + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) # Subparser 'assemblies import-mapview'. 
p = s.add_parser( @@ -224,10 +240,14 @@ def main(): '`sort -t $\'\\t\' -k 11,11 -k 2,2` command.') p.set_defaults(func=import_mapview) p.add_argument( - 'mapview_file', metavar='FILE', type=argparse.FileType('r'), + 'mapview_file', metavar='FILE', type=argparse.FileType('rb'), help='file from NCBI mapview (example: seq_gene.md), see note below') p.add_argument( - 'group_label', metavar='GROUP_LABEL', + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) + p.add_argument( + 'group_label', metavar='GROUP_LABEL', type=_cli_string, help='use only entries with this group label (example: ' 'GRCh37.p2-Primary Assembly)') @@ -241,7 +261,7 @@ def main(): ' (i.e., NCBI mapview).') p.set_defaults(func=import_gene) p.add_argument( - 'gene', metavar='GENE_SYMBOL', + 'gene', metavar='GENE_SYMBOL', type=_cli_string, help='gene to import all transcript mappings for from the UCSC ' 'database (example: TTN)') @@ -255,7 +275,7 @@ def main(): 'usual source (i.e., NCBI mapview).') p.set_defaults(func=import_reference) p.add_argument( - 'reference', metavar='ACCESSION', + 'reference', metavar='ACCESSION', type=_cli_string, help='genomic reference to import all genes from (example: ' 'NC_012920.1)') @@ -272,10 +292,10 @@ def main(): description=set_announcement.__doc__.split('\n\n')[0]) p.set_defaults(func=set_announcement) p.add_argument( - 'body', metavar='ANNOUNCEMENT', + 'body', metavar='ANNOUNCEMENT', type=_cli_string, help='announcement text to show to the user') p.add_argument( - '--url', metavar='URL', dest='url', + '--url', metavar='URL', dest='url', type=_cli_string, help='URL to more information on the announcement') # Subparser 'announcement unset'. 
@@ -290,10 +310,10 @@ def main(): description=sync_cache.__doc__.split('\n\n')[0], epilog='Intended use is to run daily from cron.') p.add_argument( - 'wsdl_url', metavar='WSDL_URL', + 'wsdl_url', metavar='WSDL_URL', type=_cli_string, help='location of the remote WSDL description') p.add_argument( - 'url_template', metavar='URL_TEMPLATE', + 'url_template', metavar='URL_TEMPLATE', type=_cli_string, help='URL for remote downloads, in which the filename is to be ' 'substituted for {file}') p.add_argument( @@ -313,7 +333,7 @@ def main(): '--destructive', dest='destructive', action='store_true', help='delete any existing tables and data') p.add_argument( - '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', + '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', type=_cli_string, dest='alembic_config_path', help='path to Alembic configuration file') p.set_defaults(func=setup_database) @@ -323,7 +343,7 @@ def main(): args.func(**{k: v for k, v in vars(args).items() if k not in ('func', 'subcommand')}) except UserError as e: - parser.error(str(e)) + parser.error(unicode(e)) if __name__ == '__main__': diff --git a/mutalyzer/entrypoints/batch_processor.py b/mutalyzer/entrypoints/batch_processor.py index 286c411609642515e8ff6e3308e759b4234b0b92..ae3c2945748db1a3b286690e8df52dedbe603c13 100644 --- a/mutalyzer/entrypoints/batch_processor.py +++ b/mutalyzer/entrypoints/batch_processor.py @@ -6,12 +6,13 @@ Mutalyzer batch processor. """ +from __future__ import unicode_literals + import argparse import signal import sys import time -from .. import config from .. import db from .. import Scheduler diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py index d123482fbe92d6ffa0f0277dcfd2847d877ecbab..6717161d1d4795c923f70cfa6846358ace2972c8 100644 --- a/mutalyzer/entrypoints/mutalyzer.py +++ b/mutalyzer/entrypoints/mutalyzer.py @@ -5,8 +5,12 @@ Mutalyzer command-line name checker. 
""" +from __future__ import unicode_literals + import argparse +import sys +from . import _cli_string from .. import describe from .. import output from .. import variantchecker @@ -114,7 +118,7 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer command-line name checker.') parser.add_argument( - 'description', metavar='DESCRIPTION', + 'description', metavar='DESCRIPTION', type=_cli_string, help='variant description to run the name checker on') args = parser.parse_args() diff --git a/mutalyzer/entrypoints/service_json.py b/mutalyzer/entrypoints/service_json.py index 25ff8bbfc1d7d3a01a49a375a6caba90b846aed3..5e5d93d01a2e7d48d3acbf4e0014ac2f4ddde60f 100644 --- a/mutalyzer/entrypoints/service_json.py +++ b/mutalyzer/entrypoints/service_json.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . 
import _cli_string, _ReverseProxied from ..config import settings from ..services import json @@ -57,9 +59,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer HTTP/RPC+JSON webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8082, help='port to listen on (default: 8082)') diff --git a/mutalyzer/entrypoints/service_soap.py b/mutalyzer/entrypoints/service_soap.py index 6b630ad6aa1bc885995e099bca91553260854b0d..8179faa358e7e109442cef799b1ffc2f8e4c0128 100644 --- a/mutalyzer/entrypoints/service_soap.py +++ b/mutalyzer/entrypoints/service_soap.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . 
import _cli_string, _ReverseProxied from ..config import settings from ..services import soap @@ -58,9 +60,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer SOAP webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8081, help='port to listen on (default: 8081)') diff --git a/mutalyzer/entrypoints/website.py b/mutalyzer/entrypoints/website.py index a62e3bb332322312191d4f8eff800d711608037b..f387b70ff4cb5f4a315ca4de9a87c9e0d0033b5a 100644 --- a/mutalyzer/entrypoints/website.py +++ b/mutalyzer/entrypoints/website.py @@ -39,9 +39,12 @@ also serve the static files. """ +from __future__ import unicode_literals + import argparse +import sys -from . import _ReverseProxied +from . import _cli_string, _ReverseProxied from ..config import settings from .. 
import website @@ -66,9 +69,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer website.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8089, help='port to listen on (default: 8080)') diff --git a/mutalyzer/grammar.py b/mutalyzer/grammar.py index 0e65ec574822f0182ee4d1eb0abe6accb548161d..8f231bf57cee26ed032a21dbfeb4cb3f7d83f1ce 100644 --- a/mutalyzer/grammar.py +++ b/mutalyzer/grammar.py @@ -19,6 +19,8 @@ The grammar is described in [3]. """ +from __future__ import unicode_literals + from pyparsing import * @@ -48,7 +50,7 @@ class Grammar(): ########################################################################## # BNF: Name -> ([a-z] | [a-Z] | [0-9])+ - Name = Word(alphanums, min=1) + Name = Word(unicode(alphanums), min=1) # BNF: Nt -> `a' | `c' | `g' | `u' | `A' | `C' | `G' | `T' | `U' #Nt = Word('acgtuACGTU', exact=1) @@ -66,7 +68,7 @@ class Grammar(): NtString = Combine(OneOrMore(Nt)) # BNF: Number -> [0-9]+ - Number = Word(nums) + Number = Word(unicode(nums)) ########################################################################## # Reference sequences @@ -79,7 +81,7 @@ class Grammar(): ProtIso = Suppress('_i') + Number('ProtIso') # BNF: GeneName -> ([a-Z] | [0-9] | `-')+ - GeneName = Word(alphanums + '-', min=1) + GeneName = Word(unicode(alphanums) + '-', min=1) # BNF: GeneSymbol -> `(' Name (TransVar | ProtIso)? `)' GeneSymbol = Suppress('(') + Group(GeneName('GeneSymbol') + \ @@ -94,11 +96,11 @@ class Grammar(): # BNF: AccNo -> ([a-Z] Number `_')+ Version? 
AccNo = NotAny('LRG_') + \ - Combine(Word(alphas + '_') + Number)('RefSeqAcc') + \ + Combine(Word(unicode(alphas) + '_') + Number)('RefSeqAcc') + \ Optional(Version) # BNF: UD -> `UD_' [a-Z]+ (`_' Number)+ - UD = Combine('UD_' + Word(alphas) + OneOrMore('_' + Number))('RefSeqAcc') + UD = Combine('UD_' + Word(unicode(alphas)) + OneOrMore('_' + Number))('RefSeqAcc') # BNF: LRGTranscriptID -> `t' [0-9]+ LRGTranscriptID = Suppress('t') + Number('LRGTranscriptID') @@ -467,7 +469,7 @@ class Grammar(): the input where the error occurred (and return None). @arg variant: The input string that needs to be parsed. - @type variant: string + @type variant: unicode @return: The parse tree containing the parse results, or None in case of a parsing error. @@ -480,12 +482,12 @@ class Grammar(): return self.Var.parseString(variant, parseAll=True) # Todo: check .dump() except ParseException as err: - print err.line - print " "*(err.column-1) + "^" - print err + #print err.line + #print " "*(err.column-1) + "^" + #print err # Log parse error and the position where it occurred. - self._output.addMessage(__file__, 4, 'EPARSE', str(err)) - pos = int(str(err).split(':')[-1][:-1]) - 1 + self._output.addMessage(__file__, 4, 'EPARSE', unicode(err)) + pos = int(unicode(err).split(':')[-1][:-1]) - 1 self._output.addOutput('parseError', variant) self._output.addOutput('parseError', pos * ' ' + '^') return None diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 693294d31b5a2a06319c24566d3e98259657882d..11e058997182252b01d75ea1b0586cb555347d18 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -10,11 +10,12 @@ update the database with this information. 
""" +from __future__ import unicode_literals + from collections import defaultdict from itertools import groupby from operator import attrgetter, itemgetter -from Bio.Seq import reverse_complement import MySQLdb from mutalyzer.db import session @@ -24,6 +25,7 @@ from mutalyzer.models import SoapMessage, Mapping, Transcript from mutalyzer.output import Output from mutalyzer import Crossmap from mutalyzer import Retriever +from mutalyzer import util class MapviewSortError(Exception): @@ -40,28 +42,29 @@ def _construct_change(var, reverse=False): @type reverse: bool @return: Description of mutation (without reference and positions). - @rtype: string + @rtype: unicode """ + # Note that the pyparsing parse tree yields `str('')` for nonexisting + # attributes, so we wrap the optional attributes in `unicode()`. if reverse: - # todo: if var.Arg1 is unicode, this crashes try: - arg1 = str(int(var.Arg1)) + arg1 = unicode(int(var.Arg1)) except ValueError: - arg1 = reverse_complement(str(var.Arg1) or '') + arg1 = util.reverse_complement(unicode(var.Arg1)) try: - arg2 = str(int(var.Arg2)) + arg2 = unicode(int(var.Arg2)) except ValueError: - arg2 = reverse_complement(str(var.Arg2) or '') + arg2 = util.reverse_complement(unicode(var.Arg2)) else: - arg1 = var.Arg1 - arg2 = var.Arg2 + arg1 = unicode(var.Arg1) + arg2 = unicode(var.Arg2) def parse_sequence(seq): if not seq.Sequence: raise NotImplementedError('Only explicit sequences are supported ' 'for insertions.') if reverse: - return reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if var.MutationType == 'subst': @@ -72,7 +75,7 @@ def _construct_change(var, reverse=False): seqs = reversed(var.SeqList) else: seqs = var.SeqList - insertion = '[' + ';'.join(str(parse_sequence(seq)) + insertion = '[' + ';'.join(parse_sequence(seq) for seq in seqs) + ']' else: insertion = parse_sequence(var.Seq) @@ -161,11 +164,11 @@ class Converter(object) : Get data from database. 
@arg acc: NM_ accession number (without version) - @type acc: string + @type acc: unicode @arg version: version number @type version: integer @kwarg selector: Optional gene symbol selector. - @type selector: str + @type selector: unicode @kwarg selector_version: Optional transcript version selector. @type selector_version: int """ @@ -269,7 +272,7 @@ class Converter(object) : @arg Loc: A location in either I{g.} or I{c.} notation @type Loc: object @arg Type: The reference type - @type Type: string + @type Type: unicode @returns: triple: 0. Main coordinate in I{c.} notation 1. Offset coordinate in I{c.} notation @@ -359,7 +362,7 @@ class Converter(object) : available. @arg accNo: transcript (NM_) accession number (with or without version) - @type accNo: string + @type accNo: unicode @return: transcription start, transcription end and CDS stop @rtype: triple @@ -381,7 +384,7 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: The full NM accession number (including version) - @type accNo: string + @type accNo: unicode @return: T ; ClassSerializer object with the types trans_start, trans_stop and CDS_stop @@ -404,9 +407,9 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: transcript (NM_) accession number (with version?) - @type accNo: string + @type accNo: unicode @arg mutation: the 'mutation' (e.g. c.123C>T) - @type mutation: string + @type mutation: unicode @return: ClassSerializer object @rtype: object @@ -493,10 +496,10 @@ class Converter(object) : Converts a complete HGVS I{c.} notation into a chromosomal notation. 
@arg variant: The variant in HGVS I{c.} notation - @type variant: string + @type variant: unicode @return: var_in_g ; The variant in HGVS I{g.} notation - @rtype: string + @rtype: unicode """ if self._parseInput(variant): acc = self.parseTree.RefSeqAcc @@ -528,7 +531,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 3, 'ENOTIMPLEMENTED', - str(e)) + unicode(e)) return None if self.mapping.orientation == 'forward': @@ -568,14 +571,14 @@ class Converter(object) : @arg positions: Positions in c. notation to convert. @type positions: list @arg reference: Transcript reference. - @type reference: string + @type reference: unicode @kwarg version: Transcript reference version. If omitted, '0' is assumed. - @type version: string + @type version: unicode @return: Chromosome name, orientation (+ or -), and converted positions. - @rtype: tuple(string, string, list) + @rtype: tuple(unicode, unicode, list) This only works for positions on transcript references in c. notation. """ @@ -617,10 +620,10 @@ class Converter(object) : def correctChrVariant(self, variant) : """ @arg variant: - @type variant: string + @type variant: unicode @return: variant ; - @rtype: string + @rtype: unicode """ #Pre split check @@ -651,12 +654,12 @@ class Converter(object) : def chrom2c(self, variant, rt, gene=None): """ @arg variant: a variant description - @type variant: string + @type variant: unicode @arg rt: the return type - @type rt: string + @type rt: unicode @kwarg gene: Optional gene name. If given, return variant descriptions on all transcripts for this gene. 
- @type gene: string + @type gene: unicode @return: HGVS_notatations ; @rtype: dictionary or list @@ -751,7 +754,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 4, - "ENOTIMPLEMENTEDERROR", str(e)) + "ENOTIMPLEMENTEDERROR", unicode(e)) return None startp = self.crossmap.tuple2string((cmap.startmain, cmap.startoffset)) @@ -786,6 +789,8 @@ class Converter(object) : #Converter +# Todo: This seems broken at the moment. +# Todo: Correct handling of string encodings. def import_from_ucsc_by_gene(assembly, gene): """ Import transcript mappings for a gene from the UCSC. diff --git a/mutalyzer/models.py b/mutalyzer/models.py index f7356dda60a68c840c455d28c65eaec14efd0a44..bc9bf5a0780a382af267b3973b17b017c6a8ff77 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -1,26 +1,16 @@ """ Collection of serilizable objects used by the SOAP web service. They extend -from the spyne ClassModel. +from the Spyne model classes. -Default attributes for the spyne ClassModel: -- nillable = True -- min_occurs = 0 -- max_occurs = 1 - -Additional attributes values for the spyne String model: -- min_len = 0 -- max_len = 'unbounded' -- pattern = None - -@todo: Use Mandatory.* models in the ClassModel extensions? -@todo: See if it improves client code if we use Array(_, nillable=False). @todo: Move all these models to the mutalyzer.services package and refactor all uses of them in other places. The SOAP_NAMESPACE variable can then also be moved there. 
""" -from spyne.model.primitive import String, Integer, Boolean, DateTime +from __future__ import unicode_literals + +from spyne.model.primitive import Integer, Boolean, DateTime, Unicode from spyne.model.binary import ByteArray from spyne.model.complex import ComplexModel, Array @@ -30,9 +20,9 @@ from mutalyzer import SOAP_NAMESPACE class Mandatory(object): """ This is spyne.model.primitive.Mandatory, but without min_length=1 for - the String model. + the Unicode model. """ - String = String(type_name='mandatory_string', min_occurs=1, nillable=False) + Unicode = Unicode(type_name='mandatory_unicode', min_occurs=1, nillable=False) Integer = Integer(type_name='mandatory_integer', min_occurs=1, nillable=False) Boolean = Boolean(type_name='mandatory_boolean', min_occurs=1, nillable=False) DateTime = DateTime(type_name='mandatory_date_time', min_occurs=1, nillable=False) @@ -46,8 +36,8 @@ class SoapMessage(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - errorcode = Mandatory.String - message = Mandatory.String + errorcode = Mandatory.Unicode + message = Mandatory.Unicode #SoapMessage @@ -63,7 +53,7 @@ class Mapping(ComplexModel): endoffset = Integer start_g = Integer end_g = Integer - mutationType = String + mutationType = Unicode errorcode = Integer messages = Array(SoapMessage) #Mapping @@ -87,8 +77,8 @@ class RawVariant(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - description = Mandatory.String - visualisation = Mandatory.String + description = Mandatory.Unicode + visualisation = Mandatory.Unicode #RawVariant @@ -103,14 +93,14 @@ class RawVar(ComplexModel): start_offset = Mandatory.Integer end = Mandatory.Integer end_offset = Mandatory.Integer - type = Mandatory.String - deleted = Mandatory.String - inserted = Mandatory.String + type = Mandatory.Unicode + deleted = Mandatory.Unicode + inserted = Mandatory.Unicode shift = Mandatory.Integer - startAA = Mandatory.String - endAA = Mandatory.String + startAA = Mandatory.Unicode + endAA = Mandatory.Unicode term 
= Mandatory.Integer - hgvs = Mandatory.String + hgvs = Mandatory.Unicode hgvsLength = Mandatory.Integer #RawVar @@ -121,7 +111,7 @@ class Allele(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - description = Mandatory.String + description = Mandatory.Unicode allele = Array(RawVar) #Allele @@ -132,10 +122,10 @@ class ExonInfo(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - cStart = Mandatory.String + cStart = Mandatory.Unicode gStart = Mandatory.Integer chromStart = Integer - cStop = Mandatory.String + cStop = Mandatory.Unicode gStop = Mandatory.Integer chromStop = Integer #ExonInfo @@ -147,34 +137,34 @@ class MutalyzerOutput(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - referenceId = Mandatory.String - sourceId = Mandatory.String - sourceAccession = String - sourceVersion = String - sourceGi = String - molecule = Mandatory.String + referenceId = Mandatory.Unicode + sourceId = Mandatory.Unicode + sourceAccession = Unicode + sourceVersion = Unicode + sourceGi = Unicode + molecule = Mandatory.Unicode - original = String - mutated = String + original = Unicode + mutated = Unicode - origMRNA = String - mutatedMRNA= String + origMRNA = Unicode + mutatedMRNA= Unicode - origCDS = String - newCDS= String + origCDS = Unicode + newCDS= Unicode - origProtein = String - newProtein = String - altProtein = String + origProtein = Unicode + newProtein = Unicode + altProtein = Unicode errors = Integer warnings = Integer - summary = String + summary = Unicode - chromDescription = String - genomicDescription = String - transcriptDescriptions = Array(String) - proteinDescriptions = Array(String) + chromDescription = Unicode + genomicDescription = Unicode + transcriptDescriptions = Array(Unicode) + proteinDescriptions = Array(Unicode) exons = Array(ExonInfo) @@ -190,8 +180,8 @@ class TranscriptNameInfo(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - transcriptName = Mandatory.String - productName = Mandatory.String + transcriptName = Mandatory.Unicode + productName = 
Mandatory.Unicode #TranscriptNameInfo @@ -201,9 +191,9 @@ class ProteinTranscript(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - name = Mandatory.String - id = Mandatory.String - product = Mandatory.String + name = Mandatory.Unicode + id = Mandatory.Unicode + product = Mandatory.Unicode #ProteinTranscript @@ -219,27 +209,27 @@ class TranscriptInfo(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - name = Mandatory.String - id = Mandatory.String - product = Mandatory.String + name = Mandatory.Unicode + id = Mandatory.Unicode + product = Mandatory.Unicode - cTransStart = Mandatory.String + cTransStart = Mandatory.Unicode gTransStart = Mandatory.Integer chromTransStart = Integer - cTransEnd = Mandatory.String + cTransEnd = Mandatory.Unicode gTransEnd = Mandatory.Integer chromTransEnd = Integer sortableTransEnd = Mandatory.Integer - cCDSStart = Mandatory.String + cCDSStart = Mandatory.Unicode gCDSStart = Mandatory.Integer chromCDSStart = Integer - cCDSStop = Mandatory.String + cCDSStop = Mandatory.Unicode gCDSStop = Mandatory.Integer chromCDSStop = Integer - locusTag = Mandatory.String - linkMethod = Mandatory.String + locusTag = Mandatory.Unicode + linkMethod = Mandatory.Unicode exons = Array(ExonInfo) @@ -253,10 +243,10 @@ class TranscriptMappingInfo(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - name = Mandatory.String + name = Mandatory.Unicode version = Mandatory.Integer - gene = Mandatory.String - orientation = Mandatory.String + gene = Mandatory.Unicode + orientation = Mandatory.Unicode start = Mandatory.Integer stop = Mandatory.Integer @@ -283,15 +273,15 @@ class InfoOutput(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - version = String - versionParts = Array(String) - releaseDate = String - nomenclatureVersion = String - nomenclatureVersionParts = Array(String) - serverName = String - contactEmail = String - announcement = String - announcementUrl = String + version = Unicode + versionParts = Array(Unicode) + releaseDate = Unicode + 
nomenclatureVersion = Unicode + nomenclatureVersionParts = Array(Unicode) + serverName = Unicode + contactEmail = Unicode + announcement = Unicode + announcementUrl = Unicode #InfoOutput @@ -301,14 +291,14 @@ class CacheEntry(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - name = Mandatory.String - gi = String - hash = Mandatory.String - chromosomeName = String + name = Mandatory.Unicode + gi = Unicode + hash = Mandatory.Unicode + chromosomeName = Unicode chromosomeStart = Integer chromosomeStop = Integer chromosomeOrientation = Integer - url = String + url = Unicode created = Mandatory.DateTime - cached = String + cached = Unicode #CacheEntry diff --git a/mutalyzer/mutator.py b/mutalyzer/mutator.py index 8047d932d4bab1ca4fa66b2020e2d69428d97853..4a4b0a2d157460e9ebc5cebbde89f0111090492f 100644 --- a/mutalyzer/mutator.py +++ b/mutalyzer/mutator.py @@ -12,12 +12,11 @@ The original as well as the mutated string are stored here. """ +from __future__ import unicode_literals + from collections import defaultdict from Bio import Restriction -from Bio.Seq import Seq -from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA -from Bio.Seq import reverse_complement from mutalyzer import util @@ -46,7 +45,7 @@ class Mutator(): Initialise the instance with the original sequence. @arg orig: The original sequence before mutation. - @type orig: str + @type orig: Bio.Seq.Seq @arg output: The output object. @type output: mutalyzer.Output.Output """ @@ -57,6 +56,8 @@ class Mutator(): self._output = output self.orig = orig + # Note that we don't need to create a copy here, since mutation + # operations are not in place (`self._mutate`). 
self.mutated = orig #__init__ @@ -72,7 +73,7 @@ class Mutator(): @rtype: dict """ analysis = Restriction.Analysis(self._restriction_batch, sequence) - return dict((str(k), len(v)) for k, v in analysis.with_sites().items()) + return dict((unicode(k), len(v)) for k, v in analysis.with_sites().items()) #_restriction_count def _counts_diff(self, counts1, counts2): @@ -109,10 +110,10 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode @return: Visualisation. - @rtype: str + @rtype: unicode """ loflank = self.orig[max(pos1 - VIS_FLANK_LENGTH, 0):pos1] roflank = self.orig[pos2:pos2 + VIS_FLANK_LENGTH] @@ -338,7 +339,7 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ correct = 1 if pos1 == pos2 else 0 self.mutated = (self.mutated[:self.shift(pos1 + 1) - 1] + @@ -375,7 +376,7 @@ class Mutator(): @arg pos: Interbase position where the insertion should take place. @type pos: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['insertion between %i and %i' % (pos, pos + 1)] visualisation.extend(self._visualise(pos, pos, ins)) @@ -394,7 +395,7 @@ class Mutator(): @arg pos2: Last nucleotide of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['delins from %i to %i' % (pos1, pos2)] visualisation.extend(self._visualise(pos1 - 1, pos2, ins)) @@ -410,7 +411,7 @@ class Mutator(): @arg pos: Position of the substitution. @type pos: int @arg nuc: Substituted nucleotide. - @type nuc: str + @type nuc: unicode """ visualisation = ['substitution at %i' % pos] visualisation.extend(self._visualise(pos - 1, pos, nuc)) @@ -428,14 +429,13 @@ class Mutator(): @arg pos2: Last nucleotide of the inverted sequence. 
@type pos2: int """ + sequence = util.reverse_complement(unicode(self.orig[pos1 - 1:pos2])) + visualisation = ['inversion between %i and %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2]))) + visualisation.extend(self._visualise(pos1 - 1, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2])) + self._mutate(pos1 - 1, pos2, sequence) #inversion def duplication(self, pos1, pos2): @@ -447,11 +447,12 @@ class Mutator(): @arg pos2: Last nucleotide of the duplicated sequence. @type pos2: int """ + sequence = unicode(self.orig[pos1 - 1:pos2]) + visualisation = ['duplication from %i to %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos2, pos2, self.orig[pos1 - 1:pos2])) + visualisation.extend(self._visualise(pos2, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos1 - 1, self.orig[pos1 - 1:pos2]) + self._mutate(pos1 - 1, pos1 - 1, sequence) #duplication #Mutator diff --git a/mutalyzer/output.py b/mutalyzer/output.py index 3ca1c8a71d8a998463262074fd0fae17f8a5c84c..fbec8418274798c367ba9153a438096284f974ae 100644 --- a/mutalyzer/output.py +++ b/mutalyzer/output.py @@ -23,6 +23,9 @@ Public classes: """ +from __future__ import unicode_literals + +import io import time from mutalyzer import util @@ -71,12 +74,13 @@ class Output() : - _warnings ; Initialised to 0. @arg instance: The filename of the module that created this object - @type instance: string + @type instance: unicode """ self._outputData = {} self._messages = [] self._instance = util.nice_filename(instance) - self._loghandle = open(settings.LOG_FILE, "a+") + self._loghandle = io.open(settings.LOG_FILE, mode='a+', + encoding='utf-8') self._errors = 0 self._warnings = 0 #__init__ @@ -147,7 +151,7 @@ class Output() : - _messages ; The messages list. 
@arg errorcode: The error code to filter on - @type errorcode: string + @type errorcode: unicode @return: A filtered list @rtype: list @@ -194,7 +198,7 @@ class Output() : - _outputData ; The output dictionary. @arg name: Name of a node in the output dictionary - @type name: string + @type name: unicode @arg data: The data to be stored at this node @type data: object """ @@ -258,7 +262,7 @@ class Output() : - Number of errors - Number of warnings - Summary - @rtype: integer, integer, string + @rtype: integer, integer, unicode """ e_s = 's' w_s = 's' @@ -297,13 +301,13 @@ class Message() : - description ; A description of the message. @arg origin: Name of the module creating this object - @type origin: string + @type origin: unicode @arg level: Importance of the message @type level: integer @arg code: The error code of the message - @type code: string + @type code: unicode @arg description: A description of the message - @type description: string + @type description: unicode """ self.origin = origin self.level = level @@ -316,17 +320,17 @@ class Message() : (self.origin, self.level, self.code, self.description) #__repr__ - def __str__(self): + def __unicode__(self): return '%s (%s): %s' % \ (self.named_level(), self.origin, self.description) - #__str__ + #__unicode__ def named_level(self): """ Get message log level as readable string. @return: A readable description of the log level. - @rtype: string + @rtype: unicode """ if self.level == 0: return "Debug" diff --git a/mutalyzer/parsers/__init__.py b/mutalyzer/parsers/__init__.py index 3e1bd90dd08aa288d05a8c342e2bbae9218a730c..6b3f43347bc55d1518e6aaeb0279b5fa3bac9871 100644 --- a/mutalyzer/parsers/__init__.py +++ b/mutalyzer/parsers/__init__.py @@ -1,3 +1,6 @@ """ Parsers for GenRecord objects. 
""" + + +from __future__ import unicode_literals diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 867fa78f7b2d838d9076fc460eb5fd02282aee58..247545989e105702211e0c796a88b256edd40f3d 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -4,6 +4,9 @@ mutalyzer GenRecord. Record populated with data from a GenBank file. """ +from __future__ import unicode_literals + +import codecs import re import bz2 from itertools import izip_longest @@ -41,7 +44,7 @@ class tempGene(): - cdsList ; CDS list (including internal splice sites). @arg name: Gene name - @type name: string + @type name: unicode """ self.name = name @@ -75,8 +78,8 @@ class GBparser(): ret = [] - if not str(location.start).isdigit() or \ - not str(location.end).isdigit() : + if not unicode(location.start).isdigit() or \ + not unicode(location.end).isdigit() : return None #if @@ -99,8 +102,8 @@ class GBparser(): ret = [] - if not str(locationList.location.start).isdigit() or \ - not str(locationList.location.end).isdigit() : + if not unicode(locationList.location.start).isdigit() or \ + not unicode(locationList.location.end).isdigit() : return None #if @@ -128,10 +131,10 @@ class GBparser(): @arg transcriptAcc: Accession number of the transcript for which we want to find the protein - @type transcriptAcc: string + @type transcriptAcc: unicode @return: Accession number of a protein or None if nothing can be found - @rtype: string + @rtype: unicode """ link = queries.get_transcript_protein_link(transcriptAcc) if link is not None: @@ -146,7 +149,7 @@ class GBparser(): finally: handle.close() - transcriptGI = result["IdList"][0] + transcriptGI = unicode(result["IdList"][0]) handle = Entrez.elink(dbfrom = "nucleotide", db = "protein", id = transcriptGI) @@ -162,11 +165,11 @@ class GBparser(): queries.update_transcript_protein_link(transcriptAcc) return None - proteinGI = result[0]["LinkSetDb"][0]["Link"][0]["Id"] + proteinGI = 
unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"]) handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text') - proteinAcc = handle.read().split('.')[0] + proteinAcc = unicode(handle.read()).split('.')[0] handle.close() queries.update_transcript_protein_link(transcriptAcc, proteinAcc) @@ -179,7 +182,7 @@ class GBparser(): sentence from another. The index of the last word is counted backwards. @arg sentences: A list of sentences. - @type sentences: list of strings + @type sentences: list of unicode strings @return: The indices of the words where sentences start to differ, both are -1 when no mismatches are found. @@ -217,7 +220,7 @@ class GBparser(): [-1:1] yields the empty list. """ # Create lists of words - lists = map(str.split, sentences) + lists = [s.split() for s in sentences] try: forward, reverse = [next(i for i, v in @@ -239,7 +242,7 @@ class GBparser(): @arg locus: The locus object on which the transfer should be performed @type locus: locus object @arg key: The name of the variable that should be transferred - @type key: string + @type key: unicode """ if locus.qualifiers.has_key(key) : @@ -315,7 +318,7 @@ class GBparser(): @arg locusList: A list of loci @type locusList: list @arg tagName: Name of the tag to be checked - @type tagName: string + @type tagName: unicode """ tags = [] @@ -476,13 +479,14 @@ class GBparser(): Create a GenRecord.Record from a GenBank file @arg filename: The full path to the compressed GenBank file - @type filename: string + @type filename: unicode @return: A GenRecord.Record instance @rtype: object (record) """ # first create an intermediate genbank record with BioPython file_handle = bz2.BZ2File(filename, "r") + file_handle = codecs.getreader('utf-8')(file_handle) biorecord = SeqIO.read(file_handle, "genbank") file_handle.close() diff --git a/mutalyzer/parsers/lrg.py b/mutalyzer/parsers/lrg.py index d3624360291b5035fc3f5e6a323de4a59a08bfdc..b22b7ce69f3e7bb1b58e0e72783432fe4d11c4a3 100644 --- 
a/mutalyzer/parsers/lrg.py +++ b/mutalyzer/parsers/lrg.py @@ -21,6 +21,8 @@ added in python2.5. Its main strengths are speed and readability [pythonesque]. """ +from __future__ import unicode_literals + import xml.dom.minidom from Bio.Seq import Seq from Bio.Alphabet import IUPAC @@ -54,14 +56,14 @@ def _get_content(data, refname): @arg data: a minidom object @type data: object @arg refname: the name of a member of the minidom object - @type refname: string + @type refname: unicode - @return: The UTF-8 content of the textnode or an emtpy string + @return: The content of the textnode or an emtpy string @rtype: string """ temp = data.getElementsByTagName(refname) if temp: - return temp[0].lastChild.data.encode("utf8") + return temp[0].lastChild.data else: return "" #_get_content @@ -75,14 +77,14 @@ def _attr2dict(attr): @type attr: object @return: A dictionary with pairing of node-attribute names and values. - Integer string values are converted to integers. String values are converted - to UTF-8 + Integer string values are converted to integers. @rtype: dictionary """ ret = {} for key, value in attr.items(): - value = value.isdigit() and int(value) or value.encode("utf-8") - ret[key.encode("utf-8")] = value + if value.isdigit(): + value = int(value) + ret[key] = value return ret #_attr2dict @@ -110,7 +112,7 @@ def create_record(data): Create a GenRecord.Record of a LRG <xml> formatted string. @arg data: Content of LRG file - @type data: string + @type data: byte string @return: GenRecord.Record instance @rtype: object @@ -166,7 +168,7 @@ def create_record(data): for tData in fixed.getElementsByTagName("transcript"): # iterate over the transcripts in the fixed section. 
# get the transcript from the updatable section and combine results - transcriptName = tData.getAttribute("name").encode("utf8")[1:] + transcriptName = tData.getAttribute("name")[1:] transcription = [t for t in gene.transcriptList if t.name == transcriptName][0] #TODO?: swap with gene.findLocus diff --git a/mutalyzer/redisclient.py b/mutalyzer/redisclient.py index ec9e6050548a85d04dced7489fbd8de195a5c6fc..58acd7cacdda8818dcf902150f361ce8d7342313 100644 --- a/mutalyzer/redisclient.py +++ b/mutalyzer/redisclient.py @@ -18,6 +18,8 @@ simple and just use one global connection pool as created by `StrictRedis`. """ +from __future__ import unicode_literals + import redis from mutalyzer.config import settings @@ -37,7 +39,9 @@ class LazyClient(util.LazyObject): import mockredis self._wrapped = mockredis.MockRedis(strict=True) else: - self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI) + self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI, + decode_responses=True, + charset='utf-8') #: Global :class:`LazyClient` instance. Use this for all communication with diff --git a/mutalyzer/services/__init__.py b/mutalyzer/services/__init__.py index 05b3d031865b91b2a3ebd2ead081592a52a119e2..81887d7c05baaf74a3ef836f34fdb9dbe9c25336 100644 --- a/mutalyzer/services/__init__.py +++ b/mutalyzer/services/__init__.py @@ -1,3 +1,6 @@ """ Services (RPC) for Mutalyzer. """ + + +from __future__ import unicode_literals diff --git a/mutalyzer/services/json.py b/mutalyzer/services/json.py index c35b79293c1a790209185a9efa37772155acf07e..89c6a26e11cca3e0f2c64f2c96621cbaeffb236b 100644 --- a/mutalyzer/services/json.py +++ b/mutalyzer/services/json.py @@ -3,6 +3,8 @@ Mutalyzer web service HTTP/RPC with JSON response payloads. 
""" +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.http import HttpRpc from spyne.protocol.json import JsonDocument diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index d681b94c27c3e2398db5884bad99ad3532c2f346..c65053587d4e3b7639df78bf6152100cbf39114a 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -9,23 +9,24 @@ Mutalyzer RPC services. """ +from __future__ import unicode_literals + from spyne.decorator import srpc from spyne.service import ServiceBase -from spyne.model.primitive import String, Integer, Boolean, DateTime +from spyne.model.primitive import Integer, Boolean, DateTime, Unicode from spyne.model.complex import Array from spyne.model.fault import Fault +import io import os import socket -from cStringIO import StringIO -import tempfile -from operator import itemgetter, attrgetter +from operator import attrgetter from sqlalchemy.orm.exc import NoResultFound import mutalyzer from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import (Assembly, Chromosome, BatchJob, - BatchQueueItem, TranscriptMapping) +from mutalyzer.db.models import (Assembly, BatchJob, BatchQueueItem, + TranscriptMapping) from mutalyzer.output import Output from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync @@ -51,7 +52,7 @@ class MutalyzerService(ServiceBase): super(MutalyzerService, self).__init__(environ) #__init__ - @srpc(Mandatory.ByteArray, String, String, _returns=String) + @srpc(Mandatory.ByteArray, Unicode, Unicode, _returns=Unicode) def submitBatchJob(data, process='NameChecker', argument=''): """ Submit a batch job. @@ -90,6 +91,12 @@ class MutalyzerService(ServiceBase): 'The process argument must be one of %s.' % ', '.join(batch_types)) + # The Python type for `data` should be a sequence of `str` objects, + # but it seems we sometimes just get one `str` object. 
Perhaps only in + # the unit tests, but let's fix that anyway. + if isinstance(data, str): + data = [data] + # Note that the max file size check below might be bogus, since Spyne # first checks the total request size, which by default has a maximum # of 2 megabytes. @@ -103,7 +110,9 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - batch_file = StringIO(''.join(data)) + batch_file = io.BytesIO() + for d in data: + batch_file.write(d) job, columns = file_instance.parseBatchFile(batch_file) batch_file.close() @@ -115,7 +124,7 @@ class MutalyzerService(ServiceBase): batch_types[process], argument) return result_id - @srpc(Mandatory.String, _returns=Integer) + @srpc(Mandatory.Unicode, _returns=Integer) def monitorBatchJob(job_id): """ Get the number of entries left for a batch job. @@ -129,7 +138,7 @@ class MutalyzerService(ServiceBase): """ return BatchQueueItem.query.join(BatchJob).filter_by(result_id=job_id).count() - @srpc(Mandatory.String, _returns=ByteArray) + @srpc(Mandatory.Unicode, _returns=ByteArray) def getBatchJob(job_id): """ Get the result of a batch job. @@ -144,7 +153,7 @@ class MutalyzerService(ServiceBase): @arg job_id: Batch job identifier. - @return: Batch job result file. + @return: Batch job result file (UTF-8, base64 encoded). 
""" left = BatchQueueItem.query.join(BatchJob).filter_by(result_id=job_id).count() @@ -152,11 +161,11 @@ class MutalyzerService(ServiceBase): raise Fault('EBATCHNOTREADY', 'Batch job result is not yet ready.') filename = 'batch-job-%s.txt' % job_id - handle = open(os.path.join(settings.CACHE_DIR, filename)) + handle = open(os.path.join(settings.CACHE_DIR, filename), 'rb') return handle - @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, Boolean, - _returns=Array(Mandatory.String)) + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, Boolean, + _returns=Array(Mandatory.Unicode)) def getTranscripts(build, chrom, pos, versions=False) : """ Get all the transcripts that overlap with a chromosomal position. @@ -215,7 +224,7 @@ class MutalyzerService(ServiceBase): return [m.accession for m in mappings] #getTranscripts - @srpc(Mandatory.String, Mandatory.String, _returns=Array(Mandatory.String)) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Array(Mandatory.Unicode)) def getTranscriptsByGeneName(build, name): """ Todo: documentation. @@ -243,8 +252,8 @@ class MutalyzerService(ServiceBase): return ['%s.%s' % (m.accession, m.version) for m in mappings] #getTranscriptsByGene - @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, - Mandatory.Integer, Mandatory.Integer, _returns=Array(Mandatory.String)) + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, + Mandatory.Integer, Mandatory.Integer, _returns=Array(Mandatory.Unicode)) def getTranscriptsRange(build, chrom, pos1, pos2, method) : """ Get all the transcripts that overlap with a range on a chromosome. 
@@ -302,7 +311,7 @@ class MutalyzerService(ServiceBase): return [m.accession for m in mappings] #getTranscriptsRange - @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, Mandatory.Integer, Mandatory.Integer, _returns=Array(TranscriptMappingInfo)) def getTranscriptsMapping(build, chrom, pos1, pos2, method): @@ -387,7 +396,7 @@ class MutalyzerService(ServiceBase): return transcripts #getTranscriptsMapping - @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Mandatory.Unicode) def getGeneName(build, accno) : """ Find the gene name associated with a transcript. @@ -424,8 +433,8 @@ class MutalyzerService(ServiceBase): return mapping.gene #getGeneName - @srpc(Mandatory.String, Mandatory.String, Mandatory.String, - Mandatory.String, _returns=Mapping) + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Unicode, + Mandatory.Unicode, _returns=Mapping) def mappingInfo(LOVD_ver, build, accNo, variant) : """ Search for an NM number in the MySQL database, if the version @@ -492,7 +501,7 @@ class MutalyzerService(ServiceBase): return result #mappingInfo - @srpc(Mandatory.String, Mandatory.String, Mandatory.String, + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Unicode, _returns=Transcript) def transcriptInfo(LOVD_ver, build, accNo) : """ @@ -536,7 +545,7 @@ class MutalyzerService(ServiceBase): return T #transcriptInfo - @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Mandatory.Unicode) def chromAccession(build, name) : """ Get the accession number of a chromosome, given a name. 
@@ -574,7 +583,7 @@ class MutalyzerService(ServiceBase): return chromosome.accession #chromAccession - @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Mandatory.Unicode) def chromosomeName(build, accNo) : """ Get the name of a chromosome, given a chromosome accession number. @@ -612,7 +621,7 @@ class MutalyzerService(ServiceBase): return chromosome.name #chromosomeName - @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Mandatory.Unicode) def getchromName(build, acc) : """ Get the chromosome name, given a transcript identifier (NM number). @@ -649,8 +658,8 @@ class MutalyzerService(ServiceBase): return mapping.chromosome.name #chromosomeName - @srpc(Mandatory.String, Mandatory.String, String, - _returns=Array(Mandatory.String)) + @srpc(Mandatory.Unicode, Mandatory.Unicode, Unicode, + _returns=Array(Mandatory.Unicode)) def numberConversion(build, variant, gene=None): """ Converts I{c.} to I{g.} notation or vice versa @@ -696,7 +705,7 @@ class MutalyzerService(ServiceBase): return result #numberConversion - @srpc(Mandatory.String, _returns=CheckSyntaxOutput) + @srpc(Mandatory.Unicode, _returns=CheckSyntaxOutput) def checkSyntax(variant): """ Checks the syntax of a variant. @@ -739,7 +748,7 @@ class MutalyzerService(ServiceBase): return result #checkSyntax - @srpc(Mandatory.String, _returns=MutalyzerOutput) + @srpc(Mandatory.Unicode, _returns=MutalyzerOutput) def runMutalyzer(variant) : """ Run the Mutalyzer name checker. @@ -804,23 +813,18 @@ class MutalyzerService(ServiceBase): result.sourceGi = O.getIndexedOutput('source_gi', 0) result.molecule = O.getIndexedOutput('molecule', 0) - # We force the results to strings here, because some results - # may be of type Bio.Seq.Seq which spyne doesn't like. - # - # todo: We might have to also do this elsewhere. 
+ result.original = O.getIndexedOutput("original", 0) + result.mutated = O.getIndexedOutput("mutated", 0) - result.original = str(O.getIndexedOutput("original", 0)) - result.mutated = str(O.getIndexedOutput("mutated", 0)) + result.origMRNA = O.getIndexedOutput("origMRNA", 0) + result.mutatedMRNA = O.getIndexedOutput("mutatedMRNA", 0) - result.origMRNA = str(O.getIndexedOutput("origMRNA", 0)) - result.mutatedMRNA = str(O.getIndexedOutput("mutatedMRNA", 0)) + result.origCDS = O.getIndexedOutput("origCDS", 0) + result.newCDS = O.getIndexedOutput("newCDS", 0) - result.origCDS = str(O.getIndexedOutput("origCDS", 0)) - result.newCDS = str(O.getIndexedOutput("newCDS", 0)) - - result.origProtein = str(O.getIndexedOutput("oldprotein", 0)) - result.newProtein = str(O.getIndexedOutput("newprotein", 0)) - result.altProtein = str(O.getIndexedOutput("altProtein", 0)) + result.origProtein = O.getIndexedOutput("oldprotein", 0) + result.newProtein = O.getIndexedOutput("newprotein", 0) + result.altProtein = O.getIndexedOutput("altProtein", 0) result.chromDescription = \ O.getIndexedOutput("genomicChromDescription", 0) @@ -860,7 +864,7 @@ class MutalyzerService(ServiceBase): return result #runMutalyzer - @srpc(Mandatory.String, Mandatory.String, _returns=TranscriptNameInfo) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=TranscriptNameInfo) def getGeneAndTranscript(genomicReference, transcriptReference) : """ Todo: documentation. 
@@ -892,7 +896,7 @@ class MutalyzerService(ServiceBase): return ret #getGeneAndTranscript - @srpc(Mandatory.String, String, _returns=Array(TranscriptInfo)) + @srpc(Mandatory.Unicode, Unicode, _returns=Array(TranscriptInfo)) def getTranscriptsAndInfo(genomicReference, geneName=None): """ Given a genomic reference, return all its transcripts with their @@ -995,7 +999,7 @@ class MutalyzerService(ServiceBase): transcript.CM.info() cds_start = 1 - t.cTransEnd = str(t.exons[-1].cStop) + t.cTransEnd = unicode(t.exons[-1].cStop) t.gTransEnd = t.exons[-1].gStop t.chromTransEnd = GenRecordInstance.record.toChromPos( t.gTransEnd) @@ -1009,15 +1013,15 @@ class MutalyzerService(ServiceBase): t.name = '%s_v%s' % (gene.name, transcript.name) t.id = transcript.transcriptID t.product = transcript.transcriptProduct - t.cTransStart = str(trans_start) + t.cTransStart = unicode(trans_start) t.gTransStart = transcript.CM.x2g(trans_start, 0) t.chromTransStart = GenRecordInstance.record.toChromPos( t.gTransStart) - t.cCDSStart = str(cds_start) + t.cCDSStart = unicode(cds_start) t.gCDSStart = transcript.CM.x2g(cds_start, 0) t.chromCDSStart = GenRecordInstance.record.toChromPos( t.gCDSStart) - t.cCDSStop = str(cds_stop) + t.cCDSStop = unicode(cds_stop) t.gCDSStop = transcript.CM.x2g(cds_stop, 0) t.chromCDSStop = GenRecordInstance.record.toChromPos(t.gCDSStop) t.locusTag = transcript.locusTag @@ -1040,12 +1044,12 @@ class MutalyzerService(ServiceBase): return transcripts #getTranscriptsAndInfo - @srpc(Mandatory.ByteArray, _returns=Mandatory.String) + @srpc(Mandatory.ByteArray, _returns=Mandatory.Unicode) def uploadGenBankLocalFile(data): """ Upload a genbank file. - @arg data: Genbank file (base64 encoded). + @arg data: Genbank file (UTF-8, base64 encoded). @return: UD accession number for the uploaded genbank file. 
""" output = Output(__file__) @@ -1054,6 +1058,12 @@ class MutalyzerService(ServiceBase): output.addMessage(__file__, -1, 'INFO', 'Received request uploadGenBankLocalFile()') + # The Python type for `data` should be a sequence of `str` objects, + # but it seems we sometimes just get one `str` object. Perhaps only in + # the unit tests, but let's fix that anyway. + if isinstance(data, str): + data = [data] + # Note that the max file size check below might be bogus, since Spyne # first checks the total request size, which by default has a maximum # of 2 megabytes. @@ -1067,7 +1077,7 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - ud = retriever.uploadrecord(''.join(data)) + ud = retriever.uploadrecord(b''.join(data)) output.addMessage(__file__, -1, 'INFO', 'Finished processing uploadGenBankLocalFile()') @@ -1075,13 +1085,13 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). if not ud: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), output.getMessages())) + + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return ud #upLoadGenBankLocalFile - @srpc(Mandatory.String, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, _returns=Mandatory.Unicode) def uploadGenBankRemoteFile(url) : """ Not implemented yet. @@ -1089,8 +1099,8 @@ class MutalyzerService(ServiceBase): raise Fault('ENOTIMPLEMENTED', 'Not implemented yet') #upLoadGenBankRemoteFile - @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, - Mandatory.Integer, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, + Mandatory.Integer, _returns=Mandatory.Unicode) def sliceChromosomeByGene(geneSymbol, organism, upStream, downStream) : """ @@ -1112,14 +1122,14 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). 
if not UD: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), O.getMessages())) + + '\n'.join(map(lambda m: unicode(m), O.getMessages())) raise Exception(error) return UD #sliceChromosomeByGene - @srpc(Mandatory.String, Mandatory.Integer, Mandatory.Integer, - Mandatory.Integer, _returns=Mandatory.String) + @srpc(Mandatory.Unicode, Mandatory.Integer, Mandatory.Integer, + Mandatory.Integer, _returns=Mandatory.Unicode) def sliceChromosome(chromAccNo, start, end, orientation) : """ Todo: documentation, error handling, argument checking, tests. @@ -1190,7 +1200,7 @@ class MutalyzerService(ServiceBase): return result #info - @srpc(_returns=Mandatory.String) + @srpc(_returns=Mandatory.Unicode) def ping(): """ Simple function to test the interface. @@ -1201,7 +1211,7 @@ class MutalyzerService(ServiceBase): return 'pong' #ping - @srpc(Mandatory.String, Mandatory.String, _returns=Allele) + @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Allele) def descriptionExtract(reference, observed): """ Extract the HGVS variant description from a reference sequence and an @@ -1253,7 +1263,7 @@ class MutalyzerService(ServiceBase): return map(cache_entry_to_soap, cache) #getCache - @srpc(Mandatory.String, _returns=Array(Mandatory.String)) + @srpc(Mandatory.Unicode, _returns=Array(Mandatory.Unicode)) def getdbSNPDescriptions(rs_id): """ Lookup HGVS descriptions for a dbSNP rs identifier. 
@@ -1281,7 +1291,7 @@ class MutalyzerService(ServiceBase): messages = output.getMessages() if messages: error = 'The request could not be completed\n' + \ - '\n'.join(map(lambda m: str(m), output.getMessages())) + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return descriptions diff --git a/mutalyzer/services/soap.py b/mutalyzer/services/soap.py index a7d7b001868705b65807f27edba653c1050e6fda..d8f28407bd29afbedc37be52020122a96c2c8490 100644 --- a/mutalyzer/services/soap.py +++ b/mutalyzer/services/soap.py @@ -3,6 +3,8 @@ Mutalyzer SOAP/1.1 web service. """ +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.soap import Soap11 diff --git a/mutalyzer/stats.py b/mutalyzer/stats.py index bb1dec573161b469af85f52f8862d5883b45f4a7..e7228cdfb4e8dbb34a1a59ebcc07654f42679a8c 100644 --- a/mutalyzer/stats.py +++ b/mutalyzer/stats.py @@ -17,6 +17,8 @@ module much more. """ +from __future__ import unicode_literals + import time from mutalyzer.redisclient import client @@ -36,7 +38,8 @@ def increment_counter(counter): pipe.incr('counter:%s:total' % counter) for label, bucket, expire in INTERVALS: - key = 'counter:%s:%s:%s' % (counter, label, time.strftime(bucket)) + key = 'counter:%s:%s:%s' % (counter, label, + unicode(time.strftime(bucket))) pipe.incr(key) # It's safe to just keep on expiring the counter, even if it already diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index e5465e1e35e6f5a1cbc1556b8f5f817520947a2a..a1a1b7f90a3e687ef17aa833f3613b7895054d9c 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -3,6 +3,8 @@ Synchronizing the reference file cache with other Mutalyzer instances. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() from datetime import datetime, timedelta @@ -86,7 +88,7 @@ class CacheSync(object): or later. :arg remote_wsdl: The url of the remote SOAP WSDL description. 
- :type remote_wsdl: str + :type remote_wsdl: unicode :arg created_since: Only entries with this creation date or later are returned. :type created_since: datatime.datetime @@ -111,11 +113,11 @@ class CacheSync(object): 1: 'forward', 2: 'reverse'} - entry_dict = {'name': str(entry.name), - 'hash': str(entry.hash), + entry_dict = {'name': entry.name, + 'hash': entry.hash, 'created': entry.created} for attribute in ('gi', 'chromosomeName', 'url', 'cached'): - entry_dict[attribute] = str(entry[attribute]) \ + entry_dict[attribute] = entry[attribute] \ if attribute in entry else None for attribute in ('chromosomeStart', 'chromosomeStop'): entry_dict[attribute] = int(entry[attribute]) \ @@ -131,9 +133,9 @@ class CacheSync(object): Download a remote file located at `url` and store it as `name`. :arg name: Name to store the file under. - :type name: str + :type name: unicode :arg url: Url to the remote file. - :type url: str + :type url: unicode """ if not re.match('^[\da-zA-Z\._-]+$', name): return @@ -160,10 +162,10 @@ class CacheSync(object): (14, 3) :arg remote_wsdl: The url of the remote SOAP WSDL description. - :type remote_wsdl: str + :type remote_wsdl: unicode :arg url_template: Formatting string containing a ``{file}`` occurence, see example usage above. - :string url_template: str + :string url_template: unicode :arg days: Only remote entries added this number of days ago or later are considered. :type days: int diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 936f0812b6abb077cb17dcb252a146cb3a5285f5..6b7987b31c8f9a7bed62507572f0c417589d6c4a 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -19,20 +19,88 @@ General utility functions. """ +from __future__ import unicode_literals + from functools import wraps import inspect from itertools import izip_longest import math import operator -import os import sys import time -from Bio.Alphabet import IUPAC -import Bio.Seq from Bio.SeqUtils import seq3 +# Taken from BioPython. 
+AMBIGUOUS_DNA_COMPLEMENT = { + 'A': 'T', + 'C': 'G', + 'G': 'C', + 'T': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} +AMBIGUOUS_RNA_COMPLEMENT = { + 'A': 'U', + 'C': 'G', + 'G': 'C', + 'U': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} + + +def _make_translation_table(complement_mapping): + before = complement_mapping.keys() + before += [b.lower() for b in before] + after = complement_mapping.values() + after += [b.lower() for b in after] + return {ord(k): v for k, v in zip(before, after)} + + +_dna_complement_table = _make_translation_table(AMBIGUOUS_DNA_COMPLEMENT) +_rna_complement_table = _make_translation_table(AMBIGUOUS_RNA_COMPLEMENT) + + +def reverse_complement(sequence): + """ + Reverse complement of a sequence represented as unicode string. + """ + if 'U' in sequence or 'u' in sequence: + table = _rna_complement_table + else: + table = _dna_complement_table + + return ''.join(reversed(sequence.translate(table))) + + +def is_utf8_alias(encoding): + """ + Returns `True` if the given encoding is recognized as UTF-8. + """ + aliases = ('utf_8', 'u8', 'utf', 'utf8') + return encoding.lower().replace('-', '_') in aliases + + def grouper(iterable, n=2, fillvalue=None): """ Make an iterator that takes {n} elements at a time from {iterable}, using @@ -115,17 +183,17 @@ def splice(s, splice_sites): 'bcdghijklmnoptuvw' @arg s: A DNA sequence. - @type s: string + @type s: any sequence type @arg splice_sites: A list of even length of integers. @type splice_sites: list @return: The concatenation of slices from the sequence that is present in the GenBank record. - @rtype: string + @rtype: type(s) @todo: Assert length of splice_sites is even. 
""" - transcript = '' + transcript = s[:0] for acceptor, donor in grouper(splice_sites): transcript += s[acceptor - 1:donor] @@ -146,7 +214,7 @@ def __nsplice(string, splice_sites, CDS, orientation) : @todo: documentation """ - transcript = "" + transcript = string[:0] if orientation == 1 : for i in range(0, len(splice_sites), 2) : if CDS[0] >= splice_sites[i] and CDS[0] <= splice_sites[i + 1] : @@ -212,14 +280,15 @@ def format_range(first, last): @type last: integer @return: {first}_{last} in case of a real range, {first} otherwise. - @rtype: string + @rtype: unicode """ if first == last: - return str(first) + return unicode(first) return '%i_%i' % (first, last) #format_range + def roll_(s, start, end) : """ Different (and easier) way of finding the variability of a substring. @@ -239,6 +308,7 @@ def roll_(s, start, end) : return j, i #roll + def roll(s, first, last): """ Determine the variability of a variant by looking at cyclic @@ -254,7 +324,7 @@ def roll(s, first, last): (1, 3) @arg s: A reference sequence. - @type s: string + @type s: any sequence type @arg first: First position of the pattern in the reference sequence. @type first: int @arg last: Last position of the pattern in the reference sequence. @@ -302,13 +372,13 @@ def palinsnoop(s): 0 @arg s: A nucleotide sequence. - @type s: string + @type s: unicode @return: The number of elements that are palindromic or -1 if the string is a 'palindrome'. - @rtype: string + @rtype: int """ - s_revcomp = Bio.Seq.reverse_complement(s) + s_revcomp = reverse_complement(s) for i in range(int(math.ceil(len(s) / 2.0))): if s[i] != s_revcomp[i]: @@ -330,12 +400,12 @@ def longest_common_prefix(s1, s2): 'abcdefg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common prefix of s1 and s2. 
- @rtype: string + @rtype: unicode @todo: This is mostly used just for the length of the returned string, and we could also return that directly. @@ -359,9 +429,9 @@ def longest_common_suffix(s1, s2): 'efg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common suffix of s1 and s2. @rtype: string @@ -380,15 +450,15 @@ def trim_common(s1, s2): ('xyzef', 'abc', 3, 1) @arg s1: A string. - @type s1: string + @type s1: unicode @arg s2: Another string. - @type s2: string + @type s2: unicode @return: A tuple of: - - string: Trimmed version of s1. - - string: Trimmed version of s2. - - int: Length of longest common prefix. - - int: Length of longest common suffix. + - unicode: Trimmed version of s1. + - unicode: Trimmed version of s2. + - int: Length of longest common prefix. + - int: Length of longest common suffix. @todo: More intelligently handle longest_common_prefix(). """ @@ -407,14 +477,14 @@ def is_dna(s): >>> is_dna('TACUGT') False - @arg s: Any string or Bio.Seq.Seq instance. - @type s: string + @arg s: Any string. + @type s: unicode @return: True if the string is a DNA string, False otherwise. @rtype: boolean """ - for i in str(s): - if not i in IUPAC.unambiguous_dna.letters: + for i in s: + if i not in 'ATCG': return False return True @@ -435,16 +505,16 @@ def in_frame_description(s1, s2) : ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. 
+ - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). @todo: Refactor this code (too many return statements). @@ -528,16 +598,16 @@ def out_of_frame_description(s1, s2): ('p.(Pro4Glnfs*5)', 3, 7, 7) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the first protein. - - int ; Last position of the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the first protein. + - int ; Last position of the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). """ @@ -573,23 +643,23 @@ def protein_description(cds_stop, s1, s2) : @arg cds_stop: Position of the stop codon in c. notation (CDS length). @type cds_stop: int @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. 
+ @rtype: tuple(unicode, int, int, int) """ if cds_stop % 3: - description = out_of_frame_description(str(s1), str(s2)) + description = out_of_frame_description(s1, s2) else: - description = in_frame_description(str(s1), str(s2)) + description = in_frame_description(s1, s2) - if not s2 or str(s1[0]) != str(s2[0]): + if not s2 or s1[0] != s2[0]: # Mutation in start codon. return 'p.?', description[1], description[2], description[3] @@ -603,7 +673,7 @@ def visualise_sequence(sequence, max_length=25, flank_size=6): string is clipped; otherwise the string is just returned. @arg sequence: DNA sequence. - @type sequence: str + @type sequence: unicode @arg max_length: Maximum length of visualised sequence. @type max_length: int @arg flank_size: Length of the flanks in clipped visualised sequence. @@ -629,19 +699,19 @@ def _insert_tag(s, pos1, pos2, tag1, tag2): anything either. @arg s: A sequence. - @type s: + @type s: unicode @arg pos1: Position of tag1. @type pos1: int @arg pos2: Position of tag2. @type pos2: int @arg tag1: Content of tag1. - @type tag1: string + @type tag1: unicode @arg tag2: Content of tag2. - @type tag2: string + @type tag2: unicode @return: The original sequence, or a sequence with eiter tag1, tag2 or both tags inserted. - @rtype: string + @rtype: unicode @todo: Cleanup (note: only used in print_protein_html). """ @@ -670,7 +740,7 @@ def print_protein_html(s, first, last, O, where, text=False): and is suitable for viewing in a monospaced font. @arg s: A protein sequence. - @type s: string + @type s: unicode @arg first: First position to highlight. @type first: int @arg last: Last position to highlight. @@ -678,7 +748,7 @@ def print_protein_html(s, first, last, O, where, text=False): @arg O: The Output object. @type O: Modules.Output.Output @arg where: Location in the {O} object to store the representation. - @type where: string + @type where: unicode @todo: Cleanup. 
""" @@ -701,7 +771,7 @@ def print_protein_html(s, first, last, O, where, text=False): o = 1 # Add the first position. - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) for i in range(0, len(s), block): # Add the blocks. @@ -714,13 +784,13 @@ def print_protein_html(s, first, last, O, where, text=False): # Add the position (while escaping any potential highlighting). if text: if first < o < last: - output = '%s%s%s ' % (tag2, str(o).rjust(m), tag1) + output = '%s%s%s ' % (tag2, unicode(o).rjust(m), tag1) else: - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) else: output = \ '<tt style="color:000000;font-weight:normal">%s</tt> ' % \ - str(o).rjust(m) + unicode(o).rjust(m) # Add last line. O.addOutput(where, output) @@ -748,10 +818,10 @@ def nice_filename(filename): Strip the path and the extention from a filename. @arg filename: A complete path plus extention. - @type filename: string + @type filename: unicode @return: The bare filename without a path and extention. - @rtype: string + @rtype: unicode """ return filename.split('/')[-1].split('.')[0] #nice_filename @@ -788,16 +858,16 @@ def format_usage(usage=None, keywords={}): @kwarg usage: The string to format. If omitted, the calling module's docstring is used. - @type usage: string + @type usage: unicode @kwarg keywords: A dictionary of (keyword, value) pairs used to format the usage string. If it does not contain the key 'command', it is added with the value of sys.argv[0]. - @type keywords: dictionary(string, string) + @type keywords: dictionary(unicode, unicode) @return: Formatted usage string. This is {usage} with any entries from {keywords} replaced and cut-off at the first occurence of two consecutive empty lines. 
- @rtype: string + @rtype: unicode """ if not usage: caller = inspect.stack()[1] diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 65dd70564a727e88eb38ddac39baf38e5befb286..3f0ee4220d8d38451ab7cf4067f287c31ae29383 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -9,17 +9,22 @@ Notes about naming positions: * translation -> begin/end * any range of bases -> first/last * interbase position (if two numbers are used) -> before/after + +Notes about string representations: +* All variant descriptions and their parts are unicode strings +* All reference sequences (and their mutated version) are Bio.Seq.Seq objects """ -from operator import itemgetter, attrgetter +from __future__ import unicode_literals + +from operator import attrgetter -import Bio -import Bio.Seq -from Bio.Seq import Seq +from Bio.Data import CodonTable from Bio.Alphabet import IUPAC from Bio.Alphabet import DNAAlphabet from Bio.Alphabet import ProteinAlphabet +from Bio.Alphabet import _verify_alphabet from mutalyzer import util from mutalyzer.db.models import Assembly @@ -126,14 +131,14 @@ def _check_argument(argument, reference, first, last, output): Do several checks for the optional argument of a variant. Raise a _RawVariantError exception if the checks fail. + @arg argument: The optional argument. + @type argument: unicode @arg reference: The reference sequence. - @type reference: string + @type reference: Bio.Seq.Seq @arg first: Start position of the variant. @type first: int @arg last: End position of the variant. @type last: int - @arg argument: The optional argument. - @type argument: string @arg output: The Output object. @type output: mutalyzer.Output.Output @@ -164,8 +169,8 @@ def _check_argument(argument, reference, first, last, output): 'Invalid letters in argument.') raise _NotDNAError() # And the DNA must match the reference sequence. 
- reference_slice = str(reference[first - 1:last]) - if reference_slice != str(argument): + reference_slice = unicode(reference[first - 1:last]) + if reference_slice != argument: # Todo: Be more informative. output.addMessage(__file__, 3, 'EREF', '%s not found at position %s, found %s ' \ @@ -286,9 +291,9 @@ def apply_substitution(position, original, substitute, mutator, record, O): @arg position: Genomic location of the substitution. @type position: int @arg original: Nucleotide in the reference sequence. - @type original: string + @type original: unicode @arg substitute: Nucleotide in the mutated sequence. - @type substitute: string + @type substitute: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -310,7 +315,7 @@ def apply_substitution(position, original, substitute, mutator, record, O): mutator.substitution(position, substitute) - record.name(position, position, 'subst', mutator.orig[position - 1], + record.name(position, position, 'subst', unicode(mutator.orig[position - 1]), substitute, None) #apply_substitution @@ -326,7 +331,7 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, @arg last: Genomic end position of the del/dup. @type last: int @arg type: The variant type (del or dup). - @type type: string + @type type: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -376,9 +381,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the forward strand ' \ 'it should be "%s" at position %s.' 
% ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) if forward_roll != original_forward_roll and not reverse_strand: @@ -388,9 +393,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, O.addMessage(__file__, 1, 'IROLLBACK', 'Sequence "%s" at position %s was not corrected to "%s" at ' \ 'position %s, since they reside in different exons.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[incorrect_first - 1:incorrect_stop])), + util.visualise_sequence(unicode(mutator.orig[incorrect_first - 1:incorrect_stop])), util.format_range(incorrect_first, incorrect_stop))) if reverse_roll and reverse_strand: @@ -400,9 +405,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the reverse strand ' \ 'it should be "%s" at position %s.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) # We don't go through the trouble of visualising the *corrected* variant @@ -434,7 +439,7 @@ def apply_inversion(first, last, mutator, record, O): @arg O: The Output object. 
 @type O: Modules.Output.Output """ - snoop = util.palinsnoop(mutator.orig[first - 1:last]) + snoop = util.palinsnoop(unicode(mutator.orig[first - 1:last])) if snoop: # We have a reverse-complement-palindromic prefix. @@ -444,7 +449,7 @@ def apply_inversion(first, last, mutator, record, O): O.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is a palindrome ' \ '(its own reverse complement).' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last)) return else: @@ -453,10 +458,10 @@ 'palindrome (the first %i nucleotide(s) are the reverse ' \ 'complement of the last one(s)), the HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, snoop, util.visualise_sequence( - str(mutator.orig[first + snoop - 1: last - snoop])), + unicode(mutator.orig[first + snoop - 1: last - snoop])), first + snoop, last - snoop)) first += snoop last -= snoop @@ -466,8 +471,8 @@ if first == last: O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \ '%i is actually a substitution.' % first) - record.name(first, first, 'subst', mutator.orig[first - 1], - Bio.Seq.reverse_complement(mutator.orig[first - 1]), None) + record.name(first, first, 'subst', unicode(mutator.orig[first - 1]), + util.reverse_complement(unicode(mutator.orig[first - 1])), None) else : record.name(first, last, 'inv', '', '', None) #apply_inversion @@ -483,7 +488,7 @@ def apply_insertion(before, after, s, mutator, record, O): @arg before: Genomic position before the insertion. @type before: int @arg after: Genomic position after the insertion. @type after: int @arg s: Nucleotides to be inserted. - @type s: string + @type s: unicode @arg mutator: A Mutator instance. 
@type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -547,7 +552,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'however, the HGVS notation prescribes that it should be a ' \ 'duplication of %s at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), before + forward_roll, before + forward_roll + insertion_length - 1)) after += forward_roll - 1 @@ -566,7 +571,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the forward strand it should be an insertion of %s ' \ 'at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), new_before + forward_roll, new_before + forward_roll + 1)) if forward_roll != original_forward_roll and not reverse_strand: @@ -576,7 +581,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'insertion of %s at position %i_%i, since they reside in ' \ 'different exons.' % ( s, before, before + 1, - mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll], + unicode(mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll]), new_before + original_forward_roll, new_before + original_forward_roll + 1)) if reverse_roll and reverse_strand: @@ -585,13 +590,13 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the reverse strand it should be an insertion of %s ' \ 'at position %i_%i.' 
% ( s, before, before + 1, - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll], + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]), new_before - reverse_roll, (new_before - reverse_roll) + 1)) record.name(before, before + 1, 'ins', - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), '', (reverse_roll, forward_roll), - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]) + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll])) #apply_insertion @@ -605,7 +610,7 @@ def apply_delins(first, last, insert, mutator, record, output): @arg last: Genomic end position of the delins. @type last: int @arg insert: Sequence to insert. - @type insert: string + @type insert: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -613,14 +618,13 @@ def apply_delins(first, last, insert, mutator, record, output): @arg output: The Output object. @type output: Modules.Output.Output """ - delete = mutator.orig[first - 1:last] + delete = unicode(mutator.orig[first - 1:last]) - if str(delete) == str(insert): + if delete == insert: output.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is identical to ' \ 'the variant.' 
% ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), - first, last)) + util.visualise_sequence(delete), first, last)) return delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert) @@ -646,7 +650,7 @@ def apply_delins(first, last, insert, mutator, record, output): mutator, record, output) return - if str(Bio.Seq.reverse_complement(delete_trimmed)) == insert_trimmed: + if util.reverse_complement(delete_trimmed) == insert_trimmed: output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ 'is actually an inversion.') apply_inversion(first + lcp, last - lcs, mutator, @@ -658,7 +662,7 @@ def apply_delins(first, last, insert, mutator, record, output): 'Sequence "%s" at position %i_%i has the same prefix or ' \ 'suffix as the inserted sequence "%s". The HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, insert, insert_trimmed, first + lcp, last - lcs)) mutator.delins(first + lcp, last - lcs, insert_trimmed) @@ -952,17 +956,19 @@ def process_raw_variant(mutator, variant, record, transcript, output): """ variant, original_description = variant.RawVar, variant[-1] - # {argument} may be a number, or a subsequence of the reference. - # {sequence} is the variant subsequence. - argument = variant.Arg1 - sequence = variant.Arg2 + # `argument` may be a number, or a subsequence of the reference. + # `sequence` is the variant subsequence. + # Note that pyparsing will return `str('')` if the attribute does not + # exist, so we explicitly convert the result to unicode. + argument = unicode(variant.Arg1) + sequence = unicode(variant.Arg2) # If we are on the reverse strand, subsequences must be in reverse # complement. 
if transcript and transcript.CM.orientation == -1: - sequence = Bio.Seq.reverse_complement(sequence) + sequence = util.reverse_complement(sequence) if util.is_dna(argument): - argument = Bio.Seq.reverse_complement(argument) + argument = util.reverse_complement(argument) # Get genomic first and last positions for this variant. Below we handle # the different ways of describing these positions. @@ -1189,7 +1195,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): def parse_sequence(seq): if seq.Sequence: if transcript and transcript.CM.orientation == -1: - return Bio.Seq.reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if seq.StartLoc and seq.EndLoc: @@ -1228,9 +1234,9 @@ def process_raw_variant(mutator, variant, record, transcript, output): 'Position %s is out of range.' % range_last) raise _RawVariantError() - insertion = mutator.orig[range_first - 1:range_last] + insertion = unicode(mutator.orig[range_first - 1:range_last]) if seq.Inv: - insertion = Bio.Seq.reverse_complement(str(insertion)) + insertion = util.reverse_complement(insertion) return insertion @@ -1245,7 +1251,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): seqs = reversed(variant.SeqList) else: seqs = variant.SeqList - insertion = ''.join(str(parse_sequence(seq)) + insertion = ''.join(parse_sequence(seq) for seq in seqs) else: insertion = parse_sequence(variant.Seq) @@ -1316,32 +1322,33 @@ def _add_transcript_info(mutator, transcript, output): if transcript.transcribe: output.addOutput('myTranscriptDescription', transcript.description or '=') output.addOutput('origMRNA', - str(util.splice(mutator.orig, transcript.mRNA.positionList))) + unicode(util.splice(mutator.orig, transcript.mRNA.positionList))) output.addOutput('mutatedMRNA', - str(util.splice(mutator.mutated, + unicode(util.splice(mutator.mutated, mutator.shift_sites(transcript.mRNA.positionList)))) # Add protein prediction to output. 
if transcript.translate: - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna - #output.addOutput('origCDS', cds_original) - - if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) - - if not util.is_dna(cds_original): + if not _verify_alphabet(cds_original): output.addMessage(__file__, 4, 'ENODNA', 'Invalid letters in reference sequence.') return + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna + + #output.addOutput('origCDS', cds_original) + + if transcript.CM.orientation == -1: + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() + if '*' in cds_original.translate(table=transcript.txTable)[:-1]: output.addMessage(__file__, 3, 'ESTOP', 'In frame stop codon found.') @@ -1354,36 +1361,35 @@ def _add_transcript_info(mutator, transcript, output): # Note: addOutput('origCDS', ...) was first before the possible # reverse complement operation above. 
- output.addOutput('origCDS', cds_original) - output.addOutput("newCDS", cds_variant[:(len(str(protein_variant)) + 1) * 3]) + output.addOutput('origCDS', unicode(cds_original)) + output.addOutput("newCDS", unicode(cds_variant[:(len(protein_variant) + 1) * 3])) - output.addOutput('oldprotein', protein_original + '*') + output.addOutput('oldprotein', unicode(protein_original) + '*') # Todo: Don't generate the fancy HTML protein views here, do this in # website.py. # I think it would also be nice to include the mutated list of splice # sites. - if not protein_variant or protein_variant[0] != 'M': + if not protein_variant or unicode(protein_variant[0]) != 'M': # Todo: Protein differences are not color-coded, # use something like below in protein_description(). - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancy') - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancyText', text=True) - if str(cds_variant[0:3]) in \ - Bio.Data.CodonTable.unambiguous_dna_by_id \ - [transcript.txTable].start_codons: + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancy') + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancyText', text=True) + if unicode(cds_variant[0:3]) in \ + CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: output.addOutput('newprotein', '?') util.print_protein_html('?', 0, 0, output, 'newProteinFancy') util.print_protein_html('?', 0, 0, output, 'newProteinFancyText', text=True) - output.addOutput('altStart', str(cds_variant[0:3])) - if str(protein_original[1:]) != str(protein_variant[1:]): + output.addOutput('altStart', unicode(cds_variant[0:3])) + if unicode(protein_original[1:]) != unicode(protein_variant[1:]): output.addOutput('altProtein', - 'M' + protein_variant[1:] + '*') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + 'M' + unicode(protein_variant[1:]) + '*') + util.print_protein_html('M' + 
unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancy') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancyText', text=True) else : output.addOutput('newprotein', '?') @@ -1395,21 +1401,22 @@ def _add_transcript_info(mutator, transcript, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) descr, first, last_original, last_variant = \ - util.protein_description(cds_length, protein_original, - protein_variant) + util.protein_description(cds_length, + unicode(protein_original), + unicode(protein_variant)) # This is never used. output.addOutput('myProteinDescription', descr) - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancy') - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancyText', text=True) - if str(protein_original) != str(protein_variant): - output.addOutput('newprotein', protein_variant + '*') - util.print_protein_html(protein_variant + '*', first, + if unicode(protein_original) != unicode(protein_variant): + output.addOutput('newprotein', unicode(protein_variant) + '*') + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancy') - util.print_protein_html(protein_variant + '*', first, + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancyText', text=True) #_add_transcript_info @@ -1473,6 +1480,7 @@ def process_variant(mutator, description, record, output): if description.LrgAcc: # LRG case, pick the top gene. 
gene = record.record.geneList[0] + if transcript_id: transcript = gene.findLocus(transcript_id) if not transcript: @@ -1481,7 +1489,7 @@ def process_variant(mutator, description, record, output): # NG_012772.1). output.addMessage(__file__, 4, "ENOTRANSCRIPT", "Multiple transcripts found for gene %s. Please " \ - "choose from: %s" %(gene.name, + "choose from: %s" % (gene.name, ", ".join(gene.listLoci()))) else: # No transcript id given. @@ -1563,10 +1571,10 @@ def process_variant(mutator, description, record, output): 'Protein level descriptions can only be done on a protein or transcript reference.') raise _VariantError() else: - cds = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) + cds = util.splice(mutator.orig, transcript.CDS.positionList) + cds.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds = Bio.Seq.reverse_complement(cds) + cds = cds.reverse_complement() protein = cds.translate(table=transcript.txTable, cds=True, to_stop=True) mutator.orig = protein mutator.mutated = protein @@ -1644,12 +1652,12 @@ def check_variant(description, output): if parsed_description.LrgAcc: record_id = parsed_description.LrgAcc - elif parsed_description.Version: - record_id = parsed_description.RefSeqAcc + '.' + parsed_description.Version + elif parsed_description.RefSeqAcc: + if parsed_description.Version: + record_id = parsed_description.RefSeqAcc + '.' 
+ parsed_description.Version + else: + record_id = parsed_description.RefSeqAcc else: - record_id = parsed_description.RefSeqAcc - - if not record_id: output.addMessage(__file__, 4, 'ENOREF', 'No reference sequence given.') return @@ -1657,7 +1665,7 @@ def check_variant(description, output): if parsed_description.LrgAcc: filetype = 'LRG' - transcript_id = parsed_description.LRGTranscriptID + transcript_id = parsed_description.LRGTranscriptID or '' retriever = Retriever.LRGRetriever(output) else: filetype = 'GB' @@ -1732,8 +1740,8 @@ def check_variant(description, output): except _VariantError: return - output.addOutput('original', str(mutator.orig)) - output.addOutput('mutated', str(mutator.mutated)) + output.addOutput('original', unicode(mutator.orig)) + output.addOutput('mutated', unicode(mutator.mutated)) # Chromosomal region (only for GenBank human transcript references). # This is still quite ugly code, and should be cleaned up once we have @@ -1775,17 +1783,18 @@ def check_variant(description, output): transcript.proteinDescription = 'p.?' 
continue - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna + + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() #if '*' in cds_original.translate()[:-1]: # output.addMessage(__file__, 3, "ESTOP", @@ -1801,7 +1810,7 @@ def check_variant(description, output): # FIXME this is a bit of a rancid fix. protein_original = cds_original.translate( table=transcript.txTable, cds=True, to_stop=True) - except Bio.Data.CodonTable.TranslationError: + except CodonTable.TranslationError: if transcript.current: output.addMessage( __file__, 2, "WTRANS", @@ -1822,7 +1831,7 @@ def check_variant(description, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) transcript.proteinDescription = util.protein_description( - cds_length, protein_original, protein_variant)[0] + cds_length, unicode(protein_original), unicode(protein_variant))[0] except IndexError: # Todo: Probably CDS start was hit by removal of exon.. transcript.proteinDescription = 'p.?' 
diff --git a/mutalyzer/website/__init__.py b/mutalyzer/website/__init__.py index 730c33e86f6ee5be9edd5afcb13166d4c58d907d..2ce0450bf8765e9197c37545aeef9b3281315c43 100644 --- a/mutalyzer/website/__init__.py +++ b/mutalyzer/website/__init__.py @@ -3,6 +3,8 @@ Mutalyzer website interface using the Flask framework. """ +from __future__ import unicode_literals + import logging import os import pkg_resources diff --git a/mutalyzer/website/templates/base.html b/mutalyzer/website/templates/base.html index 2f45caf9f0a7a4be3f98721736c785861b620dad..270e3bdfd75d1bd69e692de91f876300b26f4066 100644 --- a/mutalyzer/website/templates/base.html +++ b/mutalyzer/website/templates/base.html @@ -22,7 +22,7 @@ src="{{ url_for('static', filename='js/generator.js') }}"> </script> <meta http-equiv="Content-Type" - content="text/html; charset=iso-8859-1"> + content="text/html; charset=utf-8"> <title>Mutalyzer {{ mutalyzer_version }} — {{ page_title }}</title> </head> <body diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 475330b7ce084273bc61627e5a24221d2b0ccff0..84b5cf857b424bb6e42b6c2f41d97877459df588 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -3,16 +3,17 @@ Mutalyzer website views. 
""" +from __future__ import unicode_literals + import bz2 import os import pkg_resources import re -from cStringIO import StringIO import urllib from flask import Blueprint -from flask import (abort, current_app, jsonify, make_response, redirect, - render_template, request, send_from_directory, url_for) +from flask import (abort, jsonify, make_response, redirect, render_template, + request, send_from_directory, url_for) import jinja2 from lxml import etree from spyne.server.http import HttpBase @@ -22,9 +23,8 @@ import mutalyzer from mutalyzer import (announce, describe, File, Retriever, Scheduler, stats, util, variantchecker) from mutalyzer.config import settings -from mutalyzer.db import session from mutalyzer.db.models import BATCH_JOB_TYPES -from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem +from mutalyzer.db.models import Assembly, BatchJob from mutalyzer.grammar import Grammar from mutalyzer.mapping import Converter from mutalyzer.output import Output @@ -135,16 +135,16 @@ def soap_api(): """ soap_server = HttpBase(soap.application) soap_server.doc.wsdl11.build_interface_document(settings.SOAP_WSDL_URL) - wsdl_handle = StringIO(soap_server.doc.wsdl11.get_interface_document()) + wsdl_string = soap_server.doc.wsdl11.get_interface_document() - xsl_handle = open(os.path.join( - pkg_resources.resource_filename('mutalyzer', 'website/templates'), - 'wsdl-viewer.xsl'), 'r') - wsdl_doc = etree.parse(wsdl_handle) - xsl_doc = etree.parse(xsl_handle) + xsl_file = os.path.join( + pkg_resources.resource_filename('mutalyzer', 'website/templates'), + 'wsdl-viewer.xsl') + wsdl_doc = etree.fromstring(wsdl_string) + xsl_doc = etree.parse(xsl_file) transform = etree.XSLT(xsl_doc) - return make_response(str(transform(wsdl_doc))) + return make_response(unicode(transform(wsdl_doc))) @website.route('/downloads/<string:filename>') @@ -159,7 +159,7 @@ def downloads(filename): except jinja2.exceptions.TemplateNotFound: abort(404) - response.headers['Content-Type'] = 
'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -233,10 +233,7 @@ def name_checker(): % (description, request.remote_addr)) stats.increment_counter('name-checker/website') - # Todo: The following is probably a problem elsewhere too. We stringify - # the variant, because a unicode string crashes BioPython's - # `reverse_complement`. - variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) errors, warnings, summary = output.Summary() parse_error = output.getOutput('parseError') @@ -272,18 +269,20 @@ def name_checker(): # Experimental description extractor. if (output.getIndexedOutput('original', 0) and output.getIndexedOutput('mutated', 0)): + extracted = extractedProt = '(skipped)' + allele = describe.describe(output.getIndexedOutput('original', 0), output.getIndexedOutput('mutated', 0)) - prot_allele = describe.describe( - output.getIndexedOutput('oldprotein', 0), - output.getIndexedOutput('newprotein', 0, default=''), - DNA=False) - - extracted = extractedProt = '(skipped)' if allele: extracted = describe.alleleDescription(allele) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + + if output.getIndexedOutput('oldprotein', 0): + prot_allele = describe.describe( + output.getIndexedOutput('oldprotein', 0), + output.getIndexedOutput('newprotein', 0, default=''), + DNA=False) + if prot_allele: + extractedProt = describe.alleleDescription(prot_allele) else: extracted = extractedProt = '' @@ -350,11 +349,10 @@ def bed(): if not description: abort(404) - return render_template('name-checker.html') output = Output(__file__) - variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) raw_variants = output.getIndexedOutput('rawVariantsChromosomal', 0) if not raw_variants: @@ -376,14 +374,14 @@ def bed(): for descr, 
positions in raw_variants[2]: bed += '\t'.join([raw_variants[0], - str(min(positions) - 1), - str(max(positions)), + unicode(min(positions) - 1), + unicode(max(positions)), descr, '0', raw_variants[1]]) + '\n' response = make_response(bed) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -579,7 +577,7 @@ def reference_loader_submit(): output = Output(__file__) output.addMessage(__file__, -1, 'INFO', 'Received request upload(%s) with arguments %s from %s' - % (method, str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) assemblies = Assembly.query \ .order_by(Assembly.taxonomy_common_name.asc(), @@ -668,11 +666,11 @@ def reference_loader_submit(): if not ud: errors.append('The request could not be completed') - errors.extend(str(m) for m in output.getMessages()) + errors.extend(unicode(m) for m in output.getMessages()) output.addMessage(__file__, -1, 'INFO', 'Finished request upload(%s) with arguments %s from %s' - % (method, str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) return render_template('reference-loader.html', assemblies=assemblies, @@ -737,7 +735,7 @@ def reference(filename): response = make_response(bz2.BZ2File(file_path, 'r').read()) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -773,7 +771,9 @@ def batch_jobs_submit(): """ job_type = request.form.get('job_type') email = request.form.get('email') - file = request.files.get('file') + + # Note that this is always a seekable binary file object. 
+ batch_file = request.files.get('file') assemblies = Assembly.query \ .order_by(Assembly.taxonomy_common_name.asc(), @@ -809,7 +809,7 @@ def batch_jobs_submit(): scheduler = Scheduler.Scheduler() file_instance = File.File(output) - job, columns = file_instance.parseBatchFile(file) + job, columns = file_instance.parseBatchFile(batch_file) if job is None: errors.append('Could not parse input file, please check your ' @@ -894,7 +894,7 @@ def batch_job_result(result_id): return send_from_directory(settings.CACHE_DIR, 'batch-job-%s.txt' % result_id, - mimetype='text/plain', + mimetype='text/plain; charset=utf-8', as_attachment=True) @@ -933,10 +933,7 @@ def lovd_get_gs(): % (mutation_name, variant_record, forward, request.remote_addr)) - # Todo: The following is probably a problem elsewhere too. - # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in mapping.py:607. - variantchecker.check_variant(str(mutation_name), output) + variantchecker.check_variant(mutation_name, output) output.addMessage(__file__, -1, 'INFO', 'Finished request getGS(%s, %s, %s)' @@ -955,11 +952,11 @@ def lovd_get_gs(): standalone=1)) else: response = make_response(l[0]) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response response = make_response('Transcript not found') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -1041,7 +1038,7 @@ def lovd_variant_info(): assembly = Assembly.by_name_or_alias(build) except NoResultFound: response = make_response('invalid build') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response converter = Converter(assembly, output) @@ -1079,7 +1076,7 @@ def lovd_variant_info(): response = re.sub('^Error \(.*\):', 'Error:', result) response = make_response(result) - 
response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response diff --git a/requirements.txt b/requirements.txt index ab361e7d283147c643757e10f9d2ee1e212d0c3e..63d953eace27346d46a9a1a03088b42509c7b87e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ pyparsing==2.0.1 pytz==2013.9 requests==2.2.1 simplejson==3.3.3 --e git+https://github.com/LUMC/spyne.git@spyne-2.11.0-mutalyzer#egg=spyne +spyne==2.11.0 suds==0.4 wsgiref==0.1.2 xlrd==0.9.2 @@ -21,3 +21,5 @@ mock==1.0.1 alembic==0.6.3 Sphinx==1.2.1 sphinx-rtd-theme==0.1.5 +cchardet==0.3.5 +Werkzeug==0.9.6 diff --git a/tests/data/batch_input.ods b/tests/data/batch_input.ods new file mode 100644 index 0000000000000000000000000000000000000000..ea08744237a58f80386e041f23583e6555b459ed Binary files /dev/null and b/tests/data/batch_input.ods differ diff --git a/tests/data/batch_input.sxc b/tests/data/batch_input.sxc new file mode 100644 index 0000000000000000000000000000000000000000..942282e2acc2e68f5ac7e496c0f48db6f2d1870b Binary files /dev/null and b/tests/data/batch_input.sxc differ diff --git a/tests/data/batch_input.xls b/tests/data/batch_input.xls new file mode 100644 index 0000000000000000000000000000000000000000..e795855d7ae0856f3b91da7b2732245274073f75 Binary files /dev/null and b/tests/data/batch_input.xls differ diff --git a/tests/data/batch_input.xlsx b/tests/data/batch_input.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b2a5a87674b7eb49eed8c7ed53e227762e8bf17f Binary files /dev/null and b/tests/data/batch_input.xlsx differ diff --git a/tests/data/image.zip b/tests/data/image.zip new file mode 100644 index 0000000000000000000000000000000000000000..df09158894dfb403f0edb2e5dc24a2749bee6c0d Binary files /dev/null and b/tests/data/image.zip differ diff --git a/tests/fixtures.py b/tests/fixtures.py index 595d72a663e3ec06a6df748f3d21e6aa4a8019ee..71b1ae1bfc7bba9bc17a56f8c1431f56b2eddde7 100644 --- 
a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,6 +7,8 @@ as :func:`hg19` must be called after the :func:`database` fixture). """ +from __future__ import unicode_literals + import os import shutil diff --git a/tests/old/lrgtest.py b/tests/old/lrgtest.py index afeefc3324596c39bf3723f40d119a4f0df90d0c..d2dae2bca774fad39d9cdeb7bc8e888db7083075 100644 --- a/tests/old/lrgtest.py +++ b/tests/old/lrgtest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/maptest.py b/tests/old/maptest.py index 7f3105a46eac5c4af99025728b736fa08c49b4e1..40dc1d15dfee1df5b4cfcc46c2d6948dd6796423 100644 --- a/tests/old/maptest.py +++ b/tests/old/maptest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/recordtest.py b/tests/old/recordtest.py index d55bd58c0df440a4774ceaf59a44444f388740dd..a9cc9354557e3ba32f299bd23718cddab3ff7b46 100644 --- a/tests/old/recordtest.py +++ b/tests/old/recordtest.py @@ -2,6 +2,7 @@ recordtest.py contains TestRecord - a BaseClass for testing GenRecord.Record instances """ +from __future__ import unicode_literals import unittest, types from Modules import GenRecord #test class-types @@ -56,7 +57,7 @@ class TestRecord(unittest.TestCase): self.assertTrue(isinstance(plist, (types.NoneType, GenRecord.PList))) - #self.assertTrue(any(map(isinstance, + #self.assertTrue(any(map(isinstance, def _test_if_loc(self, loc): @@ -76,7 +77,5 @@ class TestRecord(unittest.TestCase): if __name__ == "__main__": - # This file should be imported + # This file should be imported pass - - diff --git a/tests/test_crossmap.py b/tests/test_crossmap.py index ff9d6d75928918b19d01b769b5a099d864408b11..990f93fe877dfcc6d8945187a4d559238f7f9a45 100644 --- a/tests/test_crossmap.py +++ b/tests/test_crossmap.py @@ -3,6 +3,8 @@ Tests for the Crossmap module. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.Crossmap import Crossmap diff --git a/tests/test_describe.py b/tests/test_describe.py index 8315213eb49cc5c688d1d4816841dcc5c7dcb02b..e81c7ce45bf6dbb5776326d75e3f7f410179db6d 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.describe module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 1ebaa399e372155291f33ce1c6de21b22682c5ad..dad9a9c64c959cd91433b0324f0c7eb346f3e58c 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.grammar module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 5ebdc60e667cc3ec46cd46cda7c71e29561061ee..620f9d757f388579381edbf0eb3c64d032db51a3 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -3,6 +3,8 @@ Tests for the mapping module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from sqlalchemy import or_ diff --git a/tests/test_mutator.py b/tests/test_mutator.py index 36c5b8d152ebfa553e859b9ef11dae3e3a40bd43..05e2c685fb33f29978839b17236c933c4b232016 100644 --- a/tests/test_mutator.py +++ b/tests/test_mutator.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.mutator module. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() import re import os @@ -666,7 +668,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('ACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACGATCG')) def test_largedel(self): """ @@ -674,7 +676,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 7) - assert str(m.mutated) == str(Seq('AG')) + assert unicode(m.mutated) == unicode(Seq('AG')) def test_ins(self): """ @@ -682,7 +684,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATACGATCG')) def test_largeins(self): """ @@ -690,7 +692,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') - assert str(m.mutated) == str(Seq('ATATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGCGATCG')) def test_sub(self): """ @@ -698,7 +700,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGATCG')) def test_adjecent_del_sub_1(self): """ @@ -709,7 +711,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_sub_2(self): """ @@ -718,7 +720,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_near_adjecent_del_sub_1(self): """ @@ -727,7 +729,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) 
m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ACTATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTATCG')) def test_near_adjecent_del_sub_2(self): """ @@ -736,7 +738,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 4) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGCATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCATCG')) def test_adjecent_largedel_sub_1(self): """ @@ -746,7 +748,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 6) m.substitution(7, 'T') - assert str(m.mutated) == str(Seq('ATG')) + assert unicode(m.mutated) == unicode(Seq('ATG')) def test_adjecent_largedel_sub_2(self): """ @@ -756,7 +758,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACG')) + assert unicode(m.mutated) == unicode(Seq('ACG')) def test_near_adjecent_largedel_sub_1(self): """ @@ -765,7 +767,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 5) m.substitution(7, 'T') - assert str(m.mutated) == str(Seq('ATTG')) + assert unicode(m.mutated) == unicode(Seq('ATTG')) def test_near_adjecent_largedel_sub_2(self): """ @@ -774,7 +776,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACCG')) + assert unicode(m.mutated) == unicode(Seq('ACCG')) def test_adjectent_del_ins_1(self): """ @@ -783,7 +785,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCGATCG')) def test_adjectent_del_ins_2(self): """ @@ -792,7 +794,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATAGATCG')) + assert unicode(m.mutated) == 
unicode(Seq('ATAGATCG')) def test_near_adjectent_del_ins(self): """ @@ -801,7 +803,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(3, 'T') - assert str(m.mutated) == str(Seq('ACTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTGATCG')) def test_adjecent_ins_sub_1(self): """ @@ -811,7 +813,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATAGGATCG')) def test_adjecent_ins_sub_2(self): """ @@ -821,7 +823,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGACGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGACGATCG')) def test_near_adjecent_ins_sub(self): """ @@ -831,7 +833,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATACTATCG')) + assert unicode(m.mutated) == unicode(Seq('ATACTATCG')) def test_adjecent_largeins_sub_1(self): """ @@ -841,7 +843,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATATCGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGGGATCG')) def test_adjecent_largeins_sub_2(self): """ @@ -851,7 +853,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCGCGATCG')) def test_near_adjecent_largeins_sub(self): """ @@ -861,7 +863,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATATCGCTATCG')) + assert unicode(m.mutated) == 
unicode(Seq('ATATCGCTATCG')) def test_adjecent_del_del_1(self): """ @@ -870,7 +872,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_del_del_2(self): """ @@ -879,7 +881,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_delins_snp_1(self): """ @@ -888,7 +890,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_delins_snp_2(self): """ @@ -897,7 +899,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGAGATCG')) def test_adjecent_largedelins_eq_snp_1(self): """ @@ -907,7 +909,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGG')) def test_adjecent_largedelins_min_snp_1(self): """ @@ -917,7 +919,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGG')) def test_adjecent_largedelins_plus_snp_1(self): """ @@ -927,7 +929,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGG')) def 
test_adjecent_largedelins_eq_snp_2(self): """ @@ -937,7 +939,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAG')) def test_adjecent_largedelins_min_snp_2(self): """ @@ -947,7 +949,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAG')) def test_adjecent_largedelins_plus_snp_2(self): """ @@ -957,7 +959,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAAAG')) def test_adjecent_delins_del_1(self): """ @@ -966,7 +968,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_delins_del_2(self): """ @@ -975,7 +977,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_largedelins_eq_del_1(self): """ @@ -985,7 +987,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_1(self): """ @@ -995,7 +997,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def 
test_adjecent_largedelins_plus_del_1(self): """ @@ -1005,7 +1007,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) def test_adjecent_largedelins_eq_del_2(self): """ @@ -1015,7 +1017,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_2(self): """ @@ -1025,7 +1027,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def test_adjecent_largedelins_plus_del_2(self): """ @@ -1035,7 +1037,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) def test_adjectent_delins_ins_1(self): """ @@ -1044,7 +1046,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) def test_adjectent_delins_ins_2(self): """ @@ -1053,7 +1055,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGAGATCG')) def test_adjectent_largedelins_eq_ins_1(self): """ @@ -1062,7 +1064,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGCG')) def 
test_adjectent_largedelins_min_ins_1(self): """ @@ -1071,7 +1073,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGCG')) def test_adjectent_largedelins_plus_ins_1(self): """ @@ -1080,7 +1082,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGCG')) def test_adjectent_largedelins_eq_ins_2(self): """ @@ -1089,7 +1091,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAG')) def test_adjectent_largedelins_min_ins_2(self): """ @@ -1098,7 +1100,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAG')) def test_adjectent_largedelins_plus_ins_2(self): """ @@ -1107,7 +1109,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAAAG')) def test_adjectent_delins_del_delins(self): """ @@ -1116,7 +1118,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 3, 'A') m.delins(4, 4, 'T') - assert str(m.mutated) == str(Seq('AATATCG')) + assert unicode(m.mutated) == unicode(Seq('AATATCG')) def test_adjectent_largedelins_plus_delins_1(self): """ @@ -1125,7 +1127,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAAAAAATG')) + assert 
unicode(m.mutated) == unicode(Seq('AAAAAAAATG')) def test_adjectent_largedelins_plus_delins_2(self): """ @@ -1134,7 +1136,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.delins(2, 2, 'C') - assert str(m.mutated) == str(Seq('ACAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAAAAAG')) def test_adjectent_largedelins_min_delins_1(self): """ @@ -1143,7 +1145,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAATG')) + assert unicode(m.mutated) == unicode(Seq('AAAATG')) def test_adjectent_largedelins_min_delins_2(self): """ @@ -1152,7 +1154,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.delins(2, 2, 'C') - assert str(m.mutated) == str(Seq('ACAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAG')) def test_adjectent_del_dup_1(self): """ @@ -1161,7 +1163,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ACCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACCGATCG')) def test_adjectent_del_dup_2(self): """ @@ -1170,7 +1172,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTGATCG')) def test_adjectent_ins_dup_1(self): """ @@ -1179,7 +1181,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATGCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCCGATCG')) def test_adjectent_ins_dup_2(self): """ @@ -1188,7 +1190,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGCGATCG')) + assert unicode(m.mutated) 
== unicode(Seq('ATTGCGATCG')) def test_adjectent_ins_ins_1(self): """ @@ -1197,7 +1199,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(3, 'A') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_adjectent_ins_ins_2(self): """ @@ -1206,7 +1208,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_ins_ins(self): """ @@ -1215,7 +1217,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(2, 'A') - assert str(m.mutated) in (str(Seq('ATGACGATCG')), str(Seq('ATAGCGATCG'))) + assert unicode(m.mutated) in (unicode(Seq('ATGACGATCG')), unicode(Seq('ATAGCGATCG'))) def test_adjecent_inv_inv_1(self): """ @@ -1224,7 +1226,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_inv_inv_2(self): """ @@ -1233,7 +1235,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_dup_dup_1(self): """ @@ -1242,7 +1244,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTCCGATCG')) def test_adjecent_dup_dup_2(self): """ @@ -1251,7 +1253,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == 
unicode(Seq('ATTCCGATCG')) def test_adjecent_del_inv_1(self): """ @@ -1260,7 +1262,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_inv_2(self): """ @@ -1269,7 +1271,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_ins_inv_1(self): """ @@ -1278,7 +1280,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(3, 3) - assert str(m.mutated) == str(Seq('ATGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGGATCG')) def test_adjecent_ins_inv_2(self): """ @@ -1287,4 +1289,4 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index 7640c496af9aef1c871fa313b69e2ef836d1aace..f04b883971617ee9885ba3478bd667c674189650 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.parsers.genbank module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.parsers import genbank diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index fc5e4abe498469e83f4986b40fb68967fd165d86..791f867ddad19b9a71ac333726e5f13ade37d782 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,16 +3,18 @@ Tests for the Scheduler module. 
""" +from __future__ import unicode_literals + import bz2 import os -import StringIO +import io #import logging; logging.basicConfig() from Bio import Entrez from mock import patch from mutalyzer.config import settings -from mutalyzer.db.models import BatchJob, BatchQueueItem +from mutalyzer.db.models import BatchJob from mutalyzer import File from mutalyzer import output from mutalyzer import Scheduler @@ -28,12 +30,10 @@ class TestScheduler(MutalyzerTest): """ fixtures = (database, ) - @staticmethod - def _batch_job(variants, expected, job_type, argument=None): + def _batch_job(self, batch_file, expected, job_type, argument=None): file_instance = File.File(output.Output('test')) scheduler = Scheduler.Scheduler() - batch_file = StringIO.StringIO('\n'.join(variants) + '\n') job, columns = file_instance.parseBatchFile(batch_file) result_id = scheduler.addJob('test@test.test', job, columns, job_type, argument=argument) @@ -41,7 +41,7 @@ class TestScheduler(MutalyzerTest): batch_job = BatchJob.query.filter_by(result_id=result_id).one() left = batch_job.batch_queue_items.count() - assert left == len(variants) + assert left == len(expected) scheduler.process() @@ -49,11 +49,16 @@ class TestScheduler(MutalyzerTest): assert left == 0 filename = 'batch-job-%s.txt' % result_id - result = open(os.path.join(settings.CACHE_DIR, filename)) + result = io.open(os.path.join(settings.CACHE_DIR, filename), + encoding='utf-8') next(result) # Header. assert expected == [line.strip().split('\t') for line in result] + def _batch_job_plain_text(self, variants, expected, job_type, argument=None): + batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('utf-8')) + self._batch_job(batch_file, expected, job_type, argument=argument) + def test_syntax_checker(self): """ Simple syntax checker batch job. 
@@ -64,7 +69,7 @@ class TestScheduler(MutalyzerTest): 'OK'], ['AL449423.14(CDKN2A_v002):c.5_400del', 'OK']] - self._batch_job(variants, expected, 'syntax-checker') + self._batch_job_plain_text(variants, expected, 'syntax-checker') @fix(cache('AB026906.1', 'NM_000059.3')) def test_name_checker(self): @@ -110,7 +115,7 @@ class TestScheduler(MutalyzerTest): 'NM_000059.3(BRCA2_i001):p.(Asp224Tyr)', '', 'BspHI,CviAII,FatI,Hpy188III,NlaIII']] - self._batch_job(variants, expected, 'name-checker') + self._batch_job_plain_text(variants, expected, 'name-checker') def test_name_checker_altered(self): """ @@ -187,7 +192,7 @@ class TestScheduler(MutalyzerTest): return bz2.BZ2File(path) with patch.object(Entrez, 'efetch', mock_efetch): - self._batch_job(variants, expected, 'name-checker') + self._batch_job_plain_text(variants, expected, 'name-checker') @fix(cache('NM_000059.3')) def test_name_checker_skipped(self): @@ -228,7 +233,7 @@ class TestScheduler(MutalyzerTest): raise IOError() with patch.object(Entrez, 'efetch', mock_efetch): - self._batch_job(variants, expected, 'name-checker') + self._batch_job_plain_text(variants, expected, 'name-checker') @fix(hg19, hg19_transcript_mappings) def test_position_converter(self): @@ -242,4 +247,89 @@ class TestScheduler(MutalyzerTest): 'NM_003002.2:c.274G>T', 'NM_012459.2:c.-2203C>A', 'NR_028383.1:n.-2173C>A']] - self._batch_job(variants, expected, 'position-converter', 'hg19') + self._batch_job_plain_text(variants, expected, 'position-converter', 'hg19') + + def test_ods_file(self): + """ + OpenDocument Spreadsheet input for batch job. + """ + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'data', + 'batch_input.ods') + batch_file = open(path, 'rb') + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + self._batch_job(batch_file, expected, 'syntax-checker') + + def test_sxc_file(self): + """ + OpenOffice.org 1.x Calc spreadsheet input for batch job. 
+ """ + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'data', + 'batch_input.sxc') + batch_file = open(path, 'rb') + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + self._batch_job(batch_file, expected, 'syntax-checker') + + def test_xls_file(self): + """ + Microsoft Excel 97/2000/XP/2003 input for batch job. + """ + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'data', + 'batch_input.xls') + batch_file = open(path, 'rb') + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + self._batch_job(batch_file, expected, 'syntax-checker') + + def test_xlsx_file(self): + """ + Office Open XML Spreadsheet input for batch job. + """ + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'data', + 'batch_input.xlsx') + batch_file = open(path, 'rb') + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + self._batch_job(batch_file, expected, 'syntax-checker') + + def test_invalid_zip_file(self): + """ + Random zip file input for batch job (invalid). + """ + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'data', + 'image.zip') + batch_file = open(path, 'rb') + + file_instance = File.File(output.Output('test')) + job, columns = file_instance.parseBatchFile(batch_file) + assert job is None + + def test_unicode_input(self): + """ + Simple input with some non-ASCII unicode characters. + """ + variants = ['\u2026AB026906.1:c.274G>T', + '\u2026AL449423.14(CDKN2A_v002):c.5_400del'] + expected = [['\u2026AB026906.1:c.274G>T', + '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'], + ['\u2026AL449423.14(CDKN2A_v002):c.5_400del', + '(grammar): Expected W:(0123...) 
(at char 0), (line:1, col:1)']] + self._batch_job_plain_text(variants, expected, 'syntax-checker') diff --git a/tests/test_services_json.py b/tests/test_services_json.py index ce029ba764fab2c7cadd84ea730671abca41cca4..81833505e36ecee7436bde0f956e579ecd82c00e 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -3,10 +3,13 @@ Tests for the JSON interface to Mutalyzer. """ +from __future__ import unicode_literals + import simplejson as json from spyne.server.null import NullServer import mutalyzer from mutalyzer import announce +from mutalyzer import Scheduler from mutalyzer.services.json import application from fixtures import database, hg19, hg19_transcript_mappings @@ -77,7 +80,7 @@ class TestServicesJson(MutalyzerTest): Running the info method should give us some version information. """ r = self._call('info') - assert type(r['versionParts']) == list + assert isinstance(r['versionParts'], list) assert r['version'] == mutalyzer.__version__ def test_info_announcement(self): @@ -86,14 +89,56 @@ class TestServicesJson(MutalyzerTest): """ announce.set_announcement('Test announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'Test announcement' announce.set_announcement('New announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'New announcement' announce.unset_announcement() r = self._call('info') assert not r.get('announcement') + + def test_checksyntax_unicode(self): + """ + Run checkSyntax with an invalid variant description containing + non-ASCII unicode characters. + """ + r = self._call('checkSyntax', 'La Pe\xf1a') + assert r['valid'] == False + assert len(r['messages']) == 1 + assert r['messages'][0]['errorcode'] == 'EPARSE' + assert r['messages'][0]['message'] == 'Expected W:(0123...) 
(at char 2), (line:1, col:3)' + + @fix(database) + def test_batchjob_unicode(self): + """ + Submit a batch job with non-ASCII unicode characters in the input + file. + """ + variants = ['\u2026AB026906.1:c.274G>T', + '\u2026AL449423.14(CDKN2A_v002):c.5_400del'] + expected = [['\u2026AB026906.1:c.274G>T', + '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'], + ['\u2026AL449423.14(CDKN2A_v002):c.5_400del', + '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)']] + + data = '\n'.join(variants) + '\n' #.encode('base64') + + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) + + result = self._call('monitorBatchJob', job_id) + assert int(result) == len(variants) + + scheduler = Scheduler.Scheduler() + scheduler.process() + + result = self._call('monitorBatchJob', job_id) + assert int(result) == 0 + + result = self._call('getBatchJob', job_id) + result = result.decode('base64').decode('utf-8').strip().split('\n')[1:] + assert expected == [line.split('\t') for line in result] diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index cc1ce8c00320164293fb03ac66f662b6e454941c..0a85844d07c0f5a95bcf9e00b1dbc183a591f6bf 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -3,6 +3,8 @@ Tests for the SOAP interface to Mutalyzer. 
""" +from __future__ import unicode_literals + import bz2 import datetime import logging @@ -539,8 +541,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' #.encode('base64') - result = self._call('submitBatchJob', data, 'NameChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -564,8 +566,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -586,8 +588,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r'.join(variants) + '\r' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -608,8 +610,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r\n'.join(variants) + '\r\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -640,7 +642,7 @@ facilisi.""" data += data try: - self._call('submitBatchJob', data.encode('base64'), 'NameChecker') + self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') assert False except Fault as e: # - senv:Client.RequestTooLong: Raised by 
Spyne, depending on @@ -661,9 +663,51 @@ facilisi.""" data = f.read() result = self._call('uploadGenBankLocalFile', data) - ud = str(result) + ud = unicode(result) r = self._call('runMutalyzer', ud + '(SDHD):g.7872G>T') assert r.errors == 0 assert r.genomicDescription == ud + ':g.7872G>T' assert ud + '(SDHD_v001):c.274G>T' in r.transcriptDescriptions.string + + def test_checksyntax_unicode(self): + """ + Run checkSyntax with an invalid variant description containing + non-ASCII unicode characters. + """ + r = self._call('checkSyntax', 'La Pe\xf1a') + assert r.valid == False + assert len(r.messages.SoapMessage) == 1 + assert r.messages.SoapMessage[0]['errorcode'] == 'EPARSE' + assert r.messages.SoapMessage[0]['message'] == 'Expected W:(0123...) (at char 2), (line:1, col:3)' + + @fix(database) + def test_batchjob_unicode(self): + """ + Submit a batch job with non-ASCII unicode characters in the input + file. + """ + variants = ['\u2026AB026906.1:c.274G>T', + '\u2026AL449423.14(CDKN2A_v002):c.5_400del'] + expected = [['\u2026AB026906.1:c.274G>T', + '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'], + ['\u2026AL449423.14(CDKN2A_v002):c.5_400del', + '(grammar): Expected W:(0123...) 
(at char 0), (line:1, col:1)']] + + data = '\n'.join(variants) + '\n' #.encode('base64') + + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) + + result = self._call('monitorBatchJob', job_id) + assert int(result) == len(variants) + + scheduler = Scheduler.Scheduler() + scheduler.process() + + result = self._call('monitorBatchJob', job_id) + assert int(result) == 0 + + result = self._call('getBatchJob', job_id) + result = result.decode('base64').decode('utf-8').strip().split('\n')[1:] + assert expected == [line.split('\t') for line in result] diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 1b30786b27730bdc91ac5b39785c0f6fa9625d28..8c19421a9f0b8c891908b316d162a007b3d2733b 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -3,6 +3,8 @@ Tests for the variantchecker module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.output import Output diff --git a/tests/test_website.py b/tests/test_website.py index e579433a18f321a2fb2784530b8381111bc9b3e6..fd0f02e7725b2cd1dc53b6231a9ac01d70a4caca 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -5,23 +5,19 @@ Tests for the WSGI interface to Mutalyzer. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() import bz2 -import cgi -import logging from mock import patch import os -import re -from StringIO import StringIO -import time -import urllib -import urllib2 +from io import BytesIO from Bio import Entrez import lxml.html -import mutalyzer from mutalyzer import announce, Scheduler +from mutalyzer.db import models from mutalyzer.website import create_app from fixtures import cache, database, hg19, hg19_transcript_mappings @@ -264,7 +260,7 @@ class TestWebsite(MutalyzerTest): """ data = {'job_type': job_type, 'email': 'test@test.test', - 'file': (StringIO(file), 'test.txt')} + 'file': (BytesIO(file.encode('utf-8')), 'test.txt')} if assembly_name_or_alias is not None: data['assembly_name_or_alias'] = assembly_name_or_alias @@ -510,7 +506,7 @@ class TestWebsite(MutalyzerTest): Download a C# example client for the web service. """ r = self.app.get('/downloads/client-mono.cs') - assert r.headers['Content-Type'] == 'text/plain' + assert 'text/plain' in r.headers['Content-Type'] assert 'public static void Main(String [] args) {' in r.data def test_download_php(self): @@ -634,7 +630,7 @@ class TestWebsite(MutalyzerTest): 'build': 'hg19', 'acc': 'NM_203473.1'}) assert 'text/plain' in r.headers['Content-Type'] - assert r.content_type == 'text/plain' + assert 'text/plain' in r.content_type expected = '\n'.join(['-158', '1709', '1371']) assert r.data == expected @@ -678,7 +674,7 @@ class TestWebsite(MutalyzerTest): """ r = self.app.post('/reference-loader', data={'method': 'upload', - 'file': (StringIO('this is not a genbank file'), 'AB026906.1.gb')}) + 'file': (BytesIO('this is not a genbank file'.encode('utf-8')), 'AB026906.1.gb')}) assert 'Your reference sequence was loaded successfully.' not in r.data assert 'The file could not be parsed.' 
in r.data @@ -737,3 +733,89 @@ class TestWebsite(MutalyzerTest): assert 'text/plain' in r.headers['Content-Type'] assert '\t'.join(['chrX', '154157690', '154157691', '4374A>T', '0', '-']) in r.data assert '\t'.join(['chrX', '154157683', '154157685', '4380_4381del', '0', '-']) in r.data + + def test_checksyntax_unicode(self): + """ + Run check syntax form with an invalid variant description containing + non-ASCII unicode characters. + """ + r = self.app.get('/syntax-checker', + query_string={'description': 'La Pe\xf1a'}) + body = r.get_data(as_text=True) + assert 'Fatal' in body + assert 'Details of the parse error' in body + assert 'Expected W:(0123...) (at char 2), (line:1, col:3)' in body + + @fix(database) + def test_batch_unicode(self): + """ + Submit a batch form with non-ASCII unicode characters in the input + file. + """ + file = '\n'.join(['\u2026AB026906.1:c.274G>T', + '\u2026AL449423.14(CDKN2A_v002):c.5_400del']) + expected = [['\u2026AB026906.1:c.274G>T', + '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'], + ['\u2026AL449423.14(CDKN2A_v002):c.5_400del', + '(grammar): Expected W:(0123...) 
(at char 0), (line:1, col:1)']] + + data = {'job_type': 'syntax-checker', + 'email': 'test@test.test', + 'file': (BytesIO(file.encode('utf-8')), 'test.txt')} + + r = self.app.post('/batch-jobs', + data=data) + progress_url = '/' + r.location.split('/')[-1] + + assert models.BatchJob.query.first().email == 'test@test.test' + + scheduler = Scheduler.Scheduler() + scheduler.process() + + r = self.app.get(progress_url) + + dom = lxml.html.fromstring(r.data) + result_url = dom.cssselect('#ifnot_items_left a')[0].attrib['href'] + + r = self.app.get(result_url) + assert 'text/plain' in r.headers['Content-Type'] + + result = r.get_data(as_text=True).strip().split('\n')[1:] + assert expected == [line.split('\t') for line in result] + + @fix(database) + def test_batch_unicode_email(self): + """ + Submit a batch form with non-ASCII unicode characters in the email + address. + """ + file = '\n'.join(['AB026906.1:c.274G>T', + 'AL449423.14(CDKN2A_v002):c.5_400del']) + expected = [['AB026906.1:c.274G>T', + 'OK'], + ['AL449423.14(CDKN2A_v002):c.5_400del', + 'OK']] + + data = {'job_type': 'syntax-checker', + 'email': 'pe\xf1a@test.test', + 'file': (BytesIO(file.encode('utf-8')), 'test.txt')} + + r = self.app.post('/batch-jobs', + data=data) + progress_url = '/' + r.location.split('/')[-1] + + assert models.BatchJob.query.first().email == 'pe\xf1a@test.test' + + scheduler = Scheduler.Scheduler() + scheduler.process() + + r = self.app.get(progress_url) + + dom = lxml.html.fromstring(r.data) + result_url = dom.cssselect('#ifnot_items_left a')[0].attrib['href'] + + r = self.app.get(result_url) + assert 'text/plain' in r.headers['Content-Type'] + + result = r.get_data(as_text=True).strip().split('\n')[1:] + assert expected == [line.split('\t') for line in result] diff --git a/tests/utils.py b/tests/utils.py index befa5d72859279140211ad412fa2920fce8961d6..f9cfce8bb44a2ce0e7bd09d9951e92d6b8ea1c34 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,6 +3,8 @@ Utilities for unit tests. 
""" +from __future__ import unicode_literals + from functools import wraps import os import shutil