From 2a4dc3c18e1d19a9aa6bb70b04283022707748cb Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Sat, 11 Oct 2014 23:45:18 +0200 Subject: [PATCH] Use unicode strings Don't fix what ain't broken. Unfortunately, string handling in Mutalyzer really is broken. So we fix it. Internally, all strings should be represented by unicode strings as much as possible. The main exception is large reference sequence strings. These can often better be BioPython sequence objects, since that is how we usually get them in the first place. These changes will hopefully make Mutalyzer more reliable in working with incoming data. As a bonus, they're a first (small) step towards Python 3 compatibility [1]. Our strategy is as follows: 1. We use `from __future__ import unicode_literals` at the top of every file. 2. All incoming strings are decoded to unicode (if necessary) as soon as possible. 3. Outgoing strings are encoded to UTF8 (if necessary) as late as possible. 4. BioPython sequence objects can be based on byte strings as well as unicode strings. 5. In the database, everything is UTF8. 6. We worry about uploaded and downloaded reference files and batch jobs in a later commit. Point 1 will ensure that all string literals in our source code will be unicode strings [2]. As for point 4, sometimes this may even change under our eyes (e.g., calling `.reverse_complement()` will change it to a byte string). We don't care as long as they're BioPython objects; only when we get the sequence out must we have it as a unicode string. Their contents are always in the ASCII range anyway. Although `Bio.Seq.reverse_complement` works fine on Python byte strings (and we used to rely on that), it crashes on a Python unicode string. So we take care to only use it on BioPython sequence objects and have written our own reverse complement function for unicode strings (`mutalyzer.util.reverse_complement`). 
As for point 5, SQLAlchemy already does a very good job of decoding from and encoding to UTF8 for us. The Spyne documentation has the following to say about their `String` and `Unicode` types [3]: > There are two string types in Spyne: `spyne.model.primitive.Unicode` and > `spyne.model.primitive.String` whose native types are `unicode` and `str` > respectively. > > Unlike the Python `str`, the Spyne `String` is not for arbitrary byte > streams. You should not use it unless you are absolutely, positively sure > that you need to deal with text data with an unknown encoding. In all other > cases, you should just use the `Unicode` type. They actually look the same > from outside, this distinction is made just to properly deal with the quirks > surrounding Python-2's `unicode` type. > > Remember that you have the `ByteArray` and `File` types at your disposal > when you need to deal with arbitrary byte streams. > > The `String` type will be just an alias for `Unicode` once Spyne gets ported > to Python 3. It might even be deprecated and removed in the future, so make > sure you are using either `Unicode` or `ByteArray` in your interface > definitions. So let's not ignore that and never use `String` anymore in our webservice interface. For the command line interface it's a bit more complicated, since there seems to be no reliable way to get the encoding of command line arguments. We use `sys.stdin.encoding` as a best guess. For us to interpret a sequence of bytes as text, it's key to be aware of its encoding. Once decoded, a text string can be safely used without having to worry about bytes. Without unicode we're nothing, and nothing will help us. Maybe we're lying, then you better not stay. But we could be safer, just for one day. Oh-oh-oh-ohh, oh-oh-oh-ohh, just for one day. 
[1] https://docs.python.org/2.7/howto/pyporting.html [2] http://python-future.org/unicode_literals.html [3] http://spyne.io/docs/2.10/manual/03_types.html#strings --- extras/log-tools/find-crashes.py | 2 + extras/monitor/mutalyzer-monitor.py | 2 + extras/soap-tools/batchjob.py | 2 + extras/soap-tools/checkSyntax.py | 2 + extras/soap-tools/chromAccession.py | 2 + extras/soap-tools/descriptionExtract.py | 2 + extras/soap-tools/getCache.py | 2 + extras/soap-tools/getGeneAndTranscript.py | 2 + extras/soap-tools/getGeneName.py | 2 + extras/soap-tools/getTranscripts.py | 2 + extras/soap-tools/getTranscriptsAndInfo.py | 2 + extras/soap-tools/getTranscriptsByGeneName.py | 2 + extras/soap-tools/getTranscriptsMapping.py | 2 + extras/soap-tools/getdbSNPDescriptions.py | 2 + extras/soap-tools/info.py | 2 + extras/soap-tools/mappingInfo.py | 2 + extras/soap-tools/numberConversion.py | 2 + extras/soap-tools/runMutalyzer.py | 2 + extras/soap-tools/sliceChromosomeByGene.py | 2 + extras/soap-tools/sp.py | 2 + extras/soap-tools/transcriptInfo.py | 2 + migrations/script.py.mako | 2 + ...fix_grcm38_chromosome_accession_number_.py | 2 + .../versions/ea660b66f26_initial_schema.py | 2 + mutalyzer/Crossmap.py | 30 +- mutalyzer/File.py | 27 +- mutalyzer/GenRecord.py | 36 +- mutalyzer/Retriever.py | 98 +-- mutalyzer/Scheduler.py | 17 +- mutalyzer/__init__.py | 3 + mutalyzer/announce.py | 2 + mutalyzer/config/__init__.py | 2 + mutalyzer/config/default_settings.py | 3 + mutalyzer/db/__init__.py | 2 + mutalyzer/db/models.py | 6 +- mutalyzer/db/queries.py | 2 + mutalyzer/describe.py | 69 +- mutalyzer/describe_c.py | 587 ------------------ mutalyzer/entrypoints/__init__.py | 15 + mutalyzer/entrypoints/admin.py | 30 +- mutalyzer/entrypoints/batch_processor.py | 3 +- mutalyzer/entrypoints/mutalyzer.py | 6 +- mutalyzer/entrypoints/service_json.py | 10 +- mutalyzer/entrypoints/service_soap.py | 10 +- mutalyzer/entrypoints/website.py | 11 +- mutalyzer/grammar.py | 24 +- mutalyzer/mapping.py | 66 +- 
mutalyzer/models.py | 2 + mutalyzer/mutator.py | 39 +- mutalyzer/output.py | 26 +- mutalyzer/parsers/__init__.py | 3 + mutalyzer/parsers/genbank.py | 34 +- mutalyzer/parsers/lrg.py | 18 +- mutalyzer/redisclient.py | 6 +- mutalyzer/services/__init__.py | 3 + mutalyzer/services/json.py | 2 + mutalyzer/services/rpc.py | 60 +- mutalyzer/services/soap.py | 2 + mutalyzer/stats.py | 5 +- mutalyzer/sync.py | 18 +- mutalyzer/util.py | 198 ++++-- mutalyzer/variantchecker.py | 233 +++---- mutalyzer/website/__init__.py | 2 + mutalyzer/website/templates/base.html | 2 +- mutalyzer/website/views.py | 61 +- tests/fixtures.py | 2 + tests/old/lrgtest.py | 1 + tests/old/maptest.py | 1 + tests/old/recordtest.py | 7 +- tests/test_crossmap.py | 2 + tests/test_describe.py | 2 + tests/test_grammar.py | 2 + tests/test_mapping.py | 2 + tests/test_mutator.py | 138 ++-- tests/test_parsers_genbank.py | 2 + tests/test_scheduler.py | 6 +- tests/test_services_json.py | 8 +- tests/test_services_soap.py | 22 +- tests/test_variantchecker.py | 2 + tests/test_website.py | 12 +- tests/utils.py | 2 + 81 files changed, 844 insertions(+), 1188 deletions(-) delete mode 100755 mutalyzer/describe_c.py diff --git a/extras/log-tools/find-crashes.py b/extras/log-tools/find-crashes.py index 0e6d791e..cf6ba986 100755 --- a/extras/log-tools/find-crashes.py +++ b/extras/log-tools/find-crashes.py @@ -9,6 +9,8 @@ crashed. 
""" +from __future__ import unicode_literals + import os from mutalyzer import config diff --git a/extras/monitor/mutalyzer-monitor.py b/extras/monitor/mutalyzer-monitor.py index b5ea49fd..43e49abc 100755 --- a/extras/monitor/mutalyzer-monitor.py +++ b/extras/monitor/mutalyzer-monitor.py @@ -15,6 +15,8 @@ Currently implemented checks: """ +from __future__ import unicode_literals + import argparse import logging import sys diff --git a/extras/soap-tools/batchjob.py b/extras/soap-tools/batchjob.py index 7558b98d..de11bc2a 100755 --- a/extras/soap-tools/batchjob.py +++ b/extras/soap-tools/batchjob.py @@ -17,6 +17,8 @@ to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/checkSyntax.py b/extras/soap-tools/checkSyntax.py index 78c63e5c..a2bf32d7 100755 --- a/extras/soap-tools/checkSyntax.py +++ b/extras/soap-tools/checkSyntax.py @@ -12,6 +12,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/chromAccession.py b/extras/soap-tools/chromAccession.py index 4fb6e04f..457277d8 100755 --- a/extras/soap-tools/chromAccession.py +++ b/extras/soap-tools/chromAccession.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/descriptionExtract.py b/extras/soap-tools/descriptionExtract.py index 7ca3b2ec..3889ca41 100755 --- a/extras/soap-tools/descriptionExtract.py +++ b/extras/soap-tools/descriptionExtract.py @@ -14,6 +14,8 @@ service and printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getCache.py b/extras/soap-tools/getCache.py index 2f9c7df2..07a86818 100755 --- a/extras/soap-tools/getCache.py +++ b/extras/soap-tools/getCache.py @@ -12,6 +12,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneAndTranscript.py b/extras/soap-tools/getGeneAndTranscript.py index 8946d59e..e4ba939b 100755 --- a/extras/soap-tools/getGeneAndTranscript.py +++ b/extras/soap-tools/getGeneAndTranscript.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneName.py b/extras/soap-tools/getGeneName.py index e3b7dd01..ad4ce8c4 100755 --- a/extras/soap-tools/getGeneName.py +++ b/extras/soap-tools/getGeneName.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscripts.py b/extras/soap-tools/getTranscripts.py index 51052fca..82af3219 100755 --- a/extras/soap-tools/getTranscripts.py +++ b/extras/soap-tools/getTranscripts.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsAndInfo.py b/extras/soap-tools/getTranscriptsAndInfo.py index 86dc3ff4..12b94d86 100755 --- a/extras/soap-tools/getTranscriptsAndInfo.py +++ b/extras/soap-tools/getTranscriptsAndInfo.py @@ -14,6 +14,8 @@ and printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsByGeneName.py b/extras/soap-tools/getTranscriptsByGeneName.py index d7789a0a..f31ff6ba 100755 --- a/extras/soap-tools/getTranscriptsByGeneName.py +++ b/extras/soap-tools/getTranscriptsByGeneName.py @@ -12,6 +12,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsMapping.py b/extras/soap-tools/getTranscriptsMapping.py index 79683369..891dfa75 100755 --- a/extras/soap-tools/getTranscriptsMapping.py +++ b/extras/soap-tools/getTranscriptsMapping.py @@ -16,6 +16,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getdbSNPDescriptions.py b/extras/soap-tools/getdbSNPDescriptions.py index f5745533..5be99c73 100755 --- a/extras/soap-tools/getdbSNPDescriptions.py +++ b/extras/soap-tools/getdbSNPDescriptions.py @@ -12,6 +12,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/info.py b/extras/soap-tools/info.py index eb3cd058..1a4ea6e4 100755 --- a/extras/soap-tools/info.py +++ b/extras/soap-tools/info.py @@ -10,6 +10,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/mappingInfo.py b/extras/soap-tools/mappingInfo.py index 49fb4ac4..7a473b1c 100755 --- a/extras/soap-tools/mappingInfo.py +++ b/extras/soap-tools/mappingInfo.py @@ -14,6 +14,8 @@ printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/numberConversion.py b/extras/soap-tools/numberConversion.py index 977bbc71..bd5262f4 100755 --- a/extras/soap-tools/numberConversion.py +++ b/extras/soap-tools/numberConversion.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/runMutalyzer.py b/extras/soap-tools/runMutalyzer.py index 0a2d1e75..475cc6c1 100755 --- a/extras/soap-tools/runMutalyzer.py +++ b/extras/soap-tools/runMutalyzer.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sliceChromosomeByGene.py b/extras/soap-tools/sliceChromosomeByGene.py index 8e24c54d..c4e0e418 100755 --- a/extras/soap-tools/sliceChromosomeByGene.py +++ b/extras/soap-tools/sliceChromosomeByGene.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sp.py b/extras/soap-tools/sp.py index d395d199..a2fd0be4 100755 --- a/extras/soap-tools/sp.py +++ b/extras/soap-tools/sp.py @@ -11,6 +11,8 @@ # This code is in the public domain; it can be used for whatever purpose # with absolutely no restrictions. +from __future__ import unicode_literals + import sys from SOAPpy import WSDL diff --git a/extras/soap-tools/transcriptInfo.py b/extras/soap-tools/transcriptInfo.py index d25d361a..bd9c14e8 100755 --- a/extras/soap-tools/transcriptInfo.py +++ b/extras/soap-tools/transcriptInfo.py @@ -12,6 +12,8 @@ and printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/migrations/script.py.mako b/migrations/script.py.mako index 95702017..56af6fd8 100644 --- a/migrations/script.py.mako +++ b/migrations/script.py.mako @@ -6,6 +6,8 @@ Create Date: ${create_date} """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = ${repr(up_revision)} down_revision = ${repr(down_revision)} diff --git a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py index ca664e56..10ed1f8b 100644 --- a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py +++ b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py @@ -6,6 +6,8 @@ Create Date: 2014-10-08 15:10:21.522551 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = '402ff01b0d5d' down_revision = 'ea660b66f26' diff --git a/migrations/versions/ea660b66f26_initial_schema.py b/migrations/versions/ea660b66f26_initial_schema.py index d0d474ed..eec6ce6a 100644 --- a/migrations/versions/ea660b66f26_initial_schema.py +++ b/migrations/versions/ea660b66f26_initial_schema.py @@ -6,6 +6,8 @@ Create Date: 2014-02-04 18:38:28.416032 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = 'ea660b66f26' down_revision = None diff --git a/mutalyzer/Crossmap.py b/mutalyzer/Crossmap.py index 0fb166dc..0de7ce3a 100644 --- a/mutalyzer/Crossmap.py +++ b/mutalyzer/Crossmap.py @@ -10,6 +10,8 @@ and stop and the orientation of a transcript. #Public classes: # - Crossmap ; Convert from g. to c. or n. notation or vice versa. +from __future__ import unicode_literals + class Crossmap() : """ Convert from I{g.} to I{c.} or I{n.} notation or vice versa. 
@@ -406,13 +408,13 @@ class Crossmap() : @type a: integer @return: The converted notation (may be unaltered) - @rtype: string + @rtype: unicode """ if a > self.__STOP : - return '*' + str(a - self.__STOP) + return '*' + unicode(a - self.__STOP) - return str(a) + return unicode(a) #int2main def main2int(self, s) : @@ -423,7 +425,7 @@ class Crossmap() : - __STOP ; CDS stop in I{c.} notation. @arg s: A string in '*' notation - @type s: string + @type s: unicode @return: The converted notation (may be unaltered) @rtype: integer @@ -447,20 +449,20 @@ class Crossmap() : @type fuzzy: bool @return: The offset in HGVS notation - @rtype: string + @rtype: unicode """ if t[1] > 0 : # The exon boundary is downstream. if fuzzy: return '+?' if t[0] >= self.__trans_end : # It is downstream of the last exon. - return "+d" + str(t[1]) - return '+' + str(t[1]) + return "+d" + unicode(t[1]) + return '+' + unicode(t[1]) #if if t[1] < 0 : # The exon boundary is uptream. if fuzzy: return '-?' if t[0] <= self.__trans_start : # It is upstream of the first exon. - return "-u" + str(-t[1]) - return str(t[1]) + return "-u" + unicode(-t[1]) + return unicode(t[1]) #if return '' # No offset was given. #int2offset @@ -472,7 +474,7 @@ class Crossmap() : sensible. 
@arg s: An offset in HGVS notation - @type s: string + @type s: unicode @return: The offset as an integer @rtype: integer @@ -505,12 +507,12 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ if t[0] >= self.__trans_end or t[0] <= self.__trans_start: - return str(self.int2main(self.__minus(t[0], -t[1]))) - return str(self.int2main(t[0])) + str(self.int2offset(t, fuzzy)) + return unicode(self.int2main(self.__minus(t[0], -t[1]))) + return unicode(self.int2main(t[0])) + unicode(self.int2offset(t, fuzzy)) #tuple2string def g2c(self, a, fuzzy=False) : @@ -525,7 +527,7 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ return self.tuple2string(self.g2x(a), fuzzy) #g2c diff --git a/mutalyzer/File.py b/mutalyzer/File.py index b95f0301..3de998da 100644 --- a/mutalyzer/File.py +++ b/mutalyzer/File.py @@ -16,6 +16,8 @@ Module for parsing CSV files and spreadsheets. # - File ; Parse CSV files and spreadsheets. +from __future__ import unicode_literals + import magic # open(), MAGIC_MIME, MAGIC_NONE import csv # Sniffer(), reader(), Error import xlrd # open_workbook() @@ -23,10 +25,7 @@ import zipfile # ZipFile() import xml.dom.minidom # parseString() import os # remove() import tempfile -import types # UnicodeType -from cStringIO import StringIO -from mutalyzer import util from mutalyzer.config import settings @@ -173,10 +172,10 @@ class File() : for i in range(sheet.nrows) : row = [] for j in sheet.row_values(i) : - if type(j) == types.UnicodeType : # Convert the data to strings. 
- row.append(j.encode("utf8")) - else : - row.append(str(j)) + if isinstance(j, unicode): + row.append(j) + else: + row.append(j.decode('utf-8')) #for ret.append(row) #for @@ -209,7 +208,7 @@ class File() : for j in i.getElementsByTagName("table:table-cell") : c = j.getElementsByTagName("text:p") if c : - row.append(c[0].lastChild.data.encode("utf8")) + row.append(c[0].lastChild.data) #if #for ret.append(row) @@ -346,19 +345,19 @@ class File() : @arg handle: A handle to a stream @type handle: stream - @return: The mime type of a file - @rtype: string + @return: The mime type of a file and a textual description. + @rtype: unicode, unicode """ handle.seek(0) buf = handle.read(BUFFER_SIZE) MagicInstance = magic.open(magic.MAGIC_MIME) MagicInstance.load() - mimeType = MagicInstance.buffer(buf).split(';')[0] + mimeType = MagicInstance.buffer(buf).decode('utf-8').split(';')[0] MagicInstance.close() MagicInstance = magic.open(magic.MAGIC_NONE) MagicInstance.load() - description = MagicInstance.buffer(buf) + description = MagicInstance.buffer(buf).decode('utf-8') del MagicInstance handle.seek(0) @@ -419,9 +418,9 @@ def makeList(l, maxlen=10): @arg maxlen: maximum length of the string you want to return @type maxlen: integer @return: a list converted to a string with comma's and spaces - @rtype: string + @rtype: unicode """ - ret = ", ".join(str(i) for i in l[:maxlen]) + ret = ", ".join(i for i in l[:maxlen]) if len(l)>maxlen: return ret+", ..." else: diff --git a/mutalyzer/GenRecord.py b/mutalyzer/GenRecord.py index b30ed800..5a729f73 100644 --- a/mutalyzer/GenRecord.py +++ b/mutalyzer/GenRecord.py @@ -15,7 +15,7 @@ search for them each time. # - GenRecord ; Convert a GenBank record to a nested dictionary. -import Bio +from __future__ import unicode_literals from mutalyzer import util from mutalyzer import Crossmap @@ -85,7 +85,7 @@ class Locus(object) : - CM ; A Crossmap object. 
@arg name: identifier of the locus - @type name: string + @type name: unicode """ self.name = name @@ -131,7 +131,7 @@ class Locus(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description: # Don't change anything if we already have an unknown value. @@ -170,7 +170,7 @@ class Gene(object) : - __locusTag ; @arg name: gene name - @type name: string + @type name: unicode """ self.name = name @@ -199,14 +199,14 @@ class Gene(object) : Find a transcript, given its name. @arg name: transcript variant number - @type name: string + @type name: unicode @return: transcript @rtype: object """ for i in self.transcriptList : - if i.name == name or i.name == str("%03i" % int(name)): + if i.name == name or i.name == "%03i" % int(name): return i return None #findLocus @@ -230,7 +230,7 @@ class Gene(object) : Look in the list of transcripts for a given protein accession number. @arg protAcc: protein accession number - @type protAcc: string + @type protAcc: unicode @return: transcript @rtype: object @@ -300,7 +300,7 @@ class Record(object) : Returns a Gene object, given its name. @arg name: Gene name - @type name: string + @type name: unicode @return: Gene object @rtype: object @@ -332,7 +332,7 @@ class Record(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description : @@ -469,18 +469,18 @@ class GenRecord() : @arg gene: Gene @type gene: object @arg string: DNA sequence - @type string: string + @type string: unicode @kwarg string_reverse: DNA sequence to use (if not None) for the reverse complement. @return: reverse-complement (if applicable), otherwise return the original. 
- @rtype: string + @rtype: unicode """ if gene.orientation == -1: if string_reverse: string = string_reverse - return Bio.Seq.reverse_complement(string) + return util.reverse_complement(string) return string #__maybeInvert @@ -639,15 +639,15 @@ class GenRecord() : @arg stop_g: stop position @type stop_g: integer @arg varType: variant type - @type varType: string + @type varType: unicode @arg arg1: argument 1 of a raw variant - @type arg1: string + @type arg1: unicode @arg arg2: argument 2 of a raw variant - @type arg2: string + @type arg2: unicode @arg roll: ??? @type roll: tuple (integer, integer) @kwarg arg1_reverse: argument 1 to be used on reverse strand - @type arg1_reverse: string + @type arg1_reverse: unicode @kwarg start_fuzzy: Indicates if start position of variant is fuzzy. @type start_fuzzy: bool @kwarg stop_fuzzy: Indicates if stop position of variant is fuzzy. @@ -666,8 +666,8 @@ class GenRecord() : else: chromStart = self.record.toChromPos(stop_g) chromStop = self.record.toChromPos(start_g) - chromArg1 = Bio.Seq.reverse_complement(arg1) - chromArg2 = Bio.Seq.reverse_complement(arg2) + chromArg1 = util.reverse_complement(arg1) + chromArg2 = util.reverse_complement(arg2) # Todo: Should we use arg1_reverse here? if roll : diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 5fa91eeb..deb645ae 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -10,6 +10,9 @@ Public classes: """ +from __future__ import unicode_literals + +import codecs import os # path.isfile(), link() path.isdir(), path.mkdir(), # walk(), path.getsize(), path.join(), stat(), remove() import time @@ -84,10 +87,10 @@ class Retriever(object) : Convert an accession number to a filename. @arg name: The accession number - @type name: string + @type name: unicode @return: A filename - @rtype: string + @rtype: unicode """ return os.path.join(settings.CACHE_DIR, name + "." 
+ self.fileType + ".bz2") #_nametofile @@ -99,11 +102,13 @@ class Retriever(object) : @arg raw_data: The raw_data to be compressed and written @type raw_data: string @arg filename: The intended name of the outfile - @type filename: string + @type filename: unicode @return: outfile ; The full path and name of the file written - @rtype: string + @rtype: unicode """ + # Todo: Should we write a utf-8 encoded genbank file? Not even sure + # what type `raw_data` is... # Compress the data to save disk space. comp = bz2.BZ2Compressor() data = comp.compress(raw_data) @@ -115,15 +120,16 @@ class Retriever(object) : return out_handle.name # return the full path to the file #_write + # Todo: check callers; argument should be a byte string def _calcHash(self, content) : """ Calculate the md5sum of a piece of text. @arg content: Arbitrary text - @type content: string + @type content: byte string @return: The md5sum of 'content' - @rtype: string + @rtype: unicode """ hashfunc = hashlib.md5() @@ -131,7 +137,7 @@ class Retriever(object) : md5sum = hashfunc.hexdigest() del hashfunc - return md5sum + return unicode(md5sum) #_calcHash def _newUD(self) : @@ -139,11 +145,11 @@ class Retriever(object) : Make a new UD number based on the current time (seconds since 1970). @return: A new UD number - @rtype: string + @rtype: unicode """ UD = util.generate_id() - return "UD_" + str(UD) + return "UD_" + unicode(UD) #_newUD def _updateDBmd5(self, raw_data, name, GI): @@ -159,7 +165,7 @@ class Retriever(object) : @type GI: @return: filename - @rtype: string + @rtype: unicode """ try: reference = Reference.query.filter_by(accession=name).one() @@ -191,10 +197,10 @@ class Retriever(object) : it. @arg rsId: The rsId of the SNP (example: 'rs9919552'). - @type rsId: string + @type rsId: unicode @return: A list of HGVS notations. - @rtype: list(string) + @rtype: list(unicode) """ # A simple input check. 
id = rs_id[2:] @@ -223,7 +229,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error connecting to dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IOError: %s' % str(e)) + 'IOError: %s' % unicode(e)) return [] try: @@ -232,7 +238,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error reading from dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IncompleteRead: %s' % str(e)) + 'IncompleteRead: %s' % unicode(e)) return [] if response_text == '\n': @@ -251,7 +257,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ 'error. Error parsing result XML.') self._output.addMessage(__file__, -1, 'INFO', - 'ExpatError: %s' % str(e)) + 'ExpatError: %s' % unicode(e)) self._output.addMessage(__file__, -1, 'INFO', 'Result from dbSNP: %s' % response_text) return [] @@ -265,7 +271,7 @@ class Retriever(object) : snps = [] for i in rs.getElementsByTagName('hgvs'): - snps.append(i.lastChild.data.encode('utf8')) + snps.append(i.lastChild.data) return snps #snpConvert @@ -286,6 +292,7 @@ class GenBankRetriever(Retriever): # Child specific init #__init__ + # todo: raw_data must always be a byte string def write(self, raw_data, filename, extract) : """ Write raw data to a file. The data is parsed before writing, if a @@ -300,7 +307,7 @@ class GenBankRetriever(Retriever): @arg raw_data: The data @type raw_data: string @arg filename: The intended name of the file. 
- @type filename: string + @type filename: unicode @arg extract: Flag that indicates whether to extract the record ID and GI number: - 0 ; Do not extract, use 'filename' @@ -310,7 +317,7 @@ class GenBankRetriever(Retriever): @return: tuple ; Depending on the value of 'extract': - 0 ; ('filename', None) - 1 ; (id, GI) - @rtype: tuple (string, string) + @rtype: tuple (unicode, unicode) """ if raw_data == "\nNothing has been found\n" : @@ -378,7 +385,7 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -409,7 +416,7 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -438,7 +445,7 @@ class GenBankRetriever(Retriever): as filename. @arg accno: The accession number of the chromosome - @type accno: string + @type accno: unicode @arg start: Start position of the slice @type start: integer @arg stop: End position of the slice. @@ -450,7 +457,7 @@ class GenBankRetriever(Retriever): @type orientation: integer @return: An UD number - @rtype: string + @rtype: unicode """ # Not a valid slice. 
@@ -483,7 +490,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve slice.') return None @@ -512,7 +519,7 @@ class GenBankRetriever(Retriever): #else if self.write(raw_data, reference.accession, 0): - return str(reference.accession) + return reference.accession #retrieveslice def retrievegene(self, gene, organism, upstream, downstream) : @@ -521,9 +528,9 @@ class GenBankRetriever(Retriever): slice if the gene can be found. @arg gene: Name of the gene - @type gene: string + @type gene: unicode @arg organism: The organism in which we search. - @type organism: string + @type organism: unicode @arg upstream: Number of upstream nucleotides for the slice. @type upstream: integer @arg downstream: Number of downstream nucleotides for the slice. @@ -549,7 +556,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esearch: %s' % str(e)) + 'Error connecting to Entrez esearch: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not search for gene %s.' % gene) return None @@ -571,7 +578,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esummary: %s' % str(e)) + 'Error connecting to Entrez esummary: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not get mapping information for gene %s.' % gene) return None @@ -631,10 +638,10 @@ class GenBankRetriever(Retriever): is used. 
@arg url: Location of a GenBank record - @type url: string + @type url: unicode @return: UD or None - @rtype: string + @rtype: unicode """ handle = urllib2.urlopen(url) info = handle.info() @@ -651,14 +658,14 @@ class GenBankRetriever(Retriever): except NoResultFound: UD = self._newUD() if not os.path.isfile(self._nametofile(UD)): - UD = self.write(raw_data, UD, 0) and str(UD) + UD = self.write(raw_data, UD, 0) and UD if UD: #Parsing went OK, add to DB reference = Reference(UD, md5sum, download_url=url) session.add(reference) session.commit() else: if not os.path.isfile(self._nametofile(reference.accession)): - UD = self.write(raw_data, reference.accession, 0) and str(reference.accession) + UD = self.write(raw_data, reference.accession, 0) and reference.accession return UD #Returns the UD or None #if @@ -682,10 +689,10 @@ class GenBankRetriever(Retriever): is used. @arg raw_data: A GenBank record - @type raw_data: string + @type raw_data: byte string - @return: - @rtype: string????? + @return: Accession number for the uploaded file. + @rtype: unicode """ md5sum = self._calcHash(raw_data) @@ -702,7 +709,7 @@ class GenBankRetriever(Retriever): if os.path.isfile(self._nametofile(reference.accession)): return reference.accession else: - return self.write(raw_data, reference.accession, 0) and str(reference.accession) + return self.write(raw_data, reference.accession, 0) and reference.accession #uploadrecord def loadrecord(self, identifier): @@ -718,7 +725,7 @@ class GenBankRetriever(Retriever): 3. Fetched from the NCBI. :arg identifier: A RefSeq accession number or geninfo identifier (GI). - :type identifier: string + :type identifier: unicode :return: A parsed RefSeq record or `None` if no record could be found for the given identifier. 
@@ -830,7 +837,7 @@ class LRGRetriever(Retriever): Load and parse a LRG file based on the identifier @arg identifier: The name of the LRG file to read - @type identifier: string + @type identifier: unicode @return: record ; GenRecord.Record of LRG file None ; in case of failure @@ -850,6 +857,7 @@ class LRGRetriever(Retriever): # Now we have the file, so we can parse it. file_handle = bz2.BZ2File(filename, "r") + file_handle = codecs.getreader('utf-8')(file_handle) #create GenRecord.Record from LRG file record = lrg.create_record(file_handle.read()) @@ -870,10 +878,10 @@ class LRGRetriever(Retriever): from the pending section. @arg name: The name of the LRG file to fetch - @type name: string + @type name: unicode @return: the full path to the file; None in case of an error - @rtype: string + @rtype: unicode """ prefix = settings.LRG_PREFIX_URL @@ -901,12 +909,12 @@ class LRGRetriever(Retriever): Download an LRG record from an URL. @arg url: Location of the LRG record - @type url: string + @type url: unicode @return: - filename ; The full path to the file - None ; in case of failure - @rtype: string + @rtype: unicode """ lrgID = name or os.path.splitext(os.path.split(url)[1])[0] @@ -914,6 +922,8 @@ class LRGRetriever(Retriever): # return None filename = self._nametofile(lrgID) + # Todo: Properly read the file contents to a unicode string and write + # it utf-8 encoded. handle = urllib2.urlopen(url) info = handle.info() if info["Content-Type"] == "application/xml" and info.has_key("Content-length"): @@ -970,12 +980,12 @@ class LRGRetriever(Retriever): @arg raw_data: The data @type raw_data: string @arg filename: The intended name of the file - @type filename: string + @type filename: unicode @return: - filename ; The full path and name of the file written - None ; In case of an error - @rtype: string + @rtype: unicode """ # Dirty way to test if a file is valid, # Parse the file to see if it's a real LRG file. 
diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index e6f102d3..ee7223a9 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -15,13 +15,14 @@ Module used to add and manage the Batch Jobs. # - Batch Syntax Checker # - Batch Position Converter +from __future__ import unicode_literals + import os # os.path.exists import smtplib # smtplib.STMP from email.mime.text import MIMEText # MIMEText from sqlalchemy import func from sqlalchemy.orm.exc import NoResultFound -import mutalyzer from mutalyzer.config import settings from mutalyzer.db import queries, session from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem @@ -88,9 +89,9 @@ class Scheduler() : @todo: Handle Connection errors in a try, except clause @arg mailTo: The batch job submitter - @type mailTo: string + @type mailTo: unicode @arg url: The url containing the results - @type url: string + @type url: unicode """ if settings.TESTING: return @@ -410,7 +411,7 @@ Mutalyzer batch scheduler""" % url) O.addMessage(__file__, 4, "EBATCHU", "Unexpected error occurred, dev-team notified") import traceback - O.addMessage(__file__, 4, "DEBUG", repr(traceback.format_exc())) + O.addMessage(__file__, 4, "DEBUG", unicode(repr(traceback.format_exc()))) #except finally : #check if we need to update the database @@ -535,11 +536,11 @@ Mutalyzer batch scheduler""" % url) - Output written to outputfile. 
@arg cmd: The Syntax Checker input - @type cmd: string + @type cmd: unicode @arg i: The JobID @type i: integer @arg build: The build to use for the converter - @type build: string + @type build: unicode @arg flags: Flags of the current entry @type flags: """ @@ -562,7 +563,7 @@ Mutalyzer batch scheduler""" % url) assembly = Assembly.by_name_or_alias(batch_job.argument) except NoResultFound: O.addMessage(__file__, 3, 'ENOASSEMBLY', - 'Not a valid assembly: ' + str(batch_job.argument)) + 'Not a valid assembly: ' + batch_job.argument) raise converter = Converter(assembly, O) @@ -704,7 +705,7 @@ Mutalyzer batch scheduler""" % url) Add a job to the Database and start the BatchChecker. @arg email: e-mail address of batch supplier - @type email: string + @type email: unicode @arg queue: A list of jobs @type queue: list @arg columns: The number of columns. diff --git a/mutalyzer/__init__.py b/mutalyzer/__init__.py index e3c80aa3..6968d5ff 100644 --- a/mutalyzer/__init__.py +++ b/mutalyzer/__init__.py @@ -3,6 +3,9 @@ HGVS variant nomenclature checker. """ +from __future__ import unicode_literals + + # We follow a versioning scheme compatible with setuptools [1] where the # package version is always that of the upcoming release (and not that of the # previous release), post-fixed with ``.dev``. Only in a release commit, the diff --git a/mutalyzer/announce.py b/mutalyzer/announce.py index d8acbe4d..9adbf791 100644 --- a/mutalyzer/announce.py +++ b/mutalyzer/announce.py @@ -7,6 +7,8 @@ fast, it can be done on every website pageview without problems. """ +from __future__ import unicode_literals + from mutalyzer.redisclient import client diff --git a/mutalyzer/config/__init__.py b/mutalyzer/config/__init__.py index def4630b..462a490e 100644 --- a/mutalyzer/config/__init__.py +++ b/mutalyzer/config/__init__.py @@ -12,6 +12,8 @@ be used. 
""" +from __future__ import unicode_literals + import collections import os diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index 43009e09..00dc9b2e 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -4,6 +4,9 @@ pointed-to by the `MUTALYZER_SETTINGS` environment variable. """ +from __future__ import unicode_literals + + # Use Mutalyzer in debug mode. DEBUG = False diff --git a/mutalyzer/db/__init__.py b/mutalyzer/db/__init__.py index b2192186..71e8eaf5 100644 --- a/mutalyzer/db/__init__.py +++ b/mutalyzer/db/__init__.py @@ -4,6 +4,8 @@ using SQLAlchemy. """ +from __future__ import unicode_literals + import sqlalchemy from sqlalchemy.engine.url import make_url from sqlalchemy.ext.declarative import declarative_base diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 4119fa99..faa0754c 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -3,6 +3,8 @@ Models backed by SQL using SQLAlchemy. """ +from __future__ import unicode_literals + from datetime import datetime import sqlite3 import uuid @@ -50,7 +52,7 @@ class Positions(TypeDecorator): def process_bind_param(self, value, dialect): if value is not None: - value = ','.join(str(i) for i in value) + value = ','.join(unicode(i) for i in value) return value def process_result_value(self, value, dialect): @@ -98,7 +100,7 @@ class BatchJob(db.Base): self.email = email self.download_url = download_url self.argument = argument - self.result_id = str(uuid.uuid4()) + self.result_id = unicode(uuid.uuid4()) self.added = datetime.now() def __repr__(self): diff --git a/mutalyzer/db/queries.py b/mutalyzer/db/queries.py index afdd2a44..7c54d137 100644 --- a/mutalyzer/db/queries.py +++ b/mutalyzer/db/queries.py @@ -7,6 +7,8 @@ Queries on database models. # the models they work with. 
+from __future__ import unicode_literals + from datetime import datetime, timedelta from sqlalchemy import and_, or_ diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py index 37fb60c2..d81254c3 100644 --- a/mutalyzer/describe.py +++ b/mutalyzer/describe.py @@ -7,13 +7,14 @@ leading from one sequence to an other. @requires: Bio.Seq """ +from __future__ import unicode_literals + import collections -from Bio import Seq from Bio.SeqUtils import seq3 from Bio.Data import CodonTable from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll +from mutalyzer.util import palinsnoop, roll, reverse_complement from mutalyzer import models @@ -34,9 +35,9 @@ class LCS(object): Initialise the class. @arg s1: A string. - @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @arg lcp: The length of the longest common prefix of {s1} and {s2}. @type lcp: int @arg s1_end: End of the substring in {s1}. @@ -55,21 +56,21 @@ class LCS(object): self.__s2_rc = None self.__matrix_rc = None if DNA: - self.__s2_rc = Seq.reverse_complement(s2[self.__lcp:s2_end]) + self.__s2_rc = reverse_complement(s2[self.__lcp:s2_end]) self.__matrix_rc = self.LCSMatrix(self.__s1, self.__s2_rc) #if #__init__ - def __str__(self): + def __unicode__(self): """ Return a graphical representation of the LCS matrix, mainly for debugging. @returns: A graphical representation of the LCS matrix. - @rtype: str + @rtype: unicode """ return self.visMatrix((0, len(self.__s1)), (0, len(self.__s2))) - #__str__ + #__unicode__ def visMatrix(self, r1, r2, rc=False): """ @@ -77,7 +78,7 @@ class LCS(object): debugging. @returns: A graphical representation of the LCS matrix. 
- @rtype: str + @rtype: unicode """ nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp @@ -91,7 +92,7 @@ class LCS(object): out = self.__delim.join(self.__delim + '-' + s2[nr2[0]:nr2[1]]) + '\n' for i in range(nr1[0], nr1[1] + 1): out += (('-' + self.__s1)[i] + self.__delim + - self.__delim.join(map(lambda x: str(M[i][x]), + self.__delim.join(map(lambda x: unicode(M[i][x]), range(nr2[0], nr2[1] + 1))) + '\n') return out @@ -102,9 +103,9 @@ class LCS(object): Calculate the Longest Common Substring matrix. @arg s1: A string. - @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @returns: A matrix with the LCS of {s1}[i], {s2}[j] at position i, j. @rval: list[list[int]] @@ -201,9 +202,9 @@ def __makeOverlaps(peptide): Make a list of overlapping 2-mers of {peptide} in order of appearance. @arg peptide: A peptide sequence. - @type peptide: str + @type peptide: unicode @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) + @rtype: list(unicode) """ return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) #__makeOverlaps @@ -213,13 +214,13 @@ def __options(pList, peptidePrefix, FS, output): Enumerate all peptides that could result from a frame shift. @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) + @type pList: list(unicode) @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str + @type peptidePrefix: unicode @arg FS: Frame shift table. @type FS: dict @arg output: List of peptides, should be empty initially. - @type output: list(str) + @type output: list(unicode) """ if not pList: output.append(peptidePrefix) @@ -234,7 +235,7 @@ def enumFS(peptide, FS): Enumerate all peptides that could result from a frame shift. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg FS: Frame shift table. 
@type FS: dict """ @@ -250,9 +251,9 @@ def fitFS(peptide, altPeptide, FS): {peptide}. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg altPeptide: Observed peptide sequence. - @type altPeptide: str + @type altPeptide: unicode @arg FS: Frame shift table. @type FS: dict """ @@ -302,11 +303,11 @@ class DescribeRawVar(models.RawVar): @arg end_offset: @type end_offset: int @arg type: Variant type. - @type type: str + @type type: unicode @arg deleted: Deleted part of the reference sequence. - @type deleted: str + @type deleted: unicode @arg inserted: Inserted part. - @type inserted: str + @type inserted: unicode @arg shift: Amount of freedom. @type shift: int """ @@ -336,7 +337,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if not self.start: return "=" @@ -365,7 +366,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if self.type == "unknown": return "?" @@ -491,7 +492,7 @@ def alleleDescription(allele): @type allele: list(DescribeRawVar) @returns: The HGVS description of {allele}. - @rval: str + @rval: unicode """ if len(allele) > 1: return "[%s]" % ';'.join(map(lambda x : x.hgvs, allele)) @@ -530,9 +531,9 @@ def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -682,9 +683,9 @@ def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. 
arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -810,15 +811,15 @@ def describe(original, mutated, DNA=True): Convenience function for DNA_description(). @arg original: - @type original: str + @type original: unicode @arg mutated: - @type mutated: str + @type mutated: unicode @returns: A list of DescribeRawVar objects, representing the allele. @rval: list(DescribeRawVar) """ - s1 = str(original) - s2 = str(mutated) + s1 = original + s2 = mutated lcp = len(longest_common_prefix(s1, s2)) lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) s1_end = len(s1) - lcs diff --git a/mutalyzer/describe_c.py b/mutalyzer/describe_c.py deleted file mode 100755 index 1da86f77..00000000 --- a/mutalyzer/describe_c.py +++ /dev/null @@ -1,587 +0,0 @@ -#!/usr/bin/python - -""" -Prototype of a module that can generate a HGVS description of the variant(s) -leading from one sequence to an other. - -@requires: Bio.Seq -""" -import collections -from Bio import Seq -from Bio.SeqUtils import seq3 -from Bio.Data import CodonTable - -from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll -from mutalyzer import models - -from extractor import extractor - -def makeFSTables(table_id): - """ - For every pair of amino acids, calculate the set of possible amino acids in - a different reading frame. Do this for both alternative reading frames (+1 - and +2). - - @arg table_id: Coding table ID. - @type table_id: int - @returns: Two dictionaries for the two alternative reading frames. - @rtype: tuple(dict, dict) - """ - # Make the forward translation table. - table = dict(CodonTable.unambiguous_dna_by_id[table_id].forward_table) - for i in CodonTable.unambiguous_dna_by_id[table_id].stop_codons: - table[i] = '*' - - # Make the reverse translation table. 
- reverse_table = collections.defaultdict(list) - for i in table: - reverse_table[table[i]].append(i) - - # Make the frame shift tables. - FS1 = collections.defaultdict(set) - FS2 = collections.defaultdict(set) - for AA_i in reverse_table: - for AA_j in reverse_table: - for codon_i in reverse_table[AA_i]: - for codon_j in reverse_table[AA_j]: - FS1[AA_i + AA_j].add(table[(codon_i + codon_j)[1:4]]) # +1. - FS2[AA_i + AA_j].add(table[(codon_i + codon_j)[2:5]]) # +2. - #for - return FS1, FS2 -#makeFSTables - -def __makeOverlaps(peptide): - """ - Make a list of overlapping 2-mers of {peptide} in order of appearance. - - @arg peptide: A peptide sequence. - @type peptide: str - @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) - """ - return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) -#__makeOverlaps - -def __options(pList, peptidePrefix, FS, output): - """ - Enumerate all peptides that could result from a frame shift. - - @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) - @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str - @arg FS: Frame shift table. - @type FS: dict - @arg output: List of peptides, should be empty initially. - @type output: list(str) - """ - if not pList: - output.append(peptidePrefix) - return - #if - for i in FS[pList[0]]: - __options(pList[1:], peptidePrefix + i, FS, output) -#__options - -def enumFS(peptide, FS): - """ - Enumerate all peptides that could result from a frame shift. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - output = [] - - __options(__makeOverlaps(peptide), "", FS, output) - return output -#enumFS - -def fitFS(peptide, altPeptide, FS): - """ - Check whether peptide {altPeptide} is a possible frame shift of peptide - {peptide}. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg altPeptide: Observed peptide sequence. 
- @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - # Todo: This is a temporary fix to prevent crashing on frameshift - # detection (I think bug #124). - return False - - if len(peptide) < len(altPeptide): - return False - - pList = __makeOverlaps(peptide) - - for i in range(len(altPeptide)): - if not altPeptide[i] in FS[pList[i]]: - return False - return True -#fitFS - -def findFS(peptide, altPeptide, FS): - """ - Find the longest part of {altPeptide} that fits in {peptide} in a certain - frame given by {FS}. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg altPeptide: Observed peptide sequence. - @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - - @returns: The length and the offset in {peptide} of the largest frameshift. - @rtype: tuple(int, int) - """ - pList = __makeOverlaps(peptide) - maxFS = 0 - fsStart = 0 - - for i in range(len(pList))[::-1]: - for j in range(min(i + 1, len(altPeptide))): - if not altPeptide[::-1][j] in FS[pList[i - j]]: - break - if j >= maxFS: - maxFS = j - fsStart = i - j + 2 - #if - #for - - return maxFS - 1, fsStart -#findFS - -class RawVar(models.RawVar): - """ - Container for a raw variant. - - To use this class correctly, do not supply more than the minimum amount of - data. The {description()} function may not work properly if too much - information is given. - - Example: if {end} is initialised for a substitution, a range will be - retuned, resulting in a description like: 100_100A>T - """ - - def __init__(self, DNA=True, start=0, start_offset=0, end=0, end_offset=0, - type="none", deleted="", inserted="", shift=0, startAA="", endAA="", - term=0): - """ - Initialise the class with the appropriate values. - - @arg start: Start position. - @type start: int - @arg start_offset: - @type start_offset: int - @arg end: End position. - @type end: int - @arg end_offset: - @type end_offset: int - @arg type: Variant type. 
- @type type: str - @arg deleted: Deleted part of the reference sequence. - @type deleted: str - @arg inserted: Inserted part. - @type inserted: str - @arg shift: Amount of freedom. - @type shift: int - """ - # TODO: Will this container be used for all variants, or only genomic? - # start_offset and end_offset may be never used. - self.DNA = DNA - self.start = start - self.start_offset = start_offset - self.end = end - self.end_offset = end_offset - self.type = type - self.deleted = deleted - self.inserted = inserted - self.shift = shift - self.startAA = startAA - self.endAA = endAA - self.term = term - self.update() - #self.hgvs = self.description() - #self.hgvsLength = self.descriptionLength() - #__init__ - - def __DNADescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if not self.start: - return "=" - - descr = "%i" % self.start - - if self.end: - descr += "_%i" % self.end - - if self.type != "subst": - descr += "%s" % self.type - - if self.inserted: - return descr + "%s" % self.inserted - return descr - #if - - return descr + "%s>%s" % (self.deleted, self.inserted) - #__DNADescription - - def __proteinDescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if self.type == "unknown": - return "?" 
- if not self.start: - return "=" - - descr = "" - if not self.deleted: - if self.type == "ext": - descr += '*' - else: - descr += "%s" % seq3(self.startAA) - #if - else: - descr += "%s" % seq3(self.deleted) - descr += "%i" % self.start - if self.end: - descr += "_%s%i" % (seq3(self.endAA), self.end) - if self.type not in ["subst", "stop", "ext", "fs"]: # fs is not a type - descr += self.type - if self.inserted: - descr += "%s" % seq3(self.inserted) - - if self.type == "stop": - return descr + '*' - if self.term: - return descr + "fs*%i" % self.term - return descr - #__proteinDescription - - def __DNADescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # `=' or `?' - return 1 - - descrLen = 1 # Start position. - - if self.end: # '_' and end position. - descrLen += 2 - - if self.type != "subst": - descrLen += len(self.type) - - if self.inserted: - return descrLen + len(self.inserted) - return descrLen - #if - - return 4 # Start position, '>' and end position. - #__DNAdescriptionLength - - def __proteinDescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # = - return 1 - - descrLen = 1 # Start position. - if not self.deleted and self.type == "ext": - descrLen += 1 # * - else: - descrLen += 3 # One amino acid. 
- if self.end: - descrLen += 5 # `_' + one amino acid + end position. - if self.type not in ["subst", "stop", "ext", "fs"]: - descrLen += len(self.type) - if self.inserted: - descrLen += 3 * len(self.inserted) - if self.type == "stop": - return descrLen + 1 # * - if self.term: - return descrLen + len(self.type) + 2 # `*' + length until stop. - return descrLen - #__proteinDescriptionLength - - def update(self): - """ - """ - self.hgvs = self.description() - self.hgvsLength = self.descriptionLength() - #update - - def description(self): - """ - """ - if self.DNA: - return self.__DNADescription() - return self.__proteinDescription() - #description - - def descriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if self.DNA: - return self.__DNADescriptionLength() - return self.__proteinDescriptionLength() - #descriptionLength -#RawVar - -def alleleDescription(allele): - """ - Convert a list of raw variants to an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The HGVS description of {allele}. - @rval: str - """ - if len(allele) > 1: - return "[%s]" % ';'.join(map(lambda x: x.hgvs, allele)) - return allele[0].hgvs -#alleleDescription - -def alleleDescriptionLength(allele): - """ - Calculate the standardised length of an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The standardised length of the HGVS description of {allele}. - @rval: int - """ - # NOTE: Do we need to count the ; and [] ? - return sum(map(lambda x: x.hgvsLength, allele)) -#alleleDescriptionLength - -def printpos(s, start, end, fill=0): - """ - For debugging purposes. 
- """ - # TODO: See if this can partially replace or be merged with the - # visualisation in the __mutate() function of mutator.py - fs = 10 # Flank size. - - return "%s %s%s %s" % (s[start - fs:start], s[start:end], '-' * fill, - s[end:end + fs]) -#printpos - -def var2RawVar(s1, s2, var, DNA=True): - """ - """ - # Unknown. - if s1 == '?' or s2 == '?': - return [RawVar(DNA=DNA, type="unknown")] - - # Insertion / Duplication. - if var.reference_start == var.reference_end: - ins_length = var.sample_end - var.sample_start - shift5, shift3 = roll(s2, var.sample_start + 1, var.sample_end) - shift = shift5 + shift3 - - var.reference_start += shift3 - var.reference_end += shift3 - var.sample_start += shift3 - var.sample_end += shift3 - - if (var.sample_start - ins_length >= 0 and - s1[var.reference_start - ins_length:var.reference_start] == - s2[var.sample_start:var.sample_end]): - - if ins_length == 1: - return RawVar(DNA=DNA, start=var.reference_start, type="dup", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start - ins_length + 1, - end=var.reference_end, type="dup", shift=shift) - #if - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="ins", - shift=shift) - #if - - # Deletion. - if var.sample_start == var.sample_end: - shift5, shift3 = roll(s1, var.reference_start + 1, var.reference_end) - shift = shift5 + shift3 - - var.reference_start += shift3 + 1 - var.reference_end += shift3 - - if var.reference_start == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start, type="del", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_end, type="del", shift=shift) - #if - - # Substitution. 
- if (var.reference_start + 1 == var.reference_end and - var.sample_start + 1 == var.sample_end): - - return RawVar(DNA=DNA, start=var.reference_start + 1, - deleted=s1[var.reference_start], inserted=s2[var.sample_start], - type="subst") - #if - - # Simple InDel. - if var.reference_start + 1 == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="delins") - - # Inversion. - if var.type == extractor.VARIANT_REVERSE_COMPLEMENT: - trim = palinsnoop(s1[var.reference_start:var.reference_end]) - - if trim > 0: # Partial palindrome. - var.reference_end -= trim - var.sample_end -= trim - #if - - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, type="inv") - #if - - # InDel. - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, inserted=s2[var.sample_start:var.sample_end], - type="delins") -#var2RawVar - -def description(s1, s2, DNA=True): - """ - Give an allele description of the change from {s1} to {s2}. - - arg s1: Sequence 1. - type s1: str - arg s2: Sequence 2. - type s2: str - - @returns: A list of RawVar objects, representing the allele. 
- @rval: list(RawVar) - """ - description = [] - - if not DNA: - FS1, FS2 = makeFSTables(1) - longestFSf = max(findFS(s1, s2, FS1), findFS(s1, s2, FS2)) - longestFSr = max(findFS(s2, s1, FS1), findFS(s2, s1, FS2)) - - if longestFSf > longestFSr: - print s1[:longestFSf[1]], s1[longestFSf[1]:] - print s2[:len(s2) - longestFSf[0]], s2[len(s2) - longestFSf[0]:] - s1_part = s1[:longestFSf[1]] - s2_part = s2[:len(s2) - longestFSf[0]] - term = longestFSf[0] - #if - else: - print s1[:len(s1) - longestFSr[0]], s1[len(s1) - longestFSr[0]:] - print s2[:longestFSr[1]], s2[longestFSr[1]:] - s1_part = s1[:len(s1) - longestFSr[0]] - s2_part = s2[:longestFSr[1]] - term = len(s2) - longestFSr[1] - #else - - s1_part = s1 - s2_part = s2 - for variant in extractor.extract(str(s1_part), len(s1_part), - str(s2_part), len(s2_part), 1): - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - if description: - description[-1].term = term + 2 - description[-1].update() - #if - #if - else: - for variant in extractor.extract(str(s1), len(s1), str(s2), len(s2), - 0): - if variant.type != extractor.VARIANT_IDENTITY: - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - # Nothing happened. 
-    if not description:
-        return [RawVar(DNA=DNA)]
-
-    return description
-#description
-
-if __name__ == "__main__":
-    a = "ATAGATGATAGATAGATAGAT"
-    b = "ATAGATGATTGATAGATAGAT"
-    print alleleDescription(description(a, b, DNA=True))
-
-    a = "MAVLWRLSAVCGALGGRALLLRTPVVRPAH"
-    b = "MAVLWRLSAGCGALGGRALLLRTPVVRAH"
-    print alleleDescription(description(a, b, DNA=False))
-
-    a = "MDYSLAAALTLHGHWGLGQVVTDYVHGDALQKAAKAGLLALSALTFAGLCYFNYHDVGICKAVAMLWKL"
-    b = "MDYSLAAALTFMVTGALDKLLLTMFMGMPCRKLPRQGFWHFQL"
-    #print alleleDescription(description(a, b, DNA=False))
-    #print alleleDescription(description(b, a, DNA=False))
-    print "1"
-    extractor.extract(a, len(a), b, len(b), 1)
-    print "2"
-    extractor.extract(b, len(b), a, len(a), 1)
-    print "3"
-
-
-    a = "VVSVLLLGLLPAAYLNPCSAMYYSLAAALTLHGHWGLGQV"
-    b = "VVSVLLLGLLPAAYLNPCSAMDYSLAAALTLHGHWGLGQV"
-    print alleleDescription(description(a, b, DNA=False))
-    print alleleDescription(description(b, a, DNA=False))
-
-    a = "ACGCTCGATCGCTTATAGCATGGGGGGGGGATCTAGCTCTCTCTATAAGATA"
-    b = "ACGCTCGATCGCTTATACCCCCCCCATGCGATCTAGCTCTCTCTATAAGATA"
-    print alleleDescription(description(a, b, DNA=True))
-
-#if
diff --git a/mutalyzer/entrypoints/__init__.py b/mutalyzer/entrypoints/__init__.py
index 36b5ad16..5c6d2cf6 100644
--- a/mutalyzer/entrypoints/__init__.py
+++ b/mutalyzer/entrypoints/__init__.py
@@ -3,6 +3,11 @@ Entry points to Mutalyzer.
 """
 
 
+from __future__ import unicode_literals
+
+import sys
+
+
 class _ReverseProxied(object):
     """
     Wrap the application in this middleware and configure the front-end server
@@ -36,3 +41,17 @@ class _ReverseProxied(object):
         if scheme:
             environ['wsgi.url_scheme'] = scheme
         return self.app(environ, *args, **kwargs)
+
+
+def _cli_string(argument):
+    """
+    Decode a command line argument byte string to unicode using our best
+    guess for the encoding (noop on unicode strings).
+
+    `sys.stdin.encoding` is `None` when standard input is not a terminal
+    (e.g., when run from cron or with redirected input), in which case we
+    fall back to UTF-8 instead of failing on a missing encoding.
+    """
+    if isinstance(argument, unicode):
+        return argument
+    return unicode(argument, encoding=sys.stdin.encoding or 'utf-8')
diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py
index 42929e6b..9b06920d 100644
--- a/mutalyzer/entrypoints/admin.py
+++ b/mutalyzer/entrypoints/admin.py
@@ -3,16 +3,19 @@ Command line interface to Mutalyzer administrative tools.
 """
 
 
+from __future__ import unicode_literals
+
 import argparse
 import json
 import os
+import sys
 
 import alembic.command
 import alembic.config
 from alembic.migration import MigrationContext
-from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm.exc import NoResultFound
 
+from . import _cli_string
 from .. import announce
 from .. import db
 from ..db import session
@@ -96,7 +99,7 @@ def import_mapview(assembly_name_or_alias, mapview_file, group_label):
     try:
         mapping.import_from_mapview_file(assembly, mapview_file, group_label)
     except mapping.MapviewSortError as e:
-        raise UserError(str(e))
+        raise UserError(unicode(e))
 
 
 def import_gene(assembly_name_or_alias, gene):
@@ -184,8 +187,9 @@ def main():
     """
     assembly_parser = argparse.ArgumentParser(add_help=False)
     assembly_parser.add_argument(
-        '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias',
-        default='hg19', help='assembly to import to (default: hg19)')
+        '-a', '--assembly', metavar='ASSEMBLY', type=_cli_string,
+        dest='assembly_name_or_alias', default='hg19',
+        help='assembly to import to (default: hg19)')
 
     parser = argparse.ArgumentParser(
         description='Mutalyzer administrative tools.')
@@ -227,7 +231,7 @@ def main():
         'mapview_file', metavar='FILE', type=argparse.FileType('r'),
         help='file from NCBI mapview (example: seq_gene.md), see note below')
     p.add_argument(
-        'group_label', metavar='GROUP_LABEL',
+        'group_label', metavar='GROUP_LABEL', type=_cli_string,
         help='use only entries with this group label (example: '
         'GRCh37.p2-Primary Assembly)')
 
@@ -241,7 +245,7 @@ def main():
         ' (i.e., NCBI mapview).')
p.set_defaults(func=import_gene) p.add_argument( - 'gene', metavar='GENE_SYMBOL', + 'gene', metavar='GENE_SYMBOL', type=_cli_string, help='gene to import all transcript mappings for from the UCSC ' 'database (example: TTN)') @@ -255,7 +259,7 @@ def main(): 'usual source (i.e., NCBI mapview).') p.set_defaults(func=import_reference) p.add_argument( - 'reference', metavar='ACCESSION', + 'reference', metavar='ACCESSION', type=_cli_string, help='genomic reference to import all genes from (example: ' 'NC_012920.1)') @@ -272,10 +276,10 @@ def main(): description=set_announcement.__doc__.split('\n\n')[0]) p.set_defaults(func=set_announcement) p.add_argument( - 'body', metavar='ANNOUNCEMENT', + 'body', metavar='ANNOUNCEMENT', type=_cli_string, help='announcement text to show to the user') p.add_argument( - '--url', metavar='URL', dest='url', + '--url', metavar='URL', dest='url', type=_cli_string, help='URL to more information on the announcement') # Subparser 'announcement unset'. @@ -290,10 +294,10 @@ def main(): description=sync_cache.__doc__.split('\n\n')[0], epilog='Intended use is to run daily from cron.') p.add_argument( - 'wsdl_url', metavar='WSDL_URL', + 'wsdl_url', metavar='WSDL_URL', type=_cli_string, help='location of the remote WSDL description') p.add_argument( - 'url_template', metavar='URL_TEMPLATE', + 'url_template', metavar='URL_TEMPLATE', type=_cli_string, help='URL for remote downloads, in which the filename is to be ' 'substituted for {file}') p.add_argument( @@ -313,7 +317,7 @@ def main(): '--destructive', dest='destructive', action='store_true', help='delete any existing tables and data') p.add_argument( - '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', + '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', type=_cli_string, dest='alembic_config_path', help='path to Alembic configuration file') p.set_defaults(func=setup_database) @@ -323,7 +327,7 @@ def main(): args.func(**{k: v for k, v in vars(args).items() if k not in ('func', 'subcommand')}) 
except UserError as e: - parser.error(str(e)) + parser.error(unicode(e)) if __name__ == '__main__': diff --git a/mutalyzer/entrypoints/batch_processor.py b/mutalyzer/entrypoints/batch_processor.py index 286c4116..ae3c2945 100644 --- a/mutalyzer/entrypoints/batch_processor.py +++ b/mutalyzer/entrypoints/batch_processor.py @@ -6,12 +6,13 @@ Mutalyzer batch processor. """ +from __future__ import unicode_literals + import argparse import signal import sys import time -from .. import config from .. import db from .. import Scheduler diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py index d123482f..6717161d 100644 --- a/mutalyzer/entrypoints/mutalyzer.py +++ b/mutalyzer/entrypoints/mutalyzer.py @@ -5,8 +5,12 @@ Mutalyzer command-line name checker. """ +from __future__ import unicode_literals + import argparse +import sys +from . import _cli_string from .. import describe from .. import output from .. import variantchecker @@ -114,7 +118,7 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer command-line name checker.') parser.add_argument( - 'description', metavar='DESCRIPTION', + 'description', metavar='DESCRIPTION', type=_cli_string, help='variant description to run the name checker on') args = parser.parse_args() diff --git a/mutalyzer/entrypoints/service_json.py b/mutalyzer/entrypoints/service_json.py index 25ff8bbf..5e5d93d0 100644 --- a/mutalyzer/entrypoints/service_json.py +++ b/mutalyzer/entrypoints/service_json.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . 
import _cli_string, _ReverseProxied from ..config import settings from ..services import json @@ -57,9 +59,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer HTTP/RPC+JSON webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8082, help='port to listen on (default: 8082)') diff --git a/mutalyzer/entrypoints/service_soap.py b/mutalyzer/entrypoints/service_soap.py index 6b630ad6..8179faa3 100644 --- a/mutalyzer/entrypoints/service_soap.py +++ b/mutalyzer/entrypoints/service_soap.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . 
import _cli_string, _ReverseProxied from ..config import settings from ..services import soap @@ -58,9 +60,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer SOAP webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8081, help='port to listen on (default: 8081)') diff --git a/mutalyzer/entrypoints/website.py b/mutalyzer/entrypoints/website.py index a62e3bb3..f387b70f 100644 --- a/mutalyzer/entrypoints/website.py +++ b/mutalyzer/entrypoints/website.py @@ -39,9 +39,12 @@ also serve the static files. """ +from __future__ import unicode_literals + import argparse +import sys -from . import _ReverseProxied +from . import _cli_string, _ReverseProxied from ..config import settings from .. import website @@ -66,9 +69,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer website.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8089, help='port to listen on (default: 8080)') diff --git a/mutalyzer/grammar.py b/mutalyzer/grammar.py index 0e65ec57..8f231bf5 100644 --- a/mutalyzer/grammar.py +++ b/mutalyzer/grammar.py @@ -19,6 +19,8 @@ The grammar is described in [3]. 
""" +from __future__ import unicode_literals + from pyparsing import * @@ -48,7 +50,7 @@ class Grammar(): ########################################################################## # BNF: Name -> ([a-z] | [a-Z] | [0-9])+ - Name = Word(alphanums, min=1) + Name = Word(unicode(alphanums), min=1) # BNF: Nt -> `a' | `c' | `g' | `u' | `A' | `C' | `G' | `T' | `U' #Nt = Word('acgtuACGTU', exact=1) @@ -66,7 +68,7 @@ class Grammar(): NtString = Combine(OneOrMore(Nt)) # BNF: Number -> [0-9]+ - Number = Word(nums) + Number = Word(unicode(nums)) ########################################################################## # Reference sequences @@ -79,7 +81,7 @@ class Grammar(): ProtIso = Suppress('_i') + Number('ProtIso') # BNF: GeneName -> ([a-Z] | [0-9] | `-')+ - GeneName = Word(alphanums + '-', min=1) + GeneName = Word(unicode(alphanums) + '-', min=1) # BNF: GeneSymbol -> `(' Name (TransVar | ProtIso)? `)' GeneSymbol = Suppress('(') + Group(GeneName('GeneSymbol') + \ @@ -94,11 +96,11 @@ class Grammar(): # BNF: AccNo -> ([a-Z] Number `_')+ Version? AccNo = NotAny('LRG_') + \ - Combine(Word(alphas + '_') + Number)('RefSeqAcc') + \ + Combine(Word(unicode(alphas) + '_') + Number)('RefSeqAcc') + \ Optional(Version) # BNF: UD -> `UD_' [a-Z]+ (`_' Number)+ - UD = Combine('UD_' + Word(alphas) + OneOrMore('_' + Number))('RefSeqAcc') + UD = Combine('UD_' + Word(unicode(alphas)) + OneOrMore('_' + Number))('RefSeqAcc') # BNF: LRGTranscriptID -> `t' [0-9]+ LRGTranscriptID = Suppress('t') + Number('LRGTranscriptID') @@ -467,7 +469,7 @@ class Grammar(): the input where the error occurred (and return None). @arg variant: The input string that needs to be parsed. - @type variant: string + @type variant: unicode @return: The parse tree containing the parse results, or None in case of a parsing error. 
@@ -480,12 +482,12 @@ class Grammar(): return self.Var.parseString(variant, parseAll=True) # Todo: check .dump() except ParseException as err: - print err.line - print " "*(err.column-1) + "^" - print err + #print err.line + #print " "*(err.column-1) + "^" + #print err # Log parse error and the position where it occurred. - self._output.addMessage(__file__, 4, 'EPARSE', str(err)) - pos = int(str(err).split(':')[-1][:-1]) - 1 + self._output.addMessage(__file__, 4, 'EPARSE', unicode(err)) + pos = int(unicode(err).split(':')[-1][:-1]) - 1 self._output.addOutput('parseError', variant) self._output.addOutput('parseError', pos * ' ' + '^') return None diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 693294d3..e5bd96db 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -10,11 +10,12 @@ update the database with this information. """ +from __future__ import unicode_literals + from collections import defaultdict from itertools import groupby from operator import attrgetter, itemgetter -from Bio.Seq import reverse_complement import MySQLdb from mutalyzer.db import session @@ -24,6 +25,7 @@ from mutalyzer.models import SoapMessage, Mapping, Transcript from mutalyzer.output import Output from mutalyzer import Crossmap from mutalyzer import Retriever +from mutalyzer import util class MapviewSortError(Exception): @@ -40,28 +42,29 @@ def _construct_change(var, reverse=False): @type reverse: bool @return: Description of mutation (without reference and positions). - @rtype: string + @rtype: unicode """ + # Note that the pyparsing parse tree yields `str('')` for nonexisting + # attributes, so we wrap the optional attributes in `unicode()`. 
if reverse: - # todo: if var.Arg1 is unicode, this crashes try: - arg1 = str(int(var.Arg1)) + arg1 = unicode(int(var.Arg1)) except ValueError: - arg1 = reverse_complement(str(var.Arg1) or '') + arg1 = util.reverse_complement(unicode(var.Arg1)) try: - arg2 = str(int(var.Arg2)) + arg2 = unicode(int(var.Arg2)) except ValueError: - arg2 = reverse_complement(str(var.Arg2) or '') + arg2 = util.reverse_complement(unicode(var.Arg2)) else: - arg1 = var.Arg1 - arg2 = var.Arg2 + arg1 = unicode(var.Arg1) + arg2 = unicode(var.Arg2) def parse_sequence(seq): if not seq.Sequence: raise NotImplementedError('Only explicit sequences are supported ' 'for insertions.') if reverse: - return reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if var.MutationType == 'subst': @@ -72,7 +75,7 @@ def _construct_change(var, reverse=False): seqs = reversed(var.SeqList) else: seqs = var.SeqList - insertion = '[' + ';'.join(str(parse_sequence(seq)) + insertion = '[' + ';'.join(parse_sequence(seq) for seq in seqs) + ']' else: insertion = parse_sequence(var.Seq) @@ -161,11 +164,11 @@ class Converter(object) : Get data from database. @arg acc: NM_ accession number (without version) - @type acc: string + @type acc: unicode @arg version: version number @type version: integer @kwarg selector: Optional gene symbol selector. - @type selector: str + @type selector: unicode @kwarg selector_version: Optional transcript version selector. @type selector_version: int """ @@ -269,7 +272,7 @@ class Converter(object) : @arg Loc: A location in either I{g.} or I{c.} notation @type Loc: object @arg Type: The reference type - @type Type: string + @type Type: unicode @returns: triple: 0. Main coordinate in I{c.} notation 1. Offset coordinate in I{c.} notation @@ -359,7 +362,7 @@ class Converter(object) : available. 
@arg accNo: transcript (NM_) accession number (with or without version) - @type accNo: string + @type accNo: unicode @return: transcription start, transcription end and CDS stop @rtype: triple @@ -381,7 +384,7 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: The full NM accession number (including version) - @type accNo: string + @type accNo: unicode @return: T ; ClassSerializer object with the types trans_start, trans_stop and CDS_stop @@ -404,9 +407,9 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: transcript (NM_) accession number (with version?) - @type accNo: string + @type accNo: unicode @arg mutation: the 'mutation' (e.g. c.123C>T) - @type mutation: string + @type mutation: unicode @return: ClassSerializer object @rtype: object @@ -493,10 +496,10 @@ class Converter(object) : Converts a complete HGVS I{c.} notation into a chromosomal notation. @arg variant: The variant in HGVS I{c.} notation - @type variant: string + @type variant: unicode @return: var_in_g ; The variant in HGVS I{g.} notation - @rtype: string + @rtype: unicode """ if self._parseInput(variant): acc = self.parseTree.RefSeqAcc @@ -528,7 +531,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 3, 'ENOTIMPLEMENTED', - str(e)) + unicode(e)) return None if self.mapping.orientation == 'forward': @@ -568,14 +571,14 @@ class Converter(object) : @arg positions: Positions in c. notation to convert. @type positions: list @arg reference: Transcript reference. - @type reference: string + @type reference: unicode @kwarg version: Transcript reference version. If omitted, '0' is assumed. - @type version: string + @type version: unicode @return: Chromosome name, orientation (+ or -), and converted positions. 
- @rtype: tuple(string, string, list) + @rtype: tuple(unicode, unicode, list) This only works for positions on transcript references in c. notation. """ @@ -617,10 +620,10 @@ class Converter(object) : def correctChrVariant(self, variant) : """ @arg variant: - @type variant: string + @type variant: unicode @return: variant ; - @rtype: string + @rtype: unicode """ #Pre split check @@ -651,12 +654,12 @@ class Converter(object) : def chrom2c(self, variant, rt, gene=None): """ @arg variant: a variant description - @type variant: string + @type variant: unicode @arg rt: the return type - @type rt: string + @type rt: unicode @kwarg gene: Optional gene name. If given, return variant descriptions on all transcripts for this gene. - @type gene: string + @type gene: unicode @return: HGVS_notatations ; @rtype: dictionary or list @@ -751,7 +754,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 4, - "ENOTIMPLEMENTEDERROR", str(e)) + "ENOTIMPLEMENTEDERROR", unicode(e)) return None startp = self.crossmap.tuple2string((cmap.startmain, cmap.startoffset)) @@ -786,6 +789,8 @@ class Converter(object) : #Converter +# Todo: This seems broken at the moment. +# Todo: Correct handling of string encodings. def import_from_ucsc_by_gene(assembly, gene): """ Import transcript mappings for a gene from the UCSC. @@ -878,6 +883,7 @@ def import_from_reference(assembly, reference): session.commit() +# Todo: File must be opened with the correct encoding. def import_from_mapview_file(assembly, mapview_file, group_label): """ Import transcript mappings from an NCBI mapview file. diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 24a340fe..bc9bf5a0 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -8,6 +8,8 @@ from the Spyne model classes. 
""" +from __future__ import unicode_literals + from spyne.model.primitive import Integer, Boolean, DateTime, Unicode from spyne.model.binary import ByteArray from spyne.model.complex import ComplexModel, Array diff --git a/mutalyzer/mutator.py b/mutalyzer/mutator.py index 8047d932..4a4b0a2d 100644 --- a/mutalyzer/mutator.py +++ b/mutalyzer/mutator.py @@ -12,12 +12,11 @@ The original as well as the mutated string are stored here. """ +from __future__ import unicode_literals + from collections import defaultdict from Bio import Restriction -from Bio.Seq import Seq -from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA -from Bio.Seq import reverse_complement from mutalyzer import util @@ -46,7 +45,7 @@ class Mutator(): Initialise the instance with the original sequence. @arg orig: The original sequence before mutation. - @type orig: str + @type orig: Bio.Seq.Seq @arg output: The output object. @type output: mutalyzer.Output.Output """ @@ -57,6 +56,8 @@ class Mutator(): self._output = output self.orig = orig + # Note that we don't need to create a copy here, since mutation + # operations are not in place (`self._mutate`). self.mutated = orig #__init__ @@ -72,7 +73,7 @@ class Mutator(): @rtype: dict """ analysis = Restriction.Analysis(self._restriction_batch, sequence) - return dict((str(k), len(v)) for k, v in analysis.with_sites().items()) + return dict((unicode(k), len(v)) for k, v in analysis.with_sites().items()) #_restriction_count def _counts_diff(self, counts1, counts2): @@ -109,10 +110,10 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode @return: Visualisation. - @rtype: str + @rtype: unicode """ loflank = self.orig[max(pos1 - VIS_FLANK_LENGTH, 0):pos1] roflank = self.orig[pos2:pos2 + VIS_FLANK_LENGTH] @@ -338,7 +339,7 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. 
- @type ins: str + @type ins: unicode """ correct = 1 if pos1 == pos2 else 0 self.mutated = (self.mutated[:self.shift(pos1 + 1) - 1] + @@ -375,7 +376,7 @@ class Mutator(): @arg pos: Interbase position where the insertion should take place. @type pos: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['insertion between %i and %i' % (pos, pos + 1)] visualisation.extend(self._visualise(pos, pos, ins)) @@ -394,7 +395,7 @@ class Mutator(): @arg pos2: Last nucleotide of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['delins from %i to %i' % (pos1, pos2)] visualisation.extend(self._visualise(pos1 - 1, pos2, ins)) @@ -410,7 +411,7 @@ class Mutator(): @arg pos: Position of the substitution. @type pos: int @arg nuc: Substituted nucleotide. - @type nuc: str + @type nuc: unicode """ visualisation = ['substitution at %i' % pos] visualisation.extend(self._visualise(pos - 1, pos, nuc)) @@ -428,14 +429,13 @@ class Mutator(): @arg pos2: Last nucleotide of the inverted sequence. @type pos2: int """ + sequence = util.reverse_complement(unicode(self.orig[pos1 - 1:pos2])) + visualisation = ['inversion between %i and %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2]))) + visualisation.extend(self._visualise(pos1 - 1, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2])) + self._mutate(pos1 - 1, pos2, sequence) #inversion def duplication(self, pos1, pos2): @@ -447,11 +447,12 @@ class Mutator(): @arg pos2: Last nucleotide of the duplicated sequence. 
@type pos2: int """ + sequence = unicode(self.orig[pos1 - 1:pos2]) + visualisation = ['duplication from %i to %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos2, pos2, self.orig[pos1 - 1:pos2])) + visualisation.extend(self._visualise(pos2, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos1 - 1, self.orig[pos1 - 1:pos2]) + self._mutate(pos1 - 1, pos1 - 1, sequence) #duplication #Mutator diff --git a/mutalyzer/output.py b/mutalyzer/output.py index 3ca1c8a7..fbec8418 100644 --- a/mutalyzer/output.py +++ b/mutalyzer/output.py @@ -23,6 +23,9 @@ Public classes: """ +from __future__ import unicode_literals + +import io import time from mutalyzer import util @@ -71,12 +74,13 @@ class Output() : - _warnings ; Initialised to 0. @arg instance: The filename of the module that created this object - @type instance: string + @type instance: unicode """ self._outputData = {} self._messages = [] self._instance = util.nice_filename(instance) - self._loghandle = open(settings.LOG_FILE, "a+") + self._loghandle = io.open(settings.LOG_FILE, mode='a+', + encoding='utf-8') self._errors = 0 self._warnings = 0 #__init__ @@ -147,7 +151,7 @@ class Output() : - _messages ; The messages list. @arg errorcode: The error code to filter on - @type errorcode: string + @type errorcode: unicode @return: A filtered list @rtype: list @@ -194,7 +198,7 @@ class Output() : - _outputData ; The output dictionary. @arg name: Name of a node in the output dictionary - @type name: string + @type name: unicode @arg data: The data to be stored at this node @type data: object """ @@ -258,7 +262,7 @@ class Output() : - Number of errors - Number of warnings - Summary - @rtype: integer, integer, string + @rtype: integer, integer, unicode """ e_s = 's' w_s = 's' @@ -297,13 +301,13 @@ class Message() : - description ; A description of the message. 
@arg origin: Name of the module creating this object - @type origin: string + @type origin: unicode @arg level: Importance of the message @type level: integer @arg code: The error code of the message - @type code: string + @type code: unicode @arg description: A description of the message - @type description: string + @type description: unicode """ self.origin = origin self.level = level @@ -316,17 +320,17 @@ class Message() : (self.origin, self.level, self.code, self.description) #__repr__ - def __str__(self): + def __unicode__(self): return '%s (%s): %s' % \ (self.named_level(), self.origin, self.description) - #__str__ + #__unicode__ def named_level(self): """ Get message log level as readable string. @return: A readable description of the log level. - @rtype: string + @rtype: unicode """ if self.level == 0: return "Debug" diff --git a/mutalyzer/parsers/__init__.py b/mutalyzer/parsers/__init__.py index 3e1bd90d..6b3f4334 100644 --- a/mutalyzer/parsers/__init__.py +++ b/mutalyzer/parsers/__init__.py @@ -1,3 +1,6 @@ """ Parsers for GenRecord objects. """ + + +from __future__ import unicode_literals diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 867fa78f..24754598 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -4,6 +4,9 @@ mutalyzer GenRecord. Record populated with data from a GenBank file. """ +from __future__ import unicode_literals + +import codecs import re import bz2 from itertools import izip_longest @@ -41,7 +44,7 @@ class tempGene(): - cdsList ; CDS list (including internal splice sites). 
@arg name: Gene name - @type name: string + @type name: unicode """ self.name = name @@ -75,8 +78,8 @@ class GBparser(): ret = [] - if not str(location.start).isdigit() or \ - not str(location.end).isdigit() : + if not unicode(location.start).isdigit() or \ + not unicode(location.end).isdigit() : return None #if @@ -99,8 +102,8 @@ class GBparser(): ret = [] - if not str(locationList.location.start).isdigit() or \ - not str(locationList.location.end).isdigit() : + if not unicode(locationList.location.start).isdigit() or \ + not unicode(locationList.location.end).isdigit() : return None #if @@ -128,10 +131,10 @@ class GBparser(): @arg transcriptAcc: Accession number of the transcript for which we want to find the protein - @type transcriptAcc: string + @type transcriptAcc: unicode @return: Accession number of a protein or None if nothing can be found - @rtype: string + @rtype: unicode """ link = queries.get_transcript_protein_link(transcriptAcc) if link is not None: @@ -146,7 +149,7 @@ class GBparser(): finally: handle.close() - transcriptGI = result["IdList"][0] + transcriptGI = unicode(result["IdList"][0]) handle = Entrez.elink(dbfrom = "nucleotide", db = "protein", id = transcriptGI) @@ -162,11 +165,11 @@ class GBparser(): queries.update_transcript_protein_link(transcriptAcc) return None - proteinGI = result[0]["LinkSetDb"][0]["Link"][0]["Id"] + proteinGI = unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"]) handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text') - proteinAcc = handle.read().split('.')[0] + proteinAcc = unicode(handle.read()).split('.')[0] handle.close() queries.update_transcript_protein_link(transcriptAcc, proteinAcc) @@ -179,7 +182,7 @@ class GBparser(): sentence from another. The index of the last word is counted backwards. @arg sentences: A list of sentences. 
- @type sentences: list of strings + @type sentences: list of unicode strings @return: The indices of the words where sentences start to differ, both are -1 when no mismatches are found. @@ -217,7 +220,7 @@ class GBparser(): [-1:1] yields the empty list. """ # Create lists of words - lists = map(str.split, sentences) + lists = [s.split() for s in sentences] try: forward, reverse = [next(i for i, v in @@ -239,7 +242,7 @@ class GBparser(): @arg locus: The locus object on which the transfer should be performed @type locus: locus object @arg key: The name of the variable that should be transferred - @type key: string + @type key: unicode """ if locus.qualifiers.has_key(key) : @@ -315,7 +318,7 @@ class GBparser(): @arg locusList: A list of loci @type locusList: list @arg tagName: Name of the tag to be checked - @type tagName: string + @type tagName: unicode """ tags = [] @@ -476,13 +479,14 @@ class GBparser(): Create a GenRecord.Record from a GenBank file @arg filename: The full path to the compressed GenBank file - @type filename: string + @type filename: unicode @return: A GenRecord.Record instance @rtype: object (record) """ # first create an intermediate genbank record with BioPython file_handle = bz2.BZ2File(filename, "r") + file_handle = codecs.getreader('utf-8')(file_handle) biorecord = SeqIO.read(file_handle, "genbank") file_handle.close() diff --git a/mutalyzer/parsers/lrg.py b/mutalyzer/parsers/lrg.py index d3624360..0336d106 100644 --- a/mutalyzer/parsers/lrg.py +++ b/mutalyzer/parsers/lrg.py @@ -21,6 +21,8 @@ added in python2.5. Its main strengths are speed and readability [pythonesque]. 
""" +from __future__ import unicode_literals + import xml.dom.minidom from Bio.Seq import Seq from Bio.Alphabet import IUPAC @@ -54,14 +56,14 @@ def _get_content(data, refname): @arg data: a minidom object @type data: object @arg refname: the name of a member of the minidom object - @type refname: string + @type refname: unicode - @return: The UTF-8 content of the textnode or an emtpy string + @return: The content of the textnode or an emtpy string @rtype: string """ temp = data.getElementsByTagName(refname) if temp: - return temp[0].lastChild.data.encode("utf8") + return temp[0].lastChild.data else: return "" #_get_content @@ -75,14 +77,14 @@ def _attr2dict(attr): @type attr: object @return: A dictionary with pairing of node-attribute names and values. - Integer string values are converted to integers. String values are converted - to UTF-8 + Integer string values are converted to integers. @rtype: dictionary """ ret = {} for key, value in attr.items(): - value = value.isdigit() and int(value) or value.encode("utf-8") - ret[key.encode("utf-8")] = value + if value.isdigit(): + value = int(value) + ret[key] = value return ret #_attr2dict @@ -166,7 +168,7 @@ def create_record(data): for tData in fixed.getElementsByTagName("transcript"): # iterate over the transcripts in the fixed section. # get the transcript from the updatable section and combine results - transcriptName = tData.getAttribute("name").encode("utf8")[1:] + transcriptName = tData.getAttribute("name")[1:] transcription = [t for t in gene.transcriptList if t.name == transcriptName][0] #TODO?: swap with gene.findLocus diff --git a/mutalyzer/redisclient.py b/mutalyzer/redisclient.py index ec9e6050..58acd7ca 100644 --- a/mutalyzer/redisclient.py +++ b/mutalyzer/redisclient.py @@ -18,6 +18,8 @@ simple and just use one global connection pool as created by `StrictRedis`. 
""" +from __future__ import unicode_literals + import redis from mutalyzer.config import settings @@ -37,7 +39,9 @@ class LazyClient(util.LazyObject): import mockredis self._wrapped = mockredis.MockRedis(strict=True) else: - self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI) + self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI, + decode_responses=True, + charset='utf-8') #: Global :class:`LazyClient` instance. Use this for all communication with diff --git a/mutalyzer/services/__init__.py b/mutalyzer/services/__init__.py index 05b3d031..81887d7c 100644 --- a/mutalyzer/services/__init__.py +++ b/mutalyzer/services/__init__.py @@ -1,3 +1,6 @@ """ Services (RPC) for Mutalyzer. """ + + +from __future__ import unicode_literals diff --git a/mutalyzer/services/json.py b/mutalyzer/services/json.py index c35b7929..89c6a26e 100644 --- a/mutalyzer/services/json.py +++ b/mutalyzer/services/json.py @@ -3,6 +3,8 @@ Mutalyzer web service HTTP/RPC with JSON response payloads. """ +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.http import HttpRpc from spyne.protocol.json import JsonDocument diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index ba29c008..004fa869 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -9,6 +9,8 @@ Mutalyzer RPC services. 
""" +from __future__ import unicode_literals + from spyne.decorator import srpc from spyne.service import ServiceBase from spyne.model.primitive import Integer, Boolean, DateTime, Unicode @@ -16,16 +18,15 @@ from spyne.model.complex import Array from spyne.model.fault import Fault import os import socket -from cStringIO import StringIO -import tempfile -from operator import itemgetter, attrgetter +from io import BytesIO +from operator import attrgetter from sqlalchemy.orm.exc import NoResultFound import mutalyzer from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import (Assembly, Chromosome, BatchJob, - BatchQueueItem, TranscriptMapping) +from mutalyzer.db.models import (Assembly, BatchJob, BatchQueueItem, + TranscriptMapping) from mutalyzer.output import Output from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync @@ -103,7 +104,9 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - batch_file = StringIO(''.join(data)) + batch_file = BytesIO() + for d in data: + batch_file.write(d) job, columns = file_instance.parseBatchFile(batch_file) batch_file.close() @@ -144,7 +147,7 @@ class MutalyzerService(ServiceBase): @arg job_id: Batch job identifier. - @return: Batch job result file. + @return: Batch job result file (UTF-8, base64 encoded). 
""" left = BatchQueueItem.query.join(BatchJob).filter_by(result_id=job_id).count() @@ -152,7 +155,7 @@ class MutalyzerService(ServiceBase): raise Fault('EBATCHNOTREADY', 'Batch job result is not yet ready.') filename = 'batch-job-%s.txt' % job_id - handle = open(os.path.join(settings.CACHE_DIR, filename)) + handle = open(os.path.join(settings.CACHE_DIR, filename), 'rb') return handle @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, Boolean, @@ -804,23 +807,18 @@ class MutalyzerService(ServiceBase): result.sourceGi = O.getIndexedOutput('source_gi', 0) result.molecule = O.getIndexedOutput('molecule', 0) - # We force the results to strings here, because some results - # may be of type Bio.Seq.Seq which spyne doesn't like. - # - # todo: We might have to also do this elsewhere. - - result.original = str(O.getIndexedOutput("original", 0)) - result.mutated = str(O.getIndexedOutput("mutated", 0)) + result.original = O.getIndexedOutput("original", 0) + result.mutated = O.getIndexedOutput("mutated", 0) - result.origMRNA = str(O.getIndexedOutput("origMRNA", 0)) - result.mutatedMRNA = str(O.getIndexedOutput("mutatedMRNA", 0)) + result.origMRNA = O.getIndexedOutput("origMRNA", 0) + result.mutatedMRNA = O.getIndexedOutput("mutatedMRNA", 0) - result.origCDS = str(O.getIndexedOutput("origCDS", 0)) - result.newCDS = str(O.getIndexedOutput("newCDS", 0)) + result.origCDS = O.getIndexedOutput("origCDS", 0) + result.newCDS = O.getIndexedOutput("newCDS", 0) - result.origProtein = str(O.getIndexedOutput("oldprotein", 0)) - result.newProtein = str(O.getIndexedOutput("newprotein", 0)) - result.altProtein = str(O.getIndexedOutput("altProtein", 0)) + result.origProtein = O.getIndexedOutput("oldprotein", 0) + result.newProtein = O.getIndexedOutput("newprotein", 0) + result.altProtein = O.getIndexedOutput("altProtein", 0) result.chromDescription = \ O.getIndexedOutput("genomicChromDescription", 0) @@ -995,7 +993,7 @@ class MutalyzerService(ServiceBase): transcript.CM.info() 
cds_start = 1 - t.cTransEnd = str(t.exons[-1].cStop) + t.cTransEnd = unicode(t.exons[-1].cStop) t.gTransEnd = t.exons[-1].gStop t.chromTransEnd = GenRecordInstance.record.toChromPos( t.gTransEnd) @@ -1009,15 +1007,15 @@ class MutalyzerService(ServiceBase): t.name = '%s_v%s' % (gene.name, transcript.name) t.id = transcript.transcriptID t.product = transcript.transcriptProduct - t.cTransStart = str(trans_start) + t.cTransStart = unicode(trans_start) t.gTransStart = transcript.CM.x2g(trans_start, 0) t.chromTransStart = GenRecordInstance.record.toChromPos( t.gTransStart) - t.cCDSStart = str(cds_start) + t.cCDSStart = unicode(cds_start) t.gCDSStart = transcript.CM.x2g(cds_start, 0) t.chromCDSStart = GenRecordInstance.record.toChromPos( t.gCDSStart) - t.cCDSStop = str(cds_stop) + t.cCDSStop = unicode(cds_stop) t.gCDSStop = transcript.CM.x2g(cds_stop, 0) t.chromCDSStop = GenRecordInstance.record.toChromPos(t.gCDSStop) t.locusTag = transcript.locusTag @@ -1045,7 +1043,7 @@ class MutalyzerService(ServiceBase): """ Upload a genbank file. - @arg data: Genbank file (base64 encoded). + @arg data: Genbank file (UTF-8, base64 encoded). @return: UD accession number for the uploaded genbank file. """ output = Output(__file__) @@ -1067,7 +1065,7 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - ud = retriever.uploadrecord(''.join(data)) + ud = retriever.uploadrecord(b''.join(data)) output.addMessage(__file__, -1, 'INFO', 'Finished processing uploadGenBankLocalFile()') @@ -1075,7 +1073,7 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). if not ud: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), output.getMessages())) + + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return ud @@ -1112,7 +1110,7 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). 
if not UD: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), O.getMessages())) + + '\n'.join(map(lambda m: unicode(m), O.getMessages())) raise Exception(error) return UD @@ -1281,7 +1279,7 @@ class MutalyzerService(ServiceBase): messages = output.getMessages() if messages: error = 'The request could not be completed\n' + \ - '\n'.join(map(lambda m: str(m), output.getMessages())) + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return descriptions diff --git a/mutalyzer/services/soap.py b/mutalyzer/services/soap.py index a7d7b001..d8f28407 100644 --- a/mutalyzer/services/soap.py +++ b/mutalyzer/services/soap.py @@ -3,6 +3,8 @@ Mutalyzer SOAP/1.1 web service. """ +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.soap import Soap11 diff --git a/mutalyzer/stats.py b/mutalyzer/stats.py index bb1dec57..e7228cdf 100644 --- a/mutalyzer/stats.py +++ b/mutalyzer/stats.py @@ -17,6 +17,8 @@ module much more. """ +from __future__ import unicode_literals + import time from mutalyzer.redisclient import client @@ -36,7 +38,8 @@ def increment_counter(counter): pipe.incr('counter:%s:total' % counter) for label, bucket, expire in INTERVALS: - key = 'counter:%s:%s:%s' % (counter, label, time.strftime(bucket)) + key = 'counter:%s:%s:%s' % (counter, label, + unicode(time.strftime(bucket))) pipe.incr(key) # It's safe to just keep on expiring the counter, even if it already diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index e5465e1e..a1a1b7f9 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -3,6 +3,8 @@ Synchronizing the reference file cache with other Mutalyzer instances. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() from datetime import datetime, timedelta @@ -86,7 +88,7 @@ class CacheSync(object): or later. :arg remote_wsdl: The url of the remote SOAP WSDL description. 
- :type remote_wsdl: str + :type remote_wsdl: unicode :arg created_since: Only entries with this creation date or later are returned. :type created_since: datatime.datetime @@ -111,11 +113,11 @@ class CacheSync(object): 1: 'forward', 2: 'reverse'} - entry_dict = {'name': str(entry.name), - 'hash': str(entry.hash), + entry_dict = {'name': entry.name, + 'hash': entry.hash, 'created': entry.created} for attribute in ('gi', 'chromosomeName', 'url', 'cached'): - entry_dict[attribute] = str(entry[attribute]) \ + entry_dict[attribute] = entry[attribute] \ if attribute in entry else None for attribute in ('chromosomeStart', 'chromosomeStop'): entry_dict[attribute] = int(entry[attribute]) \ @@ -131,9 +133,9 @@ class CacheSync(object): Download a remote file located at `url` and store it as `name`. :arg name: Name to store the file under. - :type name: str + :type name: unicode :arg url: Url to the remote file. - :type url: str + :type url: unicode """ if not re.match('^[\da-zA-Z\._-]+$', name): return @@ -160,10 +162,10 @@ class CacheSync(object): (14, 3) :arg remote_wsdl: The url of the remote SOAP WSDL description. - :type remote_wsdl: str + :type remote_wsdl: unicode :arg url_template: Formatting string containing a ``{file}`` occurence, see example usage above. - :string url_template: str + :string url_template: unicode :arg days: Only remote entries added this number of days ago or later are considered. :type days: int diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 936f0812..4017b57a 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -19,20 +19,80 @@ General utility functions. """ +from __future__ import unicode_literals + from functools import wraps import inspect from itertools import izip_longest import math import operator -import os import sys import time -from Bio.Alphabet import IUPAC -import Bio.Seq from Bio.SeqUtils import seq3 +# Taken from BioPython. 
+AMBIGUOUS_DNA_COMPLEMENT = { + 'A': 'T', + 'C': 'G', + 'G': 'C', + 'T': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} +AMBIGUOUS_RNA_COMPLEMENT = { + 'A': 'U', + 'C': 'G', + 'G': 'C', + 'U': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} + + +def _make_translation_table(complement_mapping): + before = complement_mapping.keys() + before += [b.lower() for b in before] + after = complement_mapping.values() + after += [b.lower() for b in after] + return {ord(k): v for k, v in zip(before, after)} + + +_dna_complement_table = _make_translation_table(AMBIGUOUS_DNA_COMPLEMENT) +_rna_complement_table = _make_translation_table(AMBIGUOUS_RNA_COMPLEMENT) + + +def reverse_complement(sequence): + """ + Reverse complement of a sequence represented as unicode string. + """ + if 'U' in sequence or 'u' in sequence: + table = _rna_complement_table + else: + table = _dna_complement_table + + return ''.join(reversed(sequence.translate(table))) + + def grouper(iterable, n=2, fillvalue=None): """ Make an iterator that takes {n} elements at a time from {iterable}, using @@ -115,17 +175,17 @@ def splice(s, splice_sites): 'bcdghijklmnoptuvw' @arg s: A DNA sequence. - @type s: string + @type s: any sequence type @arg splice_sites: A list of even length of integers. @type splice_sites: list @return: The concatenation of slices from the sequence that is present in the GenBank record. - @rtype: string + @rtype: type(s) @todo: Assert length of splice_sites is even. 
""" - transcript = '' + transcript = s[:0] for acceptor, donor in grouper(splice_sites): transcript += s[acceptor - 1:donor] @@ -146,7 +206,7 @@ def __nsplice(string, splice_sites, CDS, orientation) : @todo: documentation """ - transcript = "" + transcript = string[:0] if orientation == 1 : for i in range(0, len(splice_sites), 2) : if CDS[0] >= splice_sites[i] and CDS[0] <= splice_sites[i + 1] : @@ -212,14 +272,15 @@ def format_range(first, last): @type last: integer @return: {first}_{last} in case of a real range, {first} otherwise. - @rtype: string + @rtype: unicode """ if first == last: - return str(first) + return unicode(first) return '%i_%i' % (first, last) #format_range + def roll_(s, start, end) : """ Different (and easier) way of finding the variability of a substring. @@ -239,6 +300,7 @@ def roll_(s, start, end) : return j, i #roll + def roll(s, first, last): """ Determine the variability of a variant by looking at cyclic @@ -254,7 +316,7 @@ def roll(s, first, last): (1, 3) @arg s: A reference sequence. - @type s: string + @type s: any sequence type @arg first: First position of the pattern in the reference sequence. @type first: int @arg last: Last position of the pattern in the reference sequence. @@ -302,13 +364,13 @@ def palinsnoop(s): 0 @arg s: A nucleotide sequence. - @type s: string + @type s: unicode @return: The number of elements that are palindromic or -1 if the string is a 'palindrome'. - @rtype: string + @rtype: int """ - s_revcomp = Bio.Seq.reverse_complement(s) + s_revcomp = reverse_complement(s) for i in range(int(math.ceil(len(s) / 2.0))): if s[i] != s_revcomp[i]: @@ -330,12 +392,12 @@ def longest_common_prefix(s1, s2): 'abcdefg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common prefix of s1 and s2. 
- @rtype: string + @rtype: unicode @todo: This is mostly used just for the length of the returned string, and we could also return that directly. @@ -359,9 +421,9 @@ def longest_common_suffix(s1, s2): 'efg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common suffix of s1 and s2. @rtype: string @@ -380,15 +442,15 @@ def trim_common(s1, s2): ('xyzef', 'abc', 3, 1) @arg s1: A string. - @type s1: string + @type s1: unicode @arg s2: Another string. - @type s2: string + @type s2: unicode @return: A tuple of: - - string: Trimmed version of s1. - - string: Trimmed version of s2. - - int: Length of longest common prefix. - - int: Length of longest common suffix. + - unicode: Trimmed version of s1. + - unicode: Trimmed version of s2. + - int: Length of longest common prefix. + - int: Length of longest common suffix. @todo: More intelligently handle longest_common_prefix(). """ @@ -407,14 +469,14 @@ def is_dna(s): >>> is_dna('TACUGT') False - @arg s: Any string or Bio.Seq.Seq instance. - @type s: string + @arg s: Any string. + @type s: unicode @return: True if the string is a DNA string, False otherwise. @rtype: boolean """ - for i in str(s): - if not i in IUPAC.unambiguous_dna.letters: + for i in s: + if i not in 'ATCG': return False return True @@ -435,16 +497,16 @@ def in_frame_description(s1, s2) : ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. 
+ - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). @todo: Refactor this code (too many return statements). @@ -528,16 +590,16 @@ def out_of_frame_description(s1, s2): ('p.(Pro4Glnfs*5)', 3, 7, 7) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the first protein. - - int ; Last position of the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the first protein. + - int ; Last position of the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). """ @@ -573,23 +635,23 @@ def protein_description(cds_stop, s1, s2) : @arg cds_stop: Position of the stop codon in c. notation (CDS length). @type cds_stop: int @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. 
+ @rtype: tuple(unicode, int, int, int) """ if cds_stop % 3: - description = out_of_frame_description(str(s1), str(s2)) + description = out_of_frame_description(s1, s2) else: - description = in_frame_description(str(s1), str(s2)) + description = in_frame_description(s1, s2) - if not s2 or str(s1[0]) != str(s2[0]): + if not s2 or s1[0] != s2[0]: # Mutation in start codon. return 'p.?', description[1], description[2], description[3] @@ -603,7 +665,7 @@ def visualise_sequence(sequence, max_length=25, flank_size=6): string is clipped; otherwise the string is just returned. @arg sequence: DNA sequence. - @type sequence: str + @type sequence: unicode @arg max_length: Maximum length of visualised sequence. @type max_length: int @arg flank_size: Length of the flanks in clipped visualised sequence. @@ -629,19 +691,19 @@ def _insert_tag(s, pos1, pos2, tag1, tag2): anything either. @arg s: A sequence. - @type s: + @type s: unicode @arg pos1: Position of tag1. @type pos1: int @arg pos2: Position of tag2. @type pos2: int @arg tag1: Content of tag1. - @type tag1: string + @type tag1: unicode @arg tag2: Content of tag2. - @type tag2: string + @type tag2: unicode @return: The original sequence, or a sequence with eiter tag1, tag2 or both tags inserted. - @rtype: string + @rtype: unicode @todo: Cleanup (note: only used in print_protein_html). """ @@ -670,7 +732,7 @@ def print_protein_html(s, first, last, O, where, text=False): and is suitable for viewing in a monospaced font. @arg s: A protein sequence. - @type s: string + @type s: unicode @arg first: First position to highlight. @type first: int @arg last: Last position to highlight. @@ -678,7 +740,7 @@ def print_protein_html(s, first, last, O, where, text=False): @arg O: The Output object. @type O: Modules.Output.Output @arg where: Location in the {O} object to store the representation. - @type where: string + @type where: unicode @todo: Cleanup. 
""" @@ -701,7 +763,7 @@ def print_protein_html(s, first, last, O, where, text=False): o = 1 # Add the first position. - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) for i in range(0, len(s), block): # Add the blocks. @@ -714,13 +776,13 @@ def print_protein_html(s, first, last, O, where, text=False): # Add the position (while escaping any potential highlighting). if text: if first < o < last: - output = '%s%s%s ' % (tag2, str(o).rjust(m), tag1) + output = '%s%s%s ' % (tag2, unicode(o).rjust(m), tag1) else: - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) else: output = \ '<tt style="color:000000;font-weight:normal">%s</tt> ' % \ - str(o).rjust(m) + unicode(o).rjust(m) # Add last line. O.addOutput(where, output) @@ -748,10 +810,10 @@ def nice_filename(filename): Strip the path and the extention from a filename. @arg filename: A complete path plus extention. - @type filename: string + @type filename: unicode @return: The bare filename without a path and extention. - @rtype: string + @rtype: unicode """ return filename.split('/')[-1].split('.')[0] #nice_filename @@ -788,16 +850,16 @@ def format_usage(usage=None, keywords={}): @kwarg usage: The string to format. If omitted, the calling module's docstring is used. - @type usage: string + @type usage: unicode @kwarg keywords: A dictionary of (keyword, value) pairs used to format the usage string. If it does not contain the key 'command', it is added with the value of sys.argv[0]. - @type keywords: dictionary(string, string) + @type keywords: dictionary(unicode, unicode) @return: Formatted usage string. This is {usage} with any entries from {keywords} replaced and cut-off at the first occurence of two consecutive empty lines. 
- @rtype: string + @rtype: unicode """ if not usage: caller = inspect.stack()[1] diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 65dd7056..3f0ee422 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -9,17 +9,22 @@ Notes about naming positions: * translation -> begin/end * any range of bases -> first/last * interbase position (if two numbers are used) -> before/after + +Notes about string representations: +* All variant descriptions and their parts are unicode strings +* All reference sequences (and their mutated version) are Bio.Seq.Seq objects """ -from operator import itemgetter, attrgetter +from __future__ import unicode_literals + +from operator import attrgetter -import Bio -import Bio.Seq -from Bio.Seq import Seq +from Bio.Data import CodonTable from Bio.Alphabet import IUPAC from Bio.Alphabet import DNAAlphabet from Bio.Alphabet import ProteinAlphabet +from Bio.Alphabet import _verify_alphabet from mutalyzer import util from mutalyzer.db.models import Assembly @@ -126,14 +131,14 @@ def _check_argument(argument, reference, first, last, output): Do several checks for the optional argument of a variant. Raise a _RawVariantError exception if the checks fail. + @arg argument: The optional argument. + @type argument: unicode @arg reference: The reference sequence. - @type reference: string + @type reference: Bio.Seq.Seq @arg first: Start position of the variant. @type first: int @arg last: End position of the variant. @type last: int - @arg argument: The optional argument. - @type argument: string @arg output: The Output object. @type output: mutalyzer.Output.Output @@ -164,8 +169,8 @@ def _check_argument(argument, reference, first, last, output): 'Invalid letters in argument.') raise _NotDNAError() # And the DNA must match the reference sequence. 
- reference_slice = str(reference[first - 1:last]) - if reference_slice != str(argument): + reference_slice = unicode(reference[first - 1:last]) + if reference_slice != argument: # Todo: Be more informative. output.addMessage(__file__, 3, 'EREF', '%s not found at position %s, found %s ' \ @@ -286,9 +291,9 @@ def apply_substitution(position, original, substitute, mutator, record, O): @arg position: Genomic location of the substitution. @type position: int @arg original: Nucleotide in the reference sequence. - @type original: string + @type original: unicode @arg substitute: Nucleotide in the mutated sequence. - @type substitute: string + @type substitute: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -310,7 +315,7 @@ def apply_substitution(position, original, substitute, mutator, record, O): mutator.substitution(position, substitute) - record.name(position, position, 'subst', mutator.orig[position - 1], + record.name(position, position, 'subst', unicode(mutator.orig[position - 1]), substitute, None) #apply_substitution @@ -326,7 +331,7 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, @arg last: Genomic end position of the del/dup. @type last: int @arg type: The variant type (del or dup). - @type type: string + @type type: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -376,9 +381,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the forward strand ' \ 'it should be "%s" at position %s.' 
% ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) if forward_roll != original_forward_roll and not reverse_strand: @@ -388,9 +393,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, O.addMessage(__file__, 1, 'IROLLBACK', 'Sequence "%s" at position %s was not corrected to "%s" at ' \ 'position %s, since they reside in different exons.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[incorrect_first - 1:incorrect_stop])), + util.visualise_sequence(unicode(mutator.orig[incorrect_first - 1:incorrect_stop])), util.format_range(incorrect_first, incorrect_stop))) if reverse_roll and reverse_strand: @@ -400,9 +405,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the reverse strand ' \ 'it should be "%s" at position %s.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) # We don't go through the trouble of visualising the *corrected* variant @@ -434,7 +439,7 @@ def apply_inversion(first, last, mutator, record, O): @arg O: The Output object. 
@type O: Modules.Output.Output """ - snoop = util.palinsnoop(mutator.orig[first - 1:last]) + snoop = util.palinsnoop(unicode(mutator.orig[first - 1:last])) if snoop: # We have a reverse-complement-palindromic prefix. @@ -444,7 +449,7 @@ def apply_inversion(first, last, mutator, record, O): O.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is a palindrome ' \ '(its own reverse complement).' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last)) return else: @@ -453,10 +458,10 @@ def apply_inversion(first, last, mutator, record, O): 'palindrome (the first %i nucleotide(s) are the reverse ' \ 'complement of the last one(s)), the HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, snoop, util.visualise_sequence( - str(mutator.orig[first + snoop - 1: last - snoop])), + unicode(mutator.orig[first + snoop - 1: last - snoop])), first + snoop, last - snoop)) first += snoop last -= snoop @@ -466,8 +471,8 @@ def apply_inversion(first, last, mutator, record, O): if first == last: O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \ '%i is actually a substitution.' % first) - record.name(first, first, 'subst', mutator.orig[first - 1], - Bio.Seq.reverse_complement(mutator.orig[first - 1]), None) + record.name(first, first, 'subst', unicode(mutator.orig[first - 1]), + util.reverse_complement(unicode(mutator.orig[first - 1])), None) else : record.name(first, last, 'inv', '', '', None) #apply_inversion @@ -483,7 +488,7 @@ def apply_insertion(before, after, s, mutator, record, O): @arg after: Genomic position after the insertion. @type after: int @arg s: Nucleotides to be inserted. - @type s: string + @type s: nucleotide @arg mutator: A Mutator instance. 
@type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -547,7 +552,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'however, the HGVS notation prescribes that it should be a ' \ 'duplication of %s at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), before + forward_roll, before + forward_roll + insertion_length - 1)) after += forward_roll - 1 @@ -566,7 +571,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the forward strand it should be an insertion of %s ' \ 'at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), new_before + forward_roll, new_before + forward_roll + 1)) if forward_roll != original_forward_roll and not reverse_strand: @@ -576,7 +581,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'insertion of %s at position %i_%i, since they reside in ' \ 'different exons.' % ( s, before, before + 1, - mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll], + unicode(mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll]), new_before + original_forward_roll, new_before + original_forward_roll + 1)) if reverse_roll and reverse_strand: @@ -585,13 +590,13 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the reverse strand it should be an insertion of %s ' \ 'at position %i_%i.' 
% ( s, before, before + 1, - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll], + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]), new_before - reverse_roll, (new_before - reverse_roll) + 1)) record.name(before, before + 1, 'ins', - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), '', (reverse_roll, forward_roll), - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]) + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll])) #apply_insertion @@ -605,7 +610,7 @@ def apply_delins(first, last, insert, mutator, record, output): @arg last: Genomic end position of the delins. @type last: int @arg insert: Sequence to insert. - @type insert: string + @type insert: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -613,14 +618,13 @@ def apply_delins(first, last, insert, mutator, record, output): @arg output: The Output object. @type output: Modules.Output.Output """ - delete = mutator.orig[first - 1:last] + delete = unicode(mutator.orig[first - 1:last]) - if str(delete) == str(insert): + if delete == insert: output.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is identical to ' \ 'the variant.' 
% ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), - first, last)) + util.visualise_sequence(delete), first, last)) return delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert) @@ -646,7 +650,7 @@ def apply_delins(first, last, insert, mutator, record, output): mutator, record, output) return - if str(Bio.Seq.reverse_complement(delete_trimmed)) == insert_trimmed: + if util.reverse_complement(delete_trimmed) == insert_trimmed: output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ 'is actually an inversion.') apply_inversion(first + lcp, last - lcs, mutator, @@ -658,7 +662,7 @@ def apply_delins(first, last, insert, mutator, record, output): 'Sequence "%s" at position %i_%i has the same prefix or ' \ 'suffix as the inserted sequence "%s". The HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, insert, insert_trimmed, first + lcp, last - lcs)) mutator.delins(first + lcp, last - lcs, insert_trimmed) @@ -952,17 +956,19 @@ def process_raw_variant(mutator, variant, record, transcript, output): """ variant, original_description = variant.RawVar, variant[-1] - # {argument} may be a number, or a subsequence of the reference. - # {sequence} is the variant subsequence. - argument = variant.Arg1 - sequence = variant.Arg2 + # `argument` may be a number, or a subsequence of the reference. + # `sequence` is the variant subsequence. + # Note that pyparsing will return `str('')` if the attribute does not + # exist, so we explicitly convert the result to unicode. + argument = unicode(variant.Arg1) + sequence = unicode(variant.Arg2) # If we are on the reverse strand, subsequences must be in reverse # complement.
if transcript and transcript.CM.orientation == -1: - sequence = Bio.Seq.reverse_complement(sequence) + sequence = util.reverse_complement(sequence) if util.is_dna(argument): - argument = Bio.Seq.reverse_complement(argument) + argument = util.reverse_complement(argument) # Get genomic first and last positions for this variant. Below we handle # the different ways of describing these positions. @@ -1189,7 +1195,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): def parse_sequence(seq): if seq.Sequence: if transcript and transcript.CM.orientation == -1: - return Bio.Seq.reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if seq.StartLoc and seq.EndLoc: @@ -1228,9 +1234,9 @@ def process_raw_variant(mutator, variant, record, transcript, output): 'Position %s is out of range.' % range_last) raise _RawVariantError() - insertion = mutator.orig[range_first - 1:range_last] + insertion = unicode(mutator.orig[range_first - 1:range_last]) if seq.Inv: - insertion = Bio.Seq.reverse_complement(str(insertion)) + insertion = util.reverse_complement(insertion) return insertion @@ -1245,7 +1251,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): seqs = reversed(variant.SeqList) else: seqs = variant.SeqList - insertion = ''.join(str(parse_sequence(seq)) + insertion = ''.join(parse_sequence(seq) for seq in seqs) else: insertion = parse_sequence(variant.Seq) @@ -1316,32 +1322,33 @@ def _add_transcript_info(mutator, transcript, output): if transcript.transcribe: output.addOutput('myTranscriptDescription', transcript.description or '=') output.addOutput('origMRNA', - str(util.splice(mutator.orig, transcript.mRNA.positionList))) + unicode(util.splice(mutator.orig, transcript.mRNA.positionList))) output.addOutput('mutatedMRNA', - str(util.splice(mutator.mutated, + unicode(util.splice(mutator.mutated, mutator.shift_sites(transcript.mRNA.positionList)))) # Add protein prediction to output. 
if transcript.translate: - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna - #output.addOutput('origCDS', cds_original) - - if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) - - if not util.is_dna(cds_original): + if not _verify_alphabet(cds_original): output.addMessage(__file__, 4, 'ENODNA', 'Invalid letters in reference sequence.') return + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna + + #output.addOutput('origCDS', cds_original) + + if transcript.CM.orientation == -1: + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() + if '*' in cds_original.translate(table=transcript.txTable)[:-1]: output.addMessage(__file__, 3, 'ESTOP', 'In frame stop codon found.') @@ -1354,36 +1361,35 @@ def _add_transcript_info(mutator, transcript, output): # Note: addOutput('origCDS', ...) was first before the possible # reverse complement operation above. 
- output.addOutput('origCDS', cds_original) - output.addOutput("newCDS", cds_variant[:(len(str(protein_variant)) + 1) * 3]) + output.addOutput('origCDS', unicode(cds_original)) + output.addOutput("newCDS", unicode(cds_variant[:(len(protein_variant) + 1) * 3])) - output.addOutput('oldprotein', protein_original + '*') + output.addOutput('oldprotein', unicode(protein_original) + '*') # Todo: Don't generate the fancy HTML protein views here, do this in # website.py. # I think it would also be nice to include the mutated list of splice # sites. - if not protein_variant or protein_variant[0] != 'M': + if not protein_variant or unicode(protein_variant[0]) != 'M': # Todo: Protein differences are not color-coded, # use something like below in protein_description(). - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancy') - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancyText', text=True) - if str(cds_variant[0:3]) in \ - Bio.Data.CodonTable.unambiguous_dna_by_id \ - [transcript.txTable].start_codons: + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancy') + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancyText', text=True) + if unicode(cds_variant[0:3]) in \ + CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: output.addOutput('newprotein', '?') util.print_protein_html('?', 0, 0, output, 'newProteinFancy') util.print_protein_html('?', 0, 0, output, 'newProteinFancyText', text=True) - output.addOutput('altStart', str(cds_variant[0:3])) - if str(protein_original[1:]) != str(protein_variant[1:]): + output.addOutput('altStart', unicode(cds_variant[0:3])) + if unicode(protein_original[1:]) != unicode(protein_variant[1:]): output.addOutput('altProtein', - 'M' + protein_variant[1:] + '*') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + 'M' + unicode(protein_variant[1:]) + '*') + util.print_protein_html('M' + 
unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancy') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancyText', text=True) else : output.addOutput('newprotein', '?') @@ -1395,21 +1401,22 @@ def _add_transcript_info(mutator, transcript, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) descr, first, last_original, last_variant = \ - util.protein_description(cds_length, protein_original, - protein_variant) + util.protein_description(cds_length, + unicode(protein_original), + unicode(protein_variant)) # This is never used. output.addOutput('myProteinDescription', descr) - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancy') - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancyText', text=True) - if str(protein_original) != str(protein_variant): - output.addOutput('newprotein', protein_variant + '*') - util.print_protein_html(protein_variant + '*', first, + if unicode(protein_original) != unicode(protein_variant): + output.addOutput('newprotein', unicode(protein_variant) + '*') + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancy') - util.print_protein_html(protein_variant + '*', first, + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancyText', text=True) #_add_transcript_info @@ -1473,6 +1480,7 @@ def process_variant(mutator, description, record, output): if description.LrgAcc: # LRG case, pick the top gene. 
gene = record.record.geneList[0] + if transcript_id: transcript = gene.findLocus(transcript_id) if not transcript: @@ -1481,7 +1489,7 @@ def process_variant(mutator, description, record, output): # NG_012772.1). output.addMessage(__file__, 4, "ENOTRANSCRIPT", "Multiple transcripts found for gene %s. Please " \ - "choose from: %s" %(gene.name, + "choose from: %s" % (gene.name, ", ".join(gene.listLoci()))) else: # No transcript id given. @@ -1563,10 +1571,10 @@ def process_variant(mutator, description, record, output): 'Protein level descriptions can only be done on a protein or transcript reference.') raise _VariantError() else: - cds = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) + cds = util.splice(mutator.orig, transcript.CDS.positionList) + cds.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds = Bio.Seq.reverse_complement(cds) + cds = cds.reverse_complement() protein = cds.translate(table=transcript.txTable, cds=True, to_stop=True) mutator.orig = protein mutator.mutated = protein @@ -1644,12 +1652,12 @@ def check_variant(description, output): if parsed_description.LrgAcc: record_id = parsed_description.LrgAcc - elif parsed_description.Version: - record_id = parsed_description.RefSeqAcc + '.' + parsed_description.Version + elif parsed_description.RefSeqAcc: + if parsed_description.Version: + record_id = parsed_description.RefSeqAcc + '.' 
+ parsed_description.Version + else: + record_id = parsed_description.RefSeqAcc else: - record_id = parsed_description.RefSeqAcc - - if not record_id: output.addMessage(__file__, 4, 'ENOREF', 'No reference sequence given.') return @@ -1657,7 +1665,7 @@ def check_variant(description, output): if parsed_description.LrgAcc: filetype = 'LRG' - transcript_id = parsed_description.LRGTranscriptID + transcript_id = parsed_description.LRGTranscriptID or '' retriever = Retriever.LRGRetriever(output) else: filetype = 'GB' @@ -1732,8 +1740,8 @@ def check_variant(description, output): except _VariantError: return - output.addOutput('original', str(mutator.orig)) - output.addOutput('mutated', str(mutator.mutated)) + output.addOutput('original', unicode(mutator.orig)) + output.addOutput('mutated', unicode(mutator.mutated)) # Chromosomal region (only for GenBank human transcript references). # This is still quite ugly code, and should be cleaned up once we have @@ -1775,17 +1783,18 @@ def check_variant(description, output): transcript.proteinDescription = 'p.?' 
continue - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna + + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() #if '*' in cds_original.translate()[:-1]: # output.addMessage(__file__, 3, "ESTOP", @@ -1801,7 +1810,7 @@ def check_variant(description, output): # FIXME this is a bit of a rancid fix. protein_original = cds_original.translate( table=transcript.txTable, cds=True, to_stop=True) - except Bio.Data.CodonTable.TranslationError: + except CodonTable.TranslationError: if transcript.current: output.addMessage( __file__, 2, "WTRANS", @@ -1822,7 +1831,7 @@ def check_variant(description, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) transcript.proteinDescription = util.protein_description( - cds_length, protein_original, protein_variant)[0] + cds_length, unicode(protein_original), unicode(protein_variant))[0] except IndexError: # Todo: Probably CDS start was hit by removal of exon.. transcript.proteinDescription = 'p.?' 
diff --git a/mutalyzer/website/__init__.py b/mutalyzer/website/__init__.py index 730c33e8..2ce0450b 100644 --- a/mutalyzer/website/__init__.py +++ b/mutalyzer/website/__init__.py @@ -3,6 +3,8 @@ Mutalyzer website interface using the Flask framework. """ +from __future__ import unicode_literals + import logging import os import pkg_resources diff --git a/mutalyzer/website/templates/base.html b/mutalyzer/website/templates/base.html index 2f45caf9..270e3bdf 100644 --- a/mutalyzer/website/templates/base.html +++ b/mutalyzer/website/templates/base.html @@ -22,7 +22,7 @@ src="{{ url_for('static', filename='js/generator.js') }}"> </script> <meta http-equiv="Content-Type" - content="text/html; charset=iso-8859-1"> + content="text/html; charset=utf-8"> <title>Mutalyzer {{ mutalyzer_version }} — {{ page_title }}</title> </head> <body diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 475330b7..03644dcc 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -3,6 +3,8 @@ Mutalyzer website views. """ +from __future__ import unicode_literals + import bz2 import os import pkg_resources @@ -144,7 +146,7 @@ def soap_api(): xsl_doc = etree.parse(xsl_handle) transform = etree.XSLT(xsl_doc) - return make_response(str(transform(wsdl_doc))) + return make_response(unicode(transform(wsdl_doc))) @website.route('/downloads/<string:filename>') @@ -159,7 +161,7 @@ def downloads(filename): except jinja2.exceptions.TemplateNotFound: abort(404) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -233,10 +235,7 @@ def name_checker(): % (description, request.remote_addr)) stats.increment_counter('name-checker/website') - # Todo: The following is probably a problem elsewhere too. We stringify - # the variant, because a unicode string crashes BioPython's - # `reverse_complement`. 
- variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) errors, warnings, summary = output.Summary() parse_error = output.getOutput('parseError') @@ -272,18 +271,20 @@ def name_checker(): # Experimental description extractor. if (output.getIndexedOutput('original', 0) and output.getIndexedOutput('mutated', 0)): + extracted = extractedProt = '(skipped)' + allele = describe.describe(output.getIndexedOutput('original', 0), output.getIndexedOutput('mutated', 0)) - prot_allele = describe.describe( - output.getIndexedOutput('oldprotein', 0), - output.getIndexedOutput('newprotein', 0, default=''), - DNA=False) - - extracted = extractedProt = '(skipped)' if allele: extracted = describe.alleleDescription(allele) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + + if output.getIndexedOutput('oldprotein', 0): + prot_allele = describe.describe( + output.getIndexedOutput('oldprotein', 0), + output.getIndexedOutput('newprotein', 0, default=''), + DNA=False) + if prot_allele: + extractedProt = describe.alleleDescription(prot_allele) else: extracted = extractedProt = '' @@ -350,11 +351,10 @@ def bed(): if not description: abort(404) - return render_template('name-checker.html') output = Output(__file__) - variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) raw_variants = output.getIndexedOutput('rawVariantsChromosomal', 0) if not raw_variants: @@ -376,14 +376,14 @@ def bed(): for descr, positions in raw_variants[2]: bed += '\t'.join([raw_variants[0], - str(min(positions) - 1), - str(max(positions)), + unicode(min(positions) - 1), + unicode(max(positions)), descr, '0', raw_variants[1]]) + '\n' response = make_response(bed) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -579,7 +579,7 @@ def reference_loader_submit(): output = Output(__file__) 
output.addMessage(__file__, -1, 'INFO', 'Received request upload(%s) with arguments %s from %s' - % (method, str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) assemblies = Assembly.query \ .order_by(Assembly.taxonomy_common_name.asc(), @@ -668,11 +668,11 @@ def reference_loader_submit(): if not ud: errors.append('The request could not be completed') - errors.extend(str(m) for m in output.getMessages()) + errors.extend(unicode(m) for m in output.getMessages()) output.addMessage(__file__, -1, 'INFO', 'Finished request upload(%s) with arguments %s from %s' - % (method, str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) return render_template('reference-loader.html', assemblies=assemblies, @@ -737,7 +737,7 @@ def reference(filename): response = make_response(bz2.BZ2File(file_path, 'r').read()) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -894,7 +894,7 @@ def batch_job_result(result_id): return send_from_directory(settings.CACHE_DIR, 'batch-job-%s.txt' % result_id, - mimetype='text/plain', + mimetype='text/plain; charset=utf-8', as_attachment=True) @@ -933,10 +933,7 @@ def lovd_get_gs(): % (mutation_name, variant_record, forward, request.remote_addr)) - # Todo: The following is probably a problem elsewhere too. - # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in mapping.py:607. 
- variantchecker.check_variant(str(mutation_name), output) + variantchecker.check_variant(mutation_name, output) output.addMessage(__file__, -1, 'INFO', 'Finished request getGS(%s, %s, %s)' @@ -955,11 +952,11 @@ def lovd_get_gs(): standalone=1)) else: response = make_response(l[0]) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response response = make_response('Transcript not found') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -1041,7 +1038,7 @@ def lovd_variant_info(): assembly = Assembly.by_name_or_alias(build) except NoResultFound: response = make_response('invalid build') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response converter = Converter(assembly, output) @@ -1079,7 +1076,7 @@ def lovd_variant_info(): response = re.sub('^Error \(.*\):', 'Error:', result) response = make_response(result) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response diff --git a/tests/fixtures.py b/tests/fixtures.py index 595d72a6..71b1ae1b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,6 +7,8 @@ as :func:`hg19` must be called after the :func:`database` fixture). 
""" +from __future__ import unicode_literals + import os import shutil diff --git a/tests/old/lrgtest.py b/tests/old/lrgtest.py index afeefc33..d2dae2bc 100644 --- a/tests/old/lrgtest.py +++ b/tests/old/lrgtest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/maptest.py b/tests/old/maptest.py index 7f3105a4..40dc1d15 100644 --- a/tests/old/maptest.py +++ b/tests/old/maptest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/recordtest.py b/tests/old/recordtest.py index d55bd58c..a9cc9354 100644 --- a/tests/old/recordtest.py +++ b/tests/old/recordtest.py @@ -2,6 +2,7 @@ recordtest.py contains TestRecord - a BaseClass for testing GenRecord.Record instances """ +from __future__ import unicode_literals import unittest, types from Modules import GenRecord #test class-types @@ -56,7 +57,7 @@ class TestRecord(unittest.TestCase): self.assertTrue(isinstance(plist, (types.NoneType, GenRecord.PList))) - #self.assertTrue(any(map(isinstance, + #self.assertTrue(any(map(isinstance, def _test_if_loc(self, loc): @@ -76,7 +77,5 @@ class TestRecord(unittest.TestCase): if __name__ == "__main__": - # This file should be imported + # This file should be imported pass - - diff --git a/tests/test_crossmap.py b/tests/test_crossmap.py index ff9d6d75..990f93fe 100644 --- a/tests/test_crossmap.py +++ b/tests/test_crossmap.py @@ -3,6 +3,8 @@ Tests for the Crossmap module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.Crossmap import Crossmap diff --git a/tests/test_describe.py b/tests/test_describe.py index 8315213e..e81c7ce4 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.describe module. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 1ebaa399..dad9a9c6 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.grammar module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 5ebdc60e..620f9d75 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -3,6 +3,8 @@ Tests for the mapping module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from sqlalchemy import or_ diff --git a/tests/test_mutator.py b/tests/test_mutator.py index 36c5b8d1..05e2c685 100644 --- a/tests/test_mutator.py +++ b/tests/test_mutator.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.mutator module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import re import os @@ -666,7 +668,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('ACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACGATCG')) def test_largedel(self): """ @@ -674,7 +676,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 7) - assert str(m.mutated) == str(Seq('AG')) + assert unicode(m.mutated) == unicode(Seq('AG')) def test_ins(self): """ @@ -682,7 +684,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATACGATCG')) def test_largeins(self): """ @@ -690,7 +692,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') - assert str(m.mutated) == str(Seq('ATATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGCGATCG')) def test_sub(self): """ @@ -698,7 +700,7 @@ 
class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGATCG')) def test_adjecent_del_sub_1(self): """ @@ -709,7 +711,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_sub_2(self): """ @@ -718,7 +720,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_near_adjecent_del_sub_1(self): """ @@ -727,7 +729,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ACTATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTATCG')) def test_near_adjecent_del_sub_2(self): """ @@ -736,7 +738,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 4) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGCATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCATCG')) def test_adjecent_largedel_sub_1(self): """ @@ -746,7 +748,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 6) m.substitution(7, 'T') - assert str(m.mutated) == str(Seq('ATG')) + assert unicode(m.mutated) == unicode(Seq('ATG')) def test_adjecent_largedel_sub_2(self): """ @@ -756,7 +758,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACG')) + assert unicode(m.mutated) == unicode(Seq('ACG')) def test_near_adjecent_largedel_sub_1(self): """ @@ -765,7 +767,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 5) m.substitution(7, 
'T') - assert str(m.mutated) == str(Seq('ATTG')) + assert unicode(m.mutated) == unicode(Seq('ATTG')) def test_near_adjecent_largedel_sub_2(self): """ @@ -774,7 +776,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACCG')) + assert unicode(m.mutated) == unicode(Seq('ACCG')) def test_adjectent_del_ins_1(self): """ @@ -783,7 +785,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCGATCG')) def test_adjectent_del_ins_2(self): """ @@ -792,7 +794,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATAGATCG')) def test_near_adjectent_del_ins(self): """ @@ -801,7 +803,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(3, 'T') - assert str(m.mutated) == str(Seq('ACTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTGATCG')) def test_adjecent_ins_sub_1(self): """ @@ -811,7 +813,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATAGGATCG')) def test_adjecent_ins_sub_2(self): """ @@ -821,7 +823,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGACGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGACGATCG')) def test_near_adjecent_ins_sub(self): """ @@ -831,7 +833,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATACTATCG')) + assert unicode(m.mutated) == 
unicode(Seq('ATACTATCG')) def test_adjecent_largeins_sub_1(self): """ @@ -841,7 +843,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATATCGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGGGATCG')) def test_adjecent_largeins_sub_2(self): """ @@ -851,7 +853,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCGCGATCG')) def test_near_adjecent_largeins_sub(self): """ @@ -861,7 +863,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATATCGCTATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGCTATCG')) def test_adjecent_del_del_1(self): """ @@ -870,7 +872,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_del_del_2(self): """ @@ -879,7 +881,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_delins_snp_1(self): """ @@ -888,7 +890,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_delins_snp_2(self): """ @@ -897,7 +899,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGAGATCG')) def 
test_adjecent_largedelins_eq_snp_1(self): """ @@ -907,7 +909,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGG')) def test_adjecent_largedelins_min_snp_1(self): """ @@ -917,7 +919,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGG')) def test_adjecent_largedelins_plus_snp_1(self): """ @@ -927,7 +929,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGG')) def test_adjecent_largedelins_eq_snp_2(self): """ @@ -937,7 +939,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAG')) def test_adjecent_largedelins_min_snp_2(self): """ @@ -947,7 +949,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAG')) def test_adjecent_largedelins_plus_snp_2(self): """ @@ -957,7 +959,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAAAG')) def test_adjecent_delins_del_1(self): """ @@ -966,7 +968,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == 
unicode(Seq('AAGATCG')) def test_adjecent_delins_del_2(self): """ @@ -975,7 +977,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_largedelins_eq_del_1(self): """ @@ -985,7 +987,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_1(self): """ @@ -995,7 +997,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def test_adjecent_largedelins_plus_del_1(self): """ @@ -1005,7 +1007,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) def test_adjecent_largedelins_eq_del_2(self): """ @@ -1015,7 +1017,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_2(self): """ @@ -1025,7 +1027,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def test_adjecent_largedelins_plus_del_2(self): """ @@ -1035,7 +1037,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) 
def test_adjectent_delins_ins_1(self): """ @@ -1044,7 +1046,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) def test_adjectent_delins_ins_2(self): """ @@ -1053,7 +1055,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGAGATCG')) def test_adjectent_largedelins_eq_ins_1(self): """ @@ -1062,7 +1064,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGCG')) def test_adjectent_largedelins_min_ins_1(self): """ @@ -1071,7 +1073,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGCG')) def test_adjectent_largedelins_plus_ins_1(self): """ @@ -1080,7 +1082,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGCG')) def test_adjectent_largedelins_eq_ins_2(self): """ @@ -1089,7 +1091,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAG')) def test_adjectent_largedelins_min_ins_2(self): """ @@ -1098,7 +1100,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAG')) + assert unicode(m.mutated) == 
unicode(Seq('ATGAAAG')) def test_adjectent_largedelins_plus_ins_2(self): """ @@ -1107,7 +1109,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAAAG')) def test_adjectent_delins_del_delins(self): """ @@ -1116,7 +1118,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 3, 'A') m.delins(4, 4, 'T') - assert str(m.mutated) == str(Seq('AATATCG')) + assert unicode(m.mutated) == unicode(Seq('AATATCG')) def test_adjectent_largedelins_plus_delins_1(self): """ @@ -1125,7 +1127,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAAAAAATG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAATG')) def test_adjectent_largedelins_plus_delins_2(self): """ @@ -1134,7 +1136,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.delins(2, 2, 'C') - assert str(m.mutated) == str(Seq('ACAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAAAAAG')) def test_adjectent_largedelins_min_delins_1(self): """ @@ -1143,7 +1145,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAATG')) + assert unicode(m.mutated) == unicode(Seq('AAAATG')) def test_adjectent_largedelins_min_delins_2(self): """ @@ -1152,7 +1154,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.delins(2, 2, 'C') - assert str(m.mutated) == str(Seq('ACAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAG')) def test_adjectent_del_dup_1(self): """ @@ -1161,7 +1163,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ACCGATCG')) 
+ assert unicode(m.mutated) == unicode(Seq('ACCGATCG')) def test_adjectent_del_dup_2(self): """ @@ -1170,7 +1172,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTGATCG')) def test_adjectent_ins_dup_1(self): """ @@ -1179,7 +1181,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATGCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCCGATCG')) def test_adjectent_ins_dup_2(self): """ @@ -1188,7 +1190,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTGCGATCG')) def test_adjectent_ins_ins_1(self): """ @@ -1197,7 +1199,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(3, 'A') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_adjectent_ins_ins_2(self): """ @@ -1206,7 +1208,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_ins_ins(self): """ @@ -1215,7 +1217,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(2, 'A') - assert str(m.mutated) in (str(Seq('ATGACGATCG')), str(Seq('ATAGCGATCG'))) + assert unicode(m.mutated) in (unicode(Seq('ATGACGATCG')), unicode(Seq('ATAGCGATCG'))) def test_adjecent_inv_inv_1(self): """ @@ -1224,7 +1226,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert 
unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_inv_inv_2(self): """ @@ -1233,7 +1235,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_dup_dup_1(self): """ @@ -1242,7 +1244,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTCCGATCG')) def test_adjecent_dup_dup_2(self): """ @@ -1251,7 +1253,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTCCGATCG')) def test_adjecent_del_inv_1(self): """ @@ -1260,7 +1262,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_inv_2(self): """ @@ -1269,7 +1271,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_ins_inv_1(self): """ @@ -1278,7 +1280,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(3, 3) - assert str(m.mutated) == str(Seq('ATGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGGATCG')) def test_adjecent_ins_inv_2(self): """ @@ -1287,4 +1289,4 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) diff --git a/tests/test_parsers_genbank.py 
b/tests/test_parsers_genbank.py index 7640c496..f04b8839 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.parsers.genbank module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.parsers import genbank diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index fc5e4abe..6f0b4c4e 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,9 +3,11 @@ Tests for the Scheduler module. """ +from __future__ import unicode_literals + import bz2 import os -import StringIO +import io #import logging; logging.basicConfig() from Bio import Entrez @@ -33,7 +35,7 @@ class TestScheduler(MutalyzerTest): file_instance = File.File(output.Output('test')) scheduler = Scheduler.Scheduler() - batch_file = StringIO.StringIO('\n'.join(variants) + '\n') + batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('utf-8')) job, columns = file_instance.parseBatchFile(batch_file) result_id = scheduler.addJob('test@test.test', job, columns, job_type, argument=argument) diff --git a/tests/test_services_json.py b/tests/test_services_json.py index ce029ba7..8df9b748 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -3,6 +3,8 @@ Tests for the JSON interface to Mutalyzer. """ +from __future__ import unicode_literals + import simplejson as json from spyne.server.null import NullServer import mutalyzer @@ -77,7 +79,7 @@ class TestServicesJson(MutalyzerTest): Running the info method should give us some version information. 
""" r = self._call('info') - assert type(r['versionParts']) == list + assert isinstance(r['versionParts'], list) assert r['version'] == mutalyzer.__version__ def test_info_announcement(self): @@ -86,12 +88,12 @@ class TestServicesJson(MutalyzerTest): """ announce.set_announcement('Test announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'Test announcement' announce.set_announcement('New announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'New announcement' announce.unset_announcement() diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index cc1ce8c0..0882c9fb 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -3,6 +3,8 @@ Tests for the SOAP interface to Mutalyzer. """ +from __future__ import unicode_literals + import bz2 import datetime import logging @@ -539,8 +541,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' #.encode('base64') - result = self._call('submitBatchJob', data, 'NameChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -564,8 +566,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -586,8 +588,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r'.join(variants) + '\r' - result = 
self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -608,8 +610,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r\n'.join(variants) + '\r\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -640,7 +642,7 @@ facilisi.""" data += data try: - self._call('submitBatchJob', data.encode('base64'), 'NameChecker') + self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') assert False except Fault as e: # - senv:Client.RequestTooLong: Raised by Spyne, depending on @@ -661,7 +663,7 @@ facilisi.""" data = f.read() result = self._call('uploadGenBankLocalFile', data) - ud = str(result) + ud = unicode(result) r = self._call('runMutalyzer', ud + '(SDHD):g.7872G>T') assert r.errors == 0 diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 1b30786b..8c19421a 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -3,6 +3,8 @@ Tests for the variantchecker module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.output import Output diff --git a/tests/test_website.py b/tests/test_website.py index e579433a..c649925e 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -5,6 +5,8 @@ Tests for the WSGI interface to Mutalyzer. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() import bz2 import cgi @@ -12,7 +14,7 @@ import logging from mock import patch import os import re -from StringIO import StringIO +from io import BytesIO import time import urllib import urllib2 @@ -264,7 +266,7 @@ class TestWebsite(MutalyzerTest): """ data = {'job_type': job_type, 'email': 'test@test.test', - 'file': (StringIO(file), 'test.txt')} + 'file': (BytesIO(file.encode('utf-8')), 'test.txt')} if assembly_name_or_alias is not None: data['assembly_name_or_alias'] = assembly_name_or_alias @@ -510,7 +512,7 @@ class TestWebsite(MutalyzerTest): Download a C# example client for the web service. """ r = self.app.get('/downloads/client-mono.cs') - assert r.headers['Content-Type'] == 'text/plain' + assert 'text/plain' in r.headers['Content-Type'] assert 'public static void Main(String [] args) {' in r.data def test_download_php(self): @@ -634,7 +636,7 @@ class TestWebsite(MutalyzerTest): 'build': 'hg19', 'acc': 'NM_203473.1'}) assert 'text/plain' in r.headers['Content-Type'] - assert r.content_type == 'text/plain' + assert 'text/plain' in r.content_type expected = '\n'.join(['-158', '1709', '1371']) assert r.data == expected @@ -678,7 +680,7 @@ class TestWebsite(MutalyzerTest): """ r = self.app.post('/reference-loader', data={'method': 'upload', - 'file': (StringIO('this is not a genbank file'), 'AB026906.1.gb')}) + 'file': (BytesIO('this is not a genbank file'.encode('utf-8')), 'AB026906.1.gb')}) assert 'Your reference sequence was loaded successfully.' not in r.data assert 'The file could not be parsed.' in r.data diff --git a/tests/utils.py b/tests/utils.py index befa5d72..f9cfce8b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,6 +3,8 @@ Utilities for unit tests. """ +from __future__ import unicode_literals + from functools import wraps import os import shutil -- GitLab