diff --git a/extras/log-tools/find-crashes.py b/extras/log-tools/find-crashes.py index 0e6d791ef19995d3708a982573b40861822ca71f..cf6ba98600a0a4d4afcfb87f2e4ae136e9254096 100755 --- a/extras/log-tools/find-crashes.py +++ b/extras/log-tools/find-crashes.py @@ -9,6 +9,8 @@ crashed. """ +from __future__ import unicode_literals + import os from mutalyzer import config diff --git a/extras/monitor/mutalyzer-monitor.py b/extras/monitor/mutalyzer-monitor.py index b5ea49fdbfac865afec348dd163759d70905bd98..43e49abc2f1e502e9a7805efabc3090d06134853 100755 --- a/extras/monitor/mutalyzer-monitor.py +++ b/extras/monitor/mutalyzer-monitor.py @@ -15,6 +15,8 @@ Currently implemented checks: """ +from __future__ import unicode_literals + import argparse import logging import sys diff --git a/extras/soap-tools/batchjob.py b/extras/soap-tools/batchjob.py index 7558b98d8e284d0c0de8e7267c406145153bd8b1..de11bc2ac7d64c64efb00158e158fb7e5a9e19a3 100755 --- a/extras/soap-tools/batchjob.py +++ b/extras/soap-tools/batchjob.py @@ -17,6 +17,8 @@ to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/checkSyntax.py b/extras/soap-tools/checkSyntax.py index 78c63e5c902e25d0944b744dfc04691ef6053f40..a2bf32d780966a40f25fbc8846fbb41b61195bdf 100755 --- a/extras/soap-tools/checkSyntax.py +++ b/extras/soap-tools/checkSyntax.py @@ -12,6 +12,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/chromAccession.py b/extras/soap-tools/chromAccession.py index 4fb6e04f1b3baa844bcf50c31f6dd3f826c7ce73..457277d8e278093df25831c7e4de88f2b7d7cde6 100755 --- a/extras/soap-tools/chromAccession.py +++ b/extras/soap-tools/chromAccession.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/descriptionExtract.py b/extras/soap-tools/descriptionExtract.py index 7ca3b2eceefa27fa53b1f41794a8d92dd36b1bba..3889ca414ee9f1054d6350e6f1b87f4b4e909fe4 100755 --- a/extras/soap-tools/descriptionExtract.py +++ b/extras/soap-tools/descriptionExtract.py @@ -14,6 +14,8 @@ service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getCache.py b/extras/soap-tools/getCache.py index 2f9c7df218b3c831964671a622f6f44d14f4d039..07a86818946b31a0de8e2555d3b62a85af05c8a2 100755 --- a/extras/soap-tools/getCache.py +++ b/extras/soap-tools/getCache.py @@ -12,6 +12,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneAndTranscript.py b/extras/soap-tools/getGeneAndTranscript.py index 8946d59e71c8fb280b4e4e240acdd019f3fe24bd..e4ba939b0a335a34e43b85ff1135c3ada19d8aca 100755 --- a/extras/soap-tools/getGeneAndTranscript.py +++ b/extras/soap-tools/getGeneAndTranscript.py @@ -13,6 +13,8 @@ web service and printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getGeneName.py b/extras/soap-tools/getGeneName.py index e3b7dd01445c37602131ffa73e51f680255ee376..ad4ce8c4afe8ad25780a778f76a0d28eaa4f0990 100755 --- a/extras/soap-tools/getGeneName.py +++ b/extras/soap-tools/getGeneName.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscripts.py b/extras/soap-tools/getTranscripts.py index 51052fca68208719de8002af8b44418120180eb3..82af32191ee18635a07ba55472be91f64a8d830d 100755 --- a/extras/soap-tools/getTranscripts.py +++ b/extras/soap-tools/getTranscripts.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsAndInfo.py b/extras/soap-tools/getTranscriptsAndInfo.py index 86dc3ff446887e970cd6c521b998629848904943..12b94d86003fb96f3af035a9446a9788615c1bd7 100755 --- a/extras/soap-tools/getTranscriptsAndInfo.py +++ b/extras/soap-tools/getTranscriptsAndInfo.py @@ -14,6 +14,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsByGeneName.py b/extras/soap-tools/getTranscriptsByGeneName.py index d7789a0acbe91b85aef602f9771f25dfd13068a6..f31ff6ba6e667794fdfe3cbaf95f76dcf222038f 100755 --- a/extras/soap-tools/getTranscriptsByGeneName.py +++ b/extras/soap-tools/getTranscriptsByGeneName.py @@ -12,6 +12,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getTranscriptsMapping.py b/extras/soap-tools/getTranscriptsMapping.py index 79683369ed86b478aabc89c20c9195634a65a3f2..891dfa75a11100689d7b3f6d3948e8d0abd5ecf8 100755 --- a/extras/soap-tools/getTranscriptsMapping.py +++ b/extras/soap-tools/getTranscriptsMapping.py @@ -16,6 +16,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/getdbSNPDescriptions.py b/extras/soap-tools/getdbSNPDescriptions.py index f5745533067a6e675077d5b9756bd9b7fcd75160..5be99c735012d7cc176e24396af40ccd350c7b43 100755 --- a/extras/soap-tools/getdbSNPDescriptions.py +++ b/extras/soap-tools/getdbSNPDescriptions.py @@ -12,6 +12,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/info.py b/extras/soap-tools/info.py index eb3cd058044621745a59d464bcfd70ca57602a19..1a4ea6e43335330798767d9aee73a880833848b6 100755 --- a/extras/soap-tools/info.py +++ b/extras/soap-tools/info.py @@ -10,6 +10,8 @@ printed to standard output. 
""" +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/mappingInfo.py b/extras/soap-tools/mappingInfo.py index 49fb4ac404df042d044ce9b6525e2084a8a992f0..7a473b1c9a6cfd86401e75bad22b55ca5f123f2e 100755 --- a/extras/soap-tools/mappingInfo.py +++ b/extras/soap-tools/mappingInfo.py @@ -14,6 +14,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/numberConversion.py b/extras/soap-tools/numberConversion.py index 977bbc719ce83dd34b2047add81ff55cdd978fa8..bd5262f4bb19d75d4d852593ac0ebfd116d627e0 100755 --- a/extras/soap-tools/numberConversion.py +++ b/extras/soap-tools/numberConversion.py @@ -13,6 +13,8 @@ web service and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/runMutalyzer.py b/extras/soap-tools/runMutalyzer.py index 0a2d1e7593db0eed2963cdd80606a015f1ec7a11..475cc6c18c8f6aab61bdb5a952c5d448ec99af2b 100755 --- a/extras/soap-tools/runMutalyzer.py +++ b/extras/soap-tools/runMutalyzer.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sliceChromosomeByGene.py b/extras/soap-tools/sliceChromosomeByGene.py index 8e24c54d9b9a9a16cdebcb75fb836e7e6a9b66bc..c4e0e4183d002d53b7e620ddef8cad700691d7ca 100755 --- a/extras/soap-tools/sliceChromosomeByGene.py +++ b/extras/soap-tools/sliceChromosomeByGene.py @@ -13,6 +13,8 @@ printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/extras/soap-tools/sp.py b/extras/soap-tools/sp.py index d395d1993195a7664d6daa3ac05c7e7f2c3476f6..a2fd0be498607c268b9ab31d5c8c60efbee6ef5c 100755 --- a/extras/soap-tools/sp.py +++ b/extras/soap-tools/sp.py @@ -11,6 +11,8 @@ # This code is in the public domain; it can be used for whatever purpose # with absolutely no restrictions. +from __future__ import unicode_literals + import sys from SOAPpy import WSDL diff --git a/extras/soap-tools/transcriptInfo.py b/extras/soap-tools/transcriptInfo.py index d25d361a94461572ebd600ac165b3513d8dea92e..bd9c14e8c5dcb0c3b3bca03e513b60f725d89566 100755 --- a/extras/soap-tools/transcriptInfo.py +++ b/extras/soap-tools/transcriptInfo.py @@ -12,6 +12,8 @@ and printed to standard output. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import sys diff --git a/migrations/script.py.mako b/migrations/script.py.mako index 95702017ea341e6455933b35f8ef5bf45f2df728..56af6fd8e141a90a81a3cf64d4f1af10eb291cf7 100644 --- a/migrations/script.py.mako +++ b/migrations/script.py.mako @@ -6,6 +6,8 @@ Create Date: ${create_date} """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. 
revision = ${repr(up_revision)} down_revision = ${repr(down_revision)} diff --git a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py index ca664e5629e625ce136b92963c91a637fd790ed5..10ed1f8be249bd96d42fd7c398cbbc3c034d87fd 100644 --- a/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py +++ b/migrations/versions/402ff01b0d5d_fix_grcm38_chromosome_accession_number_.py @@ -6,6 +6,8 @@ Create Date: 2014-10-08 15:10:21.522551 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = '402ff01b0d5d' down_revision = 'ea660b66f26' diff --git a/migrations/versions/ea660b66f26_initial_schema.py b/migrations/versions/ea660b66f26_initial_schema.py index d0d474ed4a532d1661b126aa3a83abc1170bcdd8..eec6ce6af5ee8767be03e99bda445305002394b1 100644 --- a/migrations/versions/ea660b66f26_initial_schema.py +++ b/migrations/versions/ea660b66f26_initial_schema.py @@ -6,6 +6,8 @@ Create Date: 2014-02-04 18:38:28.416032 """ +from __future__ import unicode_literals + # revision identifiers, used by Alembic. revision = 'ea660b66f26' down_revision = None diff --git a/mutalyzer/Crossmap.py b/mutalyzer/Crossmap.py index 0fb166dc9e2e0c42aef3473ba14015bf9624a726..0de7ce3aba863de574bc9d79e2278b084f417b8c 100644 --- a/mutalyzer/Crossmap.py +++ b/mutalyzer/Crossmap.py @@ -10,6 +10,8 @@ and stop and the orientation of a transcript. #Public classes: # - Crossmap ; Convert from g. to c. or n. notation or vice versa. +from __future__ import unicode_literals + class Crossmap() : """ Convert from I{g.} to I{c.} or I{n.} notation or vice versa. @@ -406,13 +408,13 @@ class Crossmap() : @type a: integer @return: The converted notation (may be unaltered) - @rtype: string + @rtype: unicode """ if a > self.__STOP : - return '*' + str(a - self.__STOP) + return '*' + unicode(a - self.__STOP) - return str(a) + return unicode(a) #int2main def main2int(self, s) : @@ -423,7 +425,7 @@ class Crossmap() : - __STOP ; CDS stop in I{c.} notation. @arg s: A string in '*' notation - @type s: string + @type s: unicode @return: The converted notation (may be unaltered) @rtype: integer @@ -447,20 +449,20 @@ class Crossmap() : @type fuzzy: bool @return: The offset in HGVS notation - @rtype: string + @rtype: unicode """ if t[1] > 0 : # The exon boundary is downstream. if fuzzy: return '+?' if t[0] >= self.__trans_end : # It is downstream of the last exon. - return "+d" + str(t[1]) - return '+' + str(t[1]) + return "+d" + unicode(t[1]) + return '+' + unicode(t[1]) #if if t[1] < 0 : # The exon boundary is uptream. if fuzzy: return '-?' if t[0] <= self.__trans_start : # It is upstream of the first exon. - return "-u" + str(-t[1]) - return str(t[1]) + return "-u" + unicode(-t[1]) + return unicode(t[1]) #if return '' # No offset was given. #int2offset @@ -472,7 +474,7 @@ class Crossmap() : sensible. 
@arg s: An offset in HGVS notation - @type s: string + @type s: unicode @return: The offset as an integer @rtype: integer @@ -505,12 +507,12 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ if t[0] >= self.__trans_end or t[0] <= self.__trans_start: - return str(self.int2main(self.__minus(t[0], -t[1]))) - return str(self.int2main(t[0])) + str(self.int2offset(t, fuzzy)) + return unicode(self.int2main(self.__minus(t[0], -t[1]))) + return unicode(self.int2main(t[0])) + unicode(self.int2offset(t, fuzzy)) #tuple2string def g2c(self, a, fuzzy=False) : @@ -525,7 +527,7 @@ class Crossmap() : @type fuzzy: bool @return: The position in HGVS notation - @rtype: string + @rtype: unicode """ return self.tuple2string(self.g2x(a), fuzzy) #g2c diff --git a/mutalyzer/File.py b/mutalyzer/File.py index b95f03012205f4ec02832d610833f39797d43a15..3de998da2d2e47c27e5c422e8ccea340d900bf97 100644 --- a/mutalyzer/File.py +++ b/mutalyzer/File.py @@ -16,6 +16,8 @@ Module for parsing CSV files and spreadsheets. # - File ; Parse CSV files and spreadsheets. +from __future__ import unicode_literals + import magic # open(), MAGIC_MIME, MAGIC_NONE import csv # Sniffer(), reader(), Error import xlrd # open_workbook() @@ -23,10 +25,7 @@ import zipfile # ZipFile() import xml.dom.minidom # parseString() import os # remove() import tempfile -import types # UnicodeType -from cStringIO import StringIO -from mutalyzer import util from mutalyzer.config import settings @@ -173,10 +172,10 @@ class File() : for i in range(sheet.nrows) : row = [] for j in sheet.row_values(i) : - if type(j) == types.UnicodeType : # Convert the data to strings. - row.append(j.encode("utf8")) - else : - row.append(str(j)) + if isinstance(j, unicode): + row.append(j) + else: + row.append(j.decode('utf-8')) #for ret.append(row) #for @@ -209,7 +208,7 @@ class File() : for j in i.getElementsByTagName("table:table-cell") : c = j.getElementsByTagName("text:p") if c : - row.append(c[0].lastChild.data.encode("utf8")) + row.append(c[0].lastChild.data) #if #for ret.append(row) @@ -346,19 +345,19 @@ class File() : @arg handle: A handle to a stream @type handle: stream - @return: The mime type of a file - @rtype: string + @return: The mime type of a file and a textual description. + @rtype: unicode, unicode """ handle.seek(0) buf = handle.read(BUFFER_SIZE) MagicInstance = magic.open(magic.MAGIC_MIME) MagicInstance.load() - mimeType = MagicInstance.buffer(buf).split(';')[0] + mimeType = MagicInstance.buffer(buf).decode('utf-8').split(';')[0] MagicInstance.close() MagicInstance = magic.open(magic.MAGIC_NONE) MagicInstance.load() - description = MagicInstance.buffer(buf) + description = MagicInstance.buffer(buf).decode('utf-8') del MagicInstance handle.seek(0) @@ -419,9 +418,9 @@ def makeList(l, maxlen=10): @arg maxlen: maximum length of the string you want to return @type maxlen: integer @return: a list converted to a string with comma's and spaces - @rtype: string + @rtype: unicode """ - ret = ", ".join(str(i) for i in l[:maxlen]) + ret = ", ".join(i for i in l[:maxlen]) if len(l)>maxlen: return ret+", ..." else: diff --git a/mutalyzer/GenRecord.py b/mutalyzer/GenRecord.py index b30ed80060bb3135f28650bb9bfae4bbf1f30b61..5a729f737270d0ef52d8acc08d9af16de5668589 100644 --- a/mutalyzer/GenRecord.py +++ b/mutalyzer/GenRecord.py @@ -15,7 +15,7 @@ search for them each time. # - GenRecord ; Convert a GenBank record to a nested dictionary. 
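# Illustrative sketch, not part of the patch: the File.py hunk above stops
# encoding spreadsheet cell values to UTF-8 byte strings and normalises them
# to unicode instead. The helper name `to_unicode` is invented for this
# sketch and does not exist in Mutalyzer.
def to_unicode(value, encoding='utf-8'):
    """Return `value` as a unicode string (Python 2)."""
    if isinstance(value, unicode):
        return value
    if isinstance(value, str):
        return value.decode(encoding)
    return unicode(value)  # e.g. floats coming out of xlrd cells

row = [u'AB026906.1:c.274G>T', 'NM_002001.2:c.1del', 3.0]
print [to_unicode(cell) for cell in row]
# prints: [u'AB026906.1:c.274G>T', u'NM_002001.2:c.1del', u'3.0']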
-import Bio +from __future__ import unicode_literals from mutalyzer import util from mutalyzer import Crossmap @@ -85,7 +85,7 @@ class Locus(object) : - CM ; A Crossmap object. @arg name: identifier of the locus - @type name: string + @type name: unicode """ self.name = name @@ -131,7 +131,7 @@ class Locus(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description: # Don't change anything if we already have an unknown value. @@ -170,7 +170,7 @@ class Gene(object) : - __locusTag ; @arg name: gene name - @type name: string + @type name: unicode """ self.name = name @@ -199,14 +199,14 @@ class Gene(object) : Find a transcript, given its name. @arg name: transcript variant number - @type name: string + @type name: unicode @return: transcript @rtype: object """ for i in self.transcriptList : - if i.name == name or i.name == str("%03i" % int(name)): + if i.name == name or i.name == "%03i" % int(name): return i return None #findLocus @@ -230,7 +230,7 @@ class Gene(object) : Look in the list of transcripts for a given protein accession number. @arg protAcc: protein accession number - @type protAcc: string + @type protAcc: unicode @return: transcript @rtype: object @@ -300,7 +300,7 @@ class Record(object) : Returns a Gene object, given its name. @arg name: Gene name - @type name: string + @type name: unicode @return: Gene object @rtype: object @@ -332,7 +332,7 @@ class Record(object) : Expands the DNA description with a new raw variant. @arg rawVariant: description of a single mutation - @type rawVariant: string + @type rawVariant: unicode """ if self.description : @@ -469,18 +469,18 @@ class GenRecord() : @arg gene: Gene @type gene: object @arg string: DNA sequence - @type string: string + @type string: unicode @kwarg string_reverse: DNA sequence to use (if not None) for the reverse complement. @return: reverse-complement (if applicable), otherwise return the original. - @rtype: string + @rtype: unicode """ if gene.orientation == -1: if string_reverse: string = string_reverse - return Bio.Seq.reverse_complement(string) + return util.reverse_complement(string) return string #__maybeInvert @@ -639,15 +639,15 @@ class GenRecord() : @arg stop_g: stop position @type stop_g: integer @arg varType: variant type - @type varType: string + @type varType: unicode @arg arg1: argument 1 of a raw variant - @type arg1: string + @type arg1: unicode @arg arg2: argument 2 of a raw variant - @type arg2: string + @type arg2: unicode @arg roll: ??? @type roll: tuple (integer, integer) @kwarg arg1_reverse: argument 1 to be used on reverse strand - @type arg1_reverse: string + @type arg1_reverse: unicode @kwarg start_fuzzy: Indicates if start position of variant is fuzzy. @type start_fuzzy: bool @kwarg stop_fuzzy: Indicates if stop position of variant is fuzzy. @@ -666,8 +666,8 @@ class GenRecord() : else: chromStart = self.record.toChromPos(stop_g) chromStop = self.record.toChromPos(start_g) - chromArg1 = Bio.Seq.reverse_complement(arg1) - chromArg2 = Bio.Seq.reverse_complement(arg2) + chromArg1 = util.reverse_complement(arg1) + chromArg2 = util.reverse_complement(arg2) # Todo: Should we use arg1_reverse here? 
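# Illustrative sketch, not part of the patch: GenRecord.py now calls the
# project-local util.reverse_complement instead of Bio.Seq.reverse_complement
# so unicode strings go in and come out. The implementation below is only a
# guess at what such a helper does, not the actual Mutalyzer code.
_COMPLEMENT = dict(zip(u'ACGTacgt', u'TGCAtgca'))

def reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence (unicode in/out)."""
    return u''.join(_COMPLEMENT.get(base, base) for base in reversed(sequence))

print reverse_complement(u'ATTGCAT')
# prints: ATGCAAT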
if roll : diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 5fa91eeb1940dcb14fd5ce7087495281a91cdfe9..deb645ae68b238def0db684d81ea3b3b87078a16 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -10,6 +10,9 @@ Public classes: """ +from __future__ import unicode_literals + +import codecs import os # path.isfile(), link() path.isdir(), path.mkdir(), # walk(), path.getsize(), path.join(), stat(), remove() import time @@ -84,10 +87,10 @@ class Retriever(object) : Convert an accession number to a filename. @arg name: The accession number - @type name: string + @type name: unicode @return: A filename - @rtype: string + @rtype: unicode """ return os.path.join(settings.CACHE_DIR, name + "." + self.fileType + ".bz2") #_nametofile @@ -99,11 +102,13 @@ class Retriever(object) : @arg raw_data: The raw_data to be compressed and written @type raw_data: string @arg filename: The intended name of the outfile - @type filename: string + @type filename: unicode @return: outfile ; The full path and name of the file written - @rtype: string + @rtype: unicode """ + # Todo: Should we write a utf-8 encoded genbank file? Not even sure + # what type `raw_data` is... # Compress the data to save disk space. comp = bz2.BZ2Compressor() data = comp.compress(raw_data) @@ -115,15 +120,16 @@ class Retriever(object) : return out_handle.name # return the full path to the file #_write + # Todo: check callers; argument should be a byte string def _calcHash(self, content) : """ Calculate the md5sum of a piece of text. @arg content: Arbitrary text - @type content: string + @type content: byte string @return: The md5sum of 'content' - @rtype: string + @rtype: unicode """ hashfunc = hashlib.md5() @@ -131,7 +137,7 @@ class Retriever(object) : md5sum = hashfunc.hexdigest() del hashfunc - return md5sum + return unicode(md5sum) #_calcHash def _newUD(self) : @@ -139,11 +145,11 @@ class Retriever(object) : Make a new UD number based on the current time (seconds since 1970). @return: A new UD number - @rtype: string + @rtype: unicode """ UD = util.generate_id() - return "UD_" + str(UD) + return "UD_" + unicode(UD) #_newUD def _updateDBmd5(self, raw_data, name, GI): @@ -159,7 +165,7 @@ class Retriever(object) : @type GI: @return: filename - @rtype: string + @rtype: unicode """ try: reference = Reference.query.filter_by(accession=name).one() @@ -191,10 +197,10 @@ class Retriever(object) : it. @arg rsId: The rsId of the SNP (example: 'rs9919552'). - @type rsId: string + @type rsId: unicode @return: A list of HGVS notations. - @rtype: list(string) + @rtype: list(unicode) """ # A simple input check. id = rs_id[2:] @@ -223,7 +229,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error connecting to dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IOError: %s' % str(e)) + 'IOError: %s' % unicode(e)) return [] try: @@ -232,7 +238,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Error reading from dbSNP.') self._output.addMessage(__file__, -1, 'INFO', - 'IncompleteRead: %s' % str(e)) + 'IncompleteRead: %s' % unicode(e)) return [] if response_text == '\n': @@ -251,7 +257,7 @@ class Retriever(object) : self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ 'error. 
Error parsing result XML.') self._output.addMessage(__file__, -1, 'INFO', - 'ExpatError: %s' % str(e)) + 'ExpatError: %s' % unicode(e)) self._output.addMessage(__file__, -1, 'INFO', 'Result from dbSNP: %s' % response_text) return [] @@ -265,7 +271,7 @@ class Retriever(object) : snps = [] for i in rs.getElementsByTagName('hgvs'): - snps.append(i.lastChild.data.encode('utf8')) + snps.append(i.lastChild.data) return snps #snpConvert @@ -286,6 +292,7 @@ class GenBankRetriever(Retriever): # Child specific init #__init__ + # todo: raw_data must always be a byte string def write(self, raw_data, filename, extract) : """ Write raw data to a file. The data is parsed before writing, if a @@ -300,7 +307,7 @@ class GenBankRetriever(Retriever): @arg raw_data: The data @type raw_data: string @arg filename: The intended name of the file. - @type filename: string + @type filename: unicode @arg extract: Flag that indicates whether to extract the record ID and GI number: - 0 ; Do not extract, use 'filename' @@ -310,7 +317,7 @@ class GenBankRetriever(Retriever): @return: tuple ; Depending on the value of 'extract': - 0 ; ('filename', None) - 1 ; (id, GI) - @rtype: tuple (string, string) + @rtype: tuple (unicode, unicode) """ if raw_data == "\nNothing has been found\n" : @@ -378,7 +385,7 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -409,7 +416,7 @@ class GenBankRetriever(Retriever): net_handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve %s.' % name) return None @@ -438,7 +445,7 @@ class GenBankRetriever(Retriever): as filename. @arg accno: The accession number of the chromosome - @type accno: string + @type accno: unicode @arg start: Start position of the slice @type start: integer @arg stop: End position of the slice. @@ -450,7 +457,7 @@ class GenBankRetriever(Retriever): @type orientation: integer @return: An UD number - @rtype: string + @rtype: unicode """ # Not a valid slice. @@ -483,7 +490,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez nuccore database: %s' % str(e)) + 'Error connecting to Entrez nuccore database: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not retrieve slice.') return None @@ -512,7 +519,7 @@ class GenBankRetriever(Retriever): #else if self.write(raw_data, reference.accession, 0): - return str(reference.accession) + return reference.accession #retrieveslice def retrievegene(self, gene, organism, upstream, downstream) : @@ -521,9 +528,9 @@ class GenBankRetriever(Retriever): slice if the gene can be found. @arg gene: Name of the gene - @type gene: string + @type gene: unicode @arg organism: The organism in which we search. - @type organism: string + @type organism: unicode @arg upstream: Number of upstream nucleotides for the slice. @type upstream: integer @arg downstream: Number of downstream nucleotides for the slice. 
@@ -549,7 +556,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esearch: %s' % str(e)) + 'Error connecting to Entrez esearch: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not search for gene %s.' % gene) return None @@ -571,7 +578,7 @@ class GenBankRetriever(Retriever): handle.close() except (IOError, urllib2.HTTPError, HTTPException) as e: self._output.addMessage(__file__, -1, 'INFO', - 'Error connecting to Entrez esummary: %s' % str(e)) + 'Error connecting to Entrez esummary: %s' % unicode(e)) self._output.addMessage(__file__, 4, 'ERETR', 'Could not get mapping information for gene %s.' % gene) return None @@ -631,10 +638,10 @@ class GenBankRetriever(Retriever): is used. @arg url: Location of a GenBank record - @type url: string + @type url: unicode @return: UD or None - @rtype: string + @rtype: unicode """ handle = urllib2.urlopen(url) info = handle.info() @@ -651,14 +658,14 @@ class GenBankRetriever(Retriever): except NoResultFound: UD = self._newUD() if not os.path.isfile(self._nametofile(UD)): - UD = self.write(raw_data, UD, 0) and str(UD) + UD = self.write(raw_data, UD, 0) and UD if UD: #Parsing went OK, add to DB reference = Reference(UD, md5sum, download_url=url) session.add(reference) session.commit() else: if not os.path.isfile(self._nametofile(reference.accession)): - UD = self.write(raw_data, reference.accession, 0) and str(reference.accession) + UD = self.write(raw_data, reference.accession, 0) and reference.accession return UD #Returns the UD or None #if @@ -682,10 +689,10 @@ class GenBankRetriever(Retriever): is used. @arg raw_data: A GenBank record - @type raw_data: string + @type raw_data: byte string - @return: - @rtype: string????? + @return: Accession number for the uploaded file. + @rtype: unicode """ md5sum = self._calcHash(raw_data) @@ -702,7 +709,7 @@ class GenBankRetriever(Retriever): if os.path.isfile(self._nametofile(reference.accession)): return reference.accession else: - return self.write(raw_data, reference.accession, 0) and str(reference.accession) + return self.write(raw_data, reference.accession, 0) and reference.accession #uploadrecord def loadrecord(self, identifier): @@ -718,7 +725,7 @@ class GenBankRetriever(Retriever): 3. Fetched from the NCBI. :arg identifier: A RefSeq accession number or geninfo identifier (GI). - :type identifier: string + :type identifier: unicode :return: A parsed RefSeq record or `None` if no record could be found for the given identifier. @@ -830,7 +837,7 @@ class LRGRetriever(Retriever): Load and parse a LRG file based on the identifier @arg identifier: The name of the LRG file to read - @type identifier: string + @type identifier: unicode @return: record ; GenRecord.Record of LRG file None ; in case of failure @@ -850,6 +857,7 @@ class LRGRetriever(Retriever): # Now we have the file, so we can parse it. file_handle = bz2.BZ2File(filename, "r") + file_handle = codecs.getreader('utf-8')(file_handle) #create GenRecord.Record from LRG file record = lrg.create_record(file_handle.read()) @@ -870,10 +878,10 @@ class LRGRetriever(Retriever): from the pending section. @arg name: The name of the LRG file to fetch - @type name: string + @type name: unicode @return: the full path to the file; None in case of an error - @rtype: string + @rtype: unicode """ prefix = settings.LRG_PREFIX_URL @@ -901,12 +909,12 @@ class LRGRetriever(Retriever): Download an LRG record from an URL. 
@arg url: Location of the LRG record - @type url: string + @type url: unicode @return: - filename ; The full path to the file - None ; in case of failure - @rtype: string + @rtype: unicode """ lrgID = name or os.path.splitext(os.path.split(url)[1])[0] @@ -914,6 +922,8 @@ class LRGRetriever(Retriever): # return None filename = self._nametofile(lrgID) + # Todo: Properly read the file contents to a unicode string and write + # it utf-8 encoded. handle = urllib2.urlopen(url) info = handle.info() if info["Content-Type"] == "application/xml" and info.has_key("Content-length"): @@ -970,12 +980,12 @@ class LRGRetriever(Retriever): @arg raw_data: The data @type raw_data: string @arg filename: The intended name of the file - @type filename: string + @type filename: unicode @return: - filename ; The full path and name of the file written - None ; In case of an error - @rtype: string + @rtype: unicode """ # Dirty way to test if a file is valid, # Parse the file to see if it's a real LRG file. diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index e6f102d3e2b23dcd8b60bc7c17a6c7f03a196ad0..ee7223a949e6b6c6e091279aa7e6322ef2169779 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -15,13 +15,14 @@ Module used to add and manage the Batch Jobs. # - Batch Syntax Checker # - Batch Position Converter +from __future__ import unicode_literals + import os # os.path.exists import smtplib # smtplib.STMP from email.mime.text import MIMEText # MIMEText from sqlalchemy import func from sqlalchemy.orm.exc import NoResultFound -import mutalyzer from mutalyzer.config import settings from mutalyzer.db import queries, session from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem @@ -88,9 +89,9 @@ class Scheduler() : @todo: Handle Connection errors in a try, except clause @arg mailTo: The batch job submitter - @type mailTo: string + @type mailTo: unicode @arg url: The url containing the results - @type url: string + @type url: unicode """ if settings.TESTING: return @@ -410,7 +411,7 @@ Mutalyzer batch scheduler""" % url) O.addMessage(__file__, 4, "EBATCHU", "Unexpected error occurred, dev-team notified") import traceback - O.addMessage(__file__, 4, "DEBUG", repr(traceback.format_exc())) + O.addMessage(__file__, 4, "DEBUG", unicode(repr(traceback.format_exc()))) #except finally : #check if we need to update the database @@ -535,11 +536,11 @@ Mutalyzer batch scheduler""" % url) - Output written to outputfile. @arg cmd: The Syntax Checker input - @type cmd: string + @type cmd: unicode @arg i: The JobID @type i: integer @arg build: The build to use for the converter - @type build: string + @type build: unicode @arg flags: Flags of the current entry @type flags: """ @@ -562,7 +563,7 @@ Mutalyzer batch scheduler""" % url) assembly = Assembly.by_name_or_alias(batch_job.argument) except NoResultFound: O.addMessage(__file__, 3, 'ENOASSEMBLY', - 'Not a valid assembly: ' + str(batch_job.argument)) + 'Not a valid assembly: ' + batch_job.argument) raise converter = Converter(assembly, O) @@ -704,7 +705,7 @@ Mutalyzer batch scheduler""" % url) Add a job to the Database and start the BatchChecker. @arg email: e-mail address of batch supplier - @type email: string + @type email: unicode @arg queue: A list of jobs @type queue: list @arg columns: The number of columns. 
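# Illustrative sketch, not part of the patch: the Retriever.py and
# Scheduler.py hunks above format caught exceptions with unicode(e) rather
# than str(e). On Python 2.7, str() on an exception carrying a non-ASCII
# unicode message raises UnicodeEncodeError, while unicode() returns it.
error = Exception(u'r\xe9seau inaccessible')  # "reseau inaccessible"
print repr(unicode(error))
# prints: u'r\xe9seau inaccessible'
try:
    str(error)
except UnicodeEncodeError as exc:
    print 'str() fails:', exc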
diff --git a/mutalyzer/__init__.py b/mutalyzer/__init__.py index e3c80aa36a8691de7128fb8a7c482cf58699bb48..6968d5ff84fa0b23b07b8e49adbd449fd6cc61e5 100644 --- a/mutalyzer/__init__.py +++ b/mutalyzer/__init__.py @@ -3,6 +3,9 @@ HGVS variant nomenclature checker. """ +from __future__ import unicode_literals + + # We follow a versioning scheme compatible with setuptools [1] where the # package version is always that of the upcoming release (and not that of the # previous release), post-fixed with ``.dev``. Only in a release commit, the diff --git a/mutalyzer/announce.py b/mutalyzer/announce.py index d8acbe4de84757bde62b0d326b5a4c0a3fc7ee4d..9adbf79109eeb06e8894b74cfdbad7d929261502 100644 --- a/mutalyzer/announce.py +++ b/mutalyzer/announce.py @@ -7,6 +7,8 @@ fast, it can be done on every website pageview without problems. """ +from __future__ import unicode_literals + from mutalyzer.redisclient import client diff --git a/mutalyzer/config/__init__.py b/mutalyzer/config/__init__.py index def4630bc53ad26234896ab56165afa73bdc3c88..462a490e1bf21d18d7bca310e732ec95bd1f2e62 100644 --- a/mutalyzer/config/__init__.py +++ b/mutalyzer/config/__init__.py @@ -12,6 +12,8 @@ be used. """ +from __future__ import unicode_literals + import collections import os diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index 43009e09e1e0142fed8f99bc67a00076c0ab9327..00dc9b2e8070f55b18bd47d9a384bcc7e9fa98cf 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -4,6 +4,9 @@ pointed-to by the `MUTALYZER_SETTINGS` environment variable. """ +from __future__ import unicode_literals + + # Use Mutalyzer in debug mode. DEBUG = False diff --git a/mutalyzer/db/__init__.py b/mutalyzer/db/__init__.py index b2192186773b542c68d1dae3884124f012ccfff4..71e8eaf5cd4eeea706873fcb4b179168e34187d7 100644 --- a/mutalyzer/db/__init__.py +++ b/mutalyzer/db/__init__.py @@ -4,6 +4,8 @@ using SQLAlchemy. """ +from __future__ import unicode_literals + import sqlalchemy from sqlalchemy.engine.url import make_url from sqlalchemy.ext.declarative import declarative_base diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 4119fa99e178b7dfbdaaa91e3cbb1352836dd3c0..faa0754c519549f71d78fbadcceb4c10586d43bf 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -3,6 +3,8 @@ Models backed by SQL using SQLAlchemy. """ +from __future__ import unicode_literals + from datetime import datetime import sqlite3 import uuid @@ -50,7 +52,7 @@ class Positions(TypeDecorator): def process_bind_param(self, value, dialect): if value is not None: - value = ','.join(str(i) for i in value) + value = ','.join(unicode(i) for i in value) return value def process_result_value(self, value, dialect): @@ -98,7 +100,7 @@ class BatchJob(db.Base): self.email = email self.download_url = download_url self.argument = argument - self.result_id = str(uuid.uuid4()) + self.result_id = unicode(uuid.uuid4()) self.added = datetime.now() def __repr__(self): diff --git a/mutalyzer/db/queries.py b/mutalyzer/db/queries.py index afdd2a44152e105976edc94db793c4ce12b764d1..7c54d137fa19e5ff0b8459a3df305ec4241c9d2e 100644 --- a/mutalyzer/db/queries.py +++ b/mutalyzer/db/queries.py @@ -7,6 +7,8 @@ Queries on database models. # the models they work with. 
+from __future__ import unicode_literals + from datetime import datetime, timedelta from sqlalchemy import and_, or_ diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py index 37fb60c238990b3cd12bcdbd9098c48ecfb6724d..d81254c39aeed1febbb7b5545ae48b82e3dfc7cb 100644 --- a/mutalyzer/describe.py +++ b/mutalyzer/describe.py @@ -7,13 +7,14 @@ leading from one sequence to an other. @requires: Bio.Seq """ +from __future__ import unicode_literals + import collections -from Bio import Seq from Bio.SeqUtils import seq3 from Bio.Data import CodonTable from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll +from mutalyzer.util import palinsnoop, roll, reverse_complement from mutalyzer import models @@ -34,9 +35,9 @@ class LCS(object): Initialise the class. @arg s1: A string. - @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @arg lcp: The length of the longest common prefix of {s1} and {s2}. @type lcp: int @arg s1_end: End of the substring in {s1}. @@ -55,21 +56,21 @@ class LCS(object): self.__s2_rc = None self.__matrix_rc = None if DNA: - self.__s2_rc = Seq.reverse_complement(s2[self.__lcp:s2_end]) + self.__s2_rc = reverse_complement(s2[self.__lcp:s2_end]) self.__matrix_rc = self.LCSMatrix(self.__s1, self.__s2_rc) #if #__init__ - def __str__(self): + def __unicode__(self): """ Return a graphical representation of the LCS matrix, mainly for debugging. @returns: A graphical representation of the LCS matrix. - @rtype: str + @rtype: unicode """ return self.visMatrix((0, len(self.__s1)), (0, len(self.__s2))) - #__str__ + #__unicode__ def visMatrix(self, r1, r2, rc=False): """ @@ -77,7 +78,7 @@ class LCS(object): debugging. @returns: A graphical representation of the LCS matrix. - @rtype: str + @rtype: unicode """ nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp @@ -91,7 +92,7 @@ class LCS(object): out = self.__delim.join(self.__delim + '-' + s2[nr2[0]:nr2[1]]) + '\n' for i in range(nr1[0], nr1[1] + 1): out += (('-' + self.__s1)[i] + self.__delim + - self.__delim.join(map(lambda x: str(M[i][x]), + self.__delim.join(map(lambda x: unicode(M[i][x]), range(nr2[0], nr2[1] + 1))) + '\n') return out @@ -102,9 +103,9 @@ class LCS(object): Calculate the Longest Common Substring matrix. @arg s1: A string. - @type s1: str + @type s1: unicode @arg s2: A string. - @type s2: str + @type s2: unicode @returns: A matrix with the LCS of {s1}[i], {s2}[j] at position i, j. @rval: list[list[int]] @@ -201,9 +202,9 @@ def __makeOverlaps(peptide): Make a list of overlapping 2-mers of {peptide} in order of appearance. @arg peptide: A peptide sequence. - @type peptide: str + @type peptide: unicode @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) + @rtype: list(unicode) """ return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) #__makeOverlaps @@ -213,13 +214,13 @@ def __options(pList, peptidePrefix, FS, output): Enumerate all peptides that could result from a frame shift. @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) + @type pList: list(unicode) @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str + @type peptidePrefix: unicode @arg FS: Frame shift table. @type FS: dict @arg output: List of peptides, should be empty initially. 
- @type output: list(str) + @type output: list(unicode) """ if not pList: output.append(peptidePrefix) @@ -234,7 +235,7 @@ def enumFS(peptide, FS): Enumerate all peptides that could result from a frame shift. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg FS: Frame shift table. @type FS: dict """ @@ -250,9 +251,9 @@ def fitFS(peptide, altPeptide, FS): {peptide}. @arg peptide: Original peptide sequence. - @type peptide: str + @type peptide: unicode @arg altPeptide: Observed peptide sequence. - @type altPeptide: str + @type altPeptide: unicode @arg FS: Frame shift table. @type FS: dict """ @@ -302,11 +303,11 @@ class DescribeRawVar(models.RawVar): @arg end_offset: @type end_offset: int @arg type: Variant type. - @type type: str + @type type: unicode @arg deleted: Deleted part of the reference sequence. - @type deleted: str + @type deleted: unicode @arg inserted: Inserted part. - @type inserted: str + @type inserted: unicode @arg shift: Amount of freedom. @type shift: int """ @@ -336,7 +337,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if not self.start: return "=" @@ -365,7 +366,7 @@ class DescribeRawVar(models.RawVar): correct description. Also see the comment in the class definition. @returns: The HGVS description of the raw variant stored in this class. - @rtype: str + @rtype: unicode """ if self.type == "unknown": return "?" @@ -491,7 +492,7 @@ def alleleDescription(allele): @type allele: list(DescribeRawVar) @returns: The HGVS description of {allele}. - @rval: str + @rval: unicode """ if len(allele) > 1: return "[%s]" % ';'.join(map(lambda x : x.hgvs, allele)) @@ -530,9 +531,9 @@ def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -682,9 +683,9 @@ def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. arg s1: Sequence 1. - type s1: str + type s1: unicode arg s2: Sequence 2. - type s2: str + type s2: unicode arg s1_start: Start of the range on {s1}. type s1_start: int arg s1_end: End of the range on {s1}. @@ -810,15 +811,15 @@ def describe(original, mutated, DNA=True): Convenience function for DNA_description(). @arg original: - @type original: str + @type original: unicode @arg mutated: - @type mutated: str + @type mutated: unicode @returns: A list of DescribeRawVar objects, representing the allele. @rval: list(DescribeRawVar) """ - s1 = str(original) - s2 = str(mutated) + s1 = original + s2 = mutated lcp = len(longest_common_prefix(s1, s2)) lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) s1_end = len(s1) - lcs diff --git a/mutalyzer/describe_c.py b/mutalyzer/describe_c.py deleted file mode 100755 index 1da86f77293e015ba2a0f53a5a3f61a3fcaeca4d..0000000000000000000000000000000000000000 --- a/mutalyzer/describe_c.py +++ /dev/null @@ -1,587 +0,0 @@ -#!/usr/bin/python - -""" -Prototype of a module that can generate a HGVS description of the variant(s) -leading from one sequence to an other. 
- -@requires: Bio.Seq -""" -import collections -from Bio import Seq -from Bio.SeqUtils import seq3 -from Bio.Data import CodonTable - -from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll -from mutalyzer import models - -from extractor import extractor - -def makeFSTables(table_id): - """ - For every pair of amino acids, calculate the set of possible amino acids in - a different reading frame. Do this for both alternative reading frames (+1 - and +2). - - @arg table_id: Coding table ID. - @type table_id: int - @returns: Two dictionaries for the two alternative reading frames. - @rtype: tuple(dict, dict) - """ - # Make the forward translation table. - table = dict(CodonTable.unambiguous_dna_by_id[table_id].forward_table) - for i in CodonTable.unambiguous_dna_by_id[table_id].stop_codons: - table[i] = '*' - - # Make the reverse translation table. - reverse_table = collections.defaultdict(list) - for i in table: - reverse_table[table[i]].append(i) - - # Make the frame shift tables. - FS1 = collections.defaultdict(set) - FS2 = collections.defaultdict(set) - for AA_i in reverse_table: - for AA_j in reverse_table: - for codon_i in reverse_table[AA_i]: - for codon_j in reverse_table[AA_j]: - FS1[AA_i + AA_j].add(table[(codon_i + codon_j)[1:4]]) # +1. - FS2[AA_i + AA_j].add(table[(codon_i + codon_j)[2:5]]) # +2. - #for - return FS1, FS2 -#makeFSTables - -def __makeOverlaps(peptide): - """ - Make a list of overlapping 2-mers of {peptide} in order of appearance. - - @arg peptide: A peptide sequence. - @type peptide: str - @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(str) - """ - return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) -#__makeOverlaps - -def __options(pList, peptidePrefix, FS, output): - """ - Enumerate all peptides that could result from a frame shift. - - @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(str) - @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: str - @arg FS: Frame shift table. - @type FS: dict - @arg output: List of peptides, should be empty initially. - @type output: list(str) - """ - if not pList: - output.append(peptidePrefix) - return - #if - for i in FS[pList[0]]: - __options(pList[1:], peptidePrefix + i, FS, output) -#__options - -def enumFS(peptide, FS): - """ - Enumerate all peptides that could result from a frame shift. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - output = [] - - __options(__makeOverlaps(peptide), "", FS, output) - return output -#enumFS - -def fitFS(peptide, altPeptide, FS): - """ - Check whether peptide {altPeptide} is a possible frame shift of peptide - {peptide}. - - @arg peptide: Original peptide sequence. - @type peptide: str - @arg altPeptide: Observed peptide sequence. - @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - """ - # Todo: This is a temporary fix to prevent crashing on frameshift - # detection (I think bug #124). - return False - - if len(peptide) < len(altPeptide): - return False - - pList = __makeOverlaps(peptide) - - for i in range(len(altPeptide)): - if not altPeptide[i] in FS[pList[i]]: - return False - return True -#fitFS - -def findFS(peptide, altPeptide, FS): - """ - Find the longest part of {altPeptide} that fits in {peptide} in a certain - frame given by {FS}. - - @arg peptide: Original peptide sequence. 
- @type peptide: str - @arg altPeptide: Observed peptide sequence. - @type altPeptide: str - @arg FS: Frame shift table. - @type FS: dict - - @returns: The length and the offset in {peptide} of the largest frameshift. - @rtype: tuple(int, int) - """ - pList = __makeOverlaps(peptide) - maxFS = 0 - fsStart = 0 - - for i in range(len(pList))[::-1]: - for j in range(min(i + 1, len(altPeptide))): - if not altPeptide[::-1][j] in FS[pList[i - j]]: - break - if j >= maxFS: - maxFS = j - fsStart = i - j + 2 - #if - #for - - return maxFS - 1, fsStart -#findFS - -class RawVar(models.RawVar): - """ - Container for a raw variant. - - To use this class correctly, do not supply more than the minimum amount of - data. The {description()} function may not work properly if too much - information is given. - - Example: if {end} is initialised for a substitution, a range will be - retuned, resulting in a description like: 100_100A>T - """ - - def __init__(self, DNA=True, start=0, start_offset=0, end=0, end_offset=0, - type="none", deleted="", inserted="", shift=0, startAA="", endAA="", - term=0): - """ - Initialise the class with the appropriate values. - - @arg start: Start position. - @type start: int - @arg start_offset: - @type start_offset: int - @arg end: End position. - @type end: int - @arg end_offset: - @type end_offset: int - @arg type: Variant type. - @type type: str - @arg deleted: Deleted part of the reference sequence. - @type deleted: str - @arg inserted: Inserted part. - @type inserted: str - @arg shift: Amount of freedom. - @type shift: int - """ - # TODO: Will this container be used for all variants, or only genomic? - # start_offset and end_offset may be never used. - self.DNA = DNA - self.start = start - self.start_offset = start_offset - self.end = end - self.end_offset = end_offset - self.type = type - self.deleted = deleted - self.inserted = inserted - self.shift = shift - self.startAA = startAA - self.endAA = endAA - self.term = term - self.update() - #self.hgvs = self.description() - #self.hgvsLength = self.descriptionLength() - #__init__ - - def __DNADescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if not self.start: - return "=" - - descr = "%i" % self.start - - if self.end: - descr += "_%i" % self.end - - if self.type != "subst": - descr += "%s" % self.type - - if self.inserted: - return descr + "%s" % self.inserted - return descr - #if - - return descr + "%s>%s" % (self.deleted, self.inserted) - #__DNADescription - - def __proteinDescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: str - """ - if self.type == "unknown": - return "?" 
- if not self.start: - return "=" - - descr = "" - if not self.deleted: - if self.type == "ext": - descr += '*' - else: - descr += "%s" % seq3(self.startAA) - #if - else: - descr += "%s" % seq3(self.deleted) - descr += "%i" % self.start - if self.end: - descr += "_%s%i" % (seq3(self.endAA), self.end) - if self.type not in ["subst", "stop", "ext", "fs"]: # fs is not a type - descr += self.type - if self.inserted: - descr += "%s" % seq3(self.inserted) - - if self.type == "stop": - return descr + '*' - if self.term: - return descr + "fs*%i" % self.term - return descr - #__proteinDescription - - def __DNADescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # `=' or `?' - return 1 - - descrLen = 1 # Start position. - - if self.end: # '_' and end position. - descrLen += 2 - - if self.type != "subst": - descrLen += len(self.type) - - if self.inserted: - return descrLen + len(self.inserted) - return descrLen - #if - - return 4 # Start position, '>' and end position. - #__DNAdescriptionLength - - def __proteinDescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # = - return 1 - - descrLen = 1 # Start position. - if not self.deleted and self.type == "ext": - descrLen += 1 # * - else: - descrLen += 3 # One amino acid. - if self.end: - descrLen += 5 # `_' + one amino acid + end position. - if self.type not in ["subst", "stop", "ext", "fs"]: - descrLen += len(self.type) - if self.inserted: - descrLen += 3 * len(self.inserted) - if self.type == "stop": - return descrLen + 1 # * - if self.term: - return descrLen + len(self.type) + 2 # `*' + length until stop. - return descrLen - #__proteinDescriptionLength - - def update(self): - """ - """ - self.hgvs = self.description() - self.hgvsLength = self.descriptionLength() - #update - - def description(self): - """ - """ - if self.DNA: - return self.__DNADescription() - return self.__proteinDescription() - #description - - def descriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if self.DNA: - return self.__DNADescriptionLength() - return self.__proteinDescriptionLength() - #descriptionLength -#RawVar - -def alleleDescription(allele): - """ - Convert a list of raw variants to an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The HGVS description of {allele}. - @rval: str - """ - if len(allele) > 1: - return "[%s]" % ';'.join(map(lambda x: x.hgvs, allele)) - return allele[0].hgvs -#alleleDescription - -def alleleDescriptionLength(allele): - """ - Calculate the standardised length of an HGVS allele description. 
- - @arg allele: A list of raw variants representing an allele description. - @type allele: list(RawVar) - - @returns: The standardised length of the HGVS description of {allele}. - @rval: int - """ - # NOTE: Do we need to count the ; and [] ? - return sum(map(lambda x: x.hgvsLength, allele)) -#alleleDescriptionLength - -def printpos(s, start, end, fill=0): - """ - For debugging purposes. - """ - # TODO: See if this can partially replace or be merged with the - # visualisation in the __mutate() function of mutator.py - fs = 10 # Flank size. - - return "%s %s%s %s" % (s[start - fs:start], s[start:end], '-' * fill, - s[end:end + fs]) -#printpos - -def var2RawVar(s1, s2, var, DNA=True): - """ - """ - # Unknown. - if s1 == '?' or s2 == '?': - return [RawVar(DNA=DNA, type="unknown")] - - # Insertion / Duplication. - if var.reference_start == var.reference_end: - ins_length = var.sample_end - var.sample_start - shift5, shift3 = roll(s2, var.sample_start + 1, var.sample_end) - shift = shift5 + shift3 - - var.reference_start += shift3 - var.reference_end += shift3 - var.sample_start += shift3 - var.sample_end += shift3 - - if (var.sample_start - ins_length >= 0 and - s1[var.reference_start - ins_length:var.reference_start] == - s2[var.sample_start:var.sample_end]): - - if ins_length == 1: - return RawVar(DNA=DNA, start=var.reference_start, type="dup", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start - ins_length + 1, - end=var.reference_end, type="dup", shift=shift) - #if - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="ins", - shift=shift) - #if - - # Deletion. - if var.sample_start == var.sample_end: - shift5, shift3 = roll(s1, var.reference_start + 1, var.reference_end) - shift = shift5 + shift3 - - var.reference_start += shift3 + 1 - var.reference_end += shift3 - - if var.reference_start == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start, type="del", - shift=shift) - return RawVar(DNA=DNA, start=var.reference_start, - end=var.reference_end, type="del", shift=shift) - #if - - # Substitution. - if (var.reference_start + 1 == var.reference_end and - var.sample_start + 1 == var.sample_end): - - return RawVar(DNA=DNA, start=var.reference_start + 1, - deleted=s1[var.reference_start], inserted=s2[var.sample_start], - type="subst") - #if - - # Simple InDel. - if var.reference_start + 1 == var.reference_end: - return RawVar(DNA=DNA, start=var.reference_start + 1, - inserted=s2[var.sample_start:var.sample_end], type="delins") - - # Inversion. - if var.type == extractor.VARIANT_REVERSE_COMPLEMENT: - trim = palinsnoop(s1[var.reference_start:var.reference_end]) - - if trim > 0: # Partial palindrome. - var.reference_end -= trim - var.sample_end -= trim - #if - - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, type="inv") - #if - - # InDel. - return RawVar(DNA=DNA, start=var.reference_start + 1, - end=var.reference_end, inserted=s2[var.sample_start:var.sample_end], - type="delins") -#var2RawVar - -def description(s1, s2, DNA=True): - """ - Give an allele description of the change from {s1} to {s2}. - - arg s1: Sequence 1. - type s1: str - arg s2: Sequence 2. - type s2: str - - @returns: A list of RawVar objects, representing the allele. 
- @rval: list(RawVar) - """ - description = [] - - if not DNA: - FS1, FS2 = makeFSTables(1) - longestFSf = max(findFS(s1, s2, FS1), findFS(s1, s2, FS2)) - longestFSr = max(findFS(s2, s1, FS1), findFS(s2, s1, FS2)) - - if longestFSf > longestFSr: - print s1[:longestFSf[1]], s1[longestFSf[1]:] - print s2[:len(s2) - longestFSf[0]], s2[len(s2) - longestFSf[0]:] - s1_part = s1[:longestFSf[1]] - s2_part = s2[:len(s2) - longestFSf[0]] - term = longestFSf[0] - #if - else: - print s1[:len(s1) - longestFSr[0]], s1[len(s1) - longestFSr[0]:] - print s2[:longestFSr[1]], s2[longestFSr[1]:] - s1_part = s1[:len(s1) - longestFSr[0]] - s2_part = s2[:longestFSr[1]] - term = len(s2) - longestFSr[1] - #else - - s1_part = s1 - s2_part = s2 - for variant in extractor.extract(str(s1_part), len(s1_part), - str(s2_part), len(s2_part), 1): - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - if description: - description[-1].term = term + 2 - description[-1].update() - #if - #if - else: - for variant in extractor.extract(str(s1), len(s1), str(s2), len(s2), - 0): - if variant.type != extractor.VARIANT_IDENTITY: - description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - - # Nothing happened. - if not description: - return [RawVar(DNA=DNA)] - - return description -#description - -if __name__ == "__main__": - a = "ATAGATGATAGATAGATAGAT" - b = "ATAGATGATTGATAGATAGAT" - print alleleDescription(description(a, b, DNA=True)) - - a = "MAVLWRLSAVCGALGGRALLLRTPVVRPAH" - b = "MAVLWRLSAGCGALGGRALLLRTPVVRAH" - print alleleDescription(description(a, b, DNA=False)) - - a = "MDYSLAAALTLHGHWGLGQVVTDYVHGDALQKAAKAGLLALSALTFAGLCYFNYHDVGICKAVAMLWKL" - b = "MDYSLAAALTFMVTGALDKLLLTMFMGMPCRKLPRQGFWHFQL" - #print alleleDescription(description(a, b, DNA=False)) - #print alleleDescription(description(b, a, DNA=False)) - print "1" - extractor.extract(a, len(a), b, len(b), 1) - print "2" - extractor.extract(b, len(b), a, len(a), 1) - print "3" - - - a = "VVSVLLLGLLPAAYLNPCSAMYYSLAAALTLHGHWGLGQV" - b = "VVSVLLLGLLPAAYLNPCSAMDYSLAAALTLHGHWGLGQV" - print alleleDescription(description(a, b, DNA=False)) - print alleleDescription(description(b, a, DNA=False)) - - a = "ACGCTCGATCGCTTATAGCATGGGGGGGGGATCTAGCTCTCTCTATAAGATA" - b = "ACGCTCGATCGCTTATACCCCCCCCATGCGATCTAGCTCTCTCTATAAGATA" - print alleleDescription(description(a, b, DNA=True)) - -#if diff --git a/mutalyzer/entrypoints/__init__.py b/mutalyzer/entrypoints/__init__.py index 36b5ad16a25f2e75f11765e052dd8099697ddb13..5c6d2cf615d3f891a1404e3ff3326f0424928f0d 100644 --- a/mutalyzer/entrypoints/__init__.py +++ b/mutalyzer/entrypoints/__init__.py @@ -3,6 +3,11 @@ Entry points to Mutalyzer. """ +from __future__ import unicode_literals + +import sys + + class _ReverseProxied(object): """ Wrap the application in this middleware and configure the front-end server @@ -36,3 +41,13 @@ class _ReverseProxied(object): if scheme: environ['wsgi.url_scheme'] = scheme return self.app(environ, *args, **kwargs) + + +def _cli_string(argument): + """ + Decode a command line argument byte string to unicode using our best + guess for the encoding (noop on unicode strings). + """ + if isinstance(argument, unicode): + return argument + return unicode(argument, encoding=sys.stdin.encoding) diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py index 42929e6bb31c974149f11b12a7ba39680eb9c5ae..9b06920d96e34194303169ab793553cd2d38ce7d 100644 --- a/mutalyzer/entrypoints/admin.py +++ b/mutalyzer/entrypoints/admin.py @@ -3,16 +3,19 @@ Command line interface to Mutalyzer administrative tools. 
""" +from __future__ import unicode_literals + import argparse import json import os +import sys import alembic.command import alembic.config from alembic.migration import MigrationContext -from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import NoResultFound +from . import _cli_string from .. import announce from .. import db from ..db import session @@ -96,7 +99,7 @@ def import_mapview(assembly_name_or_alias, mapview_file, group_label): try: mapping.import_from_mapview_file(assembly, mapview_file, group_label) except mapping.MapviewSortError as e: - raise UserError(str(e)) + raise UserError(unicode(e)) def import_gene(assembly_name_or_alias, gene): @@ -184,8 +187,9 @@ def main(): """ assembly_parser = argparse.ArgumentParser(add_help=False) assembly_parser.add_argument( - '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias', - default='hg19', help='assembly to import to (default: hg19)') + '-a', '--assembly', metavar='ASSEMBLY', type=_cli_string, + dest='assembly_name_or_alias', default='hg19', + help='assembly to import to (default: hg19)') parser = argparse.ArgumentParser( description='Mutalyzer administrative tools.') @@ -227,7 +231,7 @@ def main(): 'mapview_file', metavar='FILE', type=argparse.FileType('r'), help='file from NCBI mapview (example: seq_gene.md), see note below') p.add_argument( - 'group_label', metavar='GROUP_LABEL', + 'group_label', metavar='GROUP_LABEL', type=_cli_string, help='use only entries with this group label (example: ' 'GRCh37.p2-Primary Assembly)') @@ -241,7 +245,7 @@ def main(): ' (i.e., NCBI mapview).') p.set_defaults(func=import_gene) p.add_argument( - 'gene', metavar='GENE_SYMBOL', + 'gene', metavar='GENE_SYMBOL', type=_cli_string, help='gene to import all transcript mappings for from the UCSC ' 'database (example: TTN)') @@ -255,7 +259,7 @@ def main(): 'usual source (i.e., NCBI mapview).') p.set_defaults(func=import_reference) p.add_argument( - 'reference', metavar='ACCESSION', + 'reference', metavar='ACCESSION', type=_cli_string, help='genomic reference to import all genes from (example: ' 'NC_012920.1)') @@ -272,10 +276,10 @@ def main(): description=set_announcement.__doc__.split('\n\n')[0]) p.set_defaults(func=set_announcement) p.add_argument( - 'body', metavar='ANNOUNCEMENT', + 'body', metavar='ANNOUNCEMENT', type=_cli_string, help='announcement text to show to the user') p.add_argument( - '--url', metavar='URL', dest='url', + '--url', metavar='URL', dest='url', type=_cli_string, help='URL to more information on the announcement') # Subparser 'announcement unset'. 
@@ -290,10 +294,10 @@ def main(): description=sync_cache.__doc__.split('\n\n')[0], epilog='Intended use is to run daily from cron.') p.add_argument( - 'wsdl_url', metavar='WSDL_URL', + 'wsdl_url', metavar='WSDL_URL', type=_cli_string, help='location of the remote WSDL description') p.add_argument( - 'url_template', metavar='URL_TEMPLATE', + 'url_template', metavar='URL_TEMPLATE', type=_cli_string, help='URL for remote downloads, in which the filename is to be ' 'substituted for {file}') p.add_argument( @@ -313,7 +317,7 @@ def main(): '--destructive', dest='destructive', action='store_true', help='delete any existing tables and data') p.add_argument( - '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', + '-c', '--alembic-config', metavar='ALEMBIC_CONFIG', type=_cli_string, dest='alembic_config_path', help='path to Alembic configuration file') p.set_defaults(func=setup_database) @@ -323,7 +327,7 @@ def main(): args.func(**{k: v for k, v in vars(args).items() if k not in ('func', 'subcommand')}) except UserError as e: - parser.error(str(e)) + parser.error(unicode(e)) if __name__ == '__main__': diff --git a/mutalyzer/entrypoints/batch_processor.py b/mutalyzer/entrypoints/batch_processor.py index 286c411609642515e8ff6e3308e759b4234b0b92..ae3c2945748db1a3b286690e8df52dedbe603c13 100644 --- a/mutalyzer/entrypoints/batch_processor.py +++ b/mutalyzer/entrypoints/batch_processor.py @@ -6,12 +6,13 @@ Mutalyzer batch processor. """ +from __future__ import unicode_literals + import argparse import signal import sys import time -from .. import config from .. import db from .. import Scheduler diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py index d123482fbe92d6ffa0f0277dcfd2847d877ecbab..6717161d1d4795c923f70cfa6846358ace2972c8 100644 --- a/mutalyzer/entrypoints/mutalyzer.py +++ b/mutalyzer/entrypoints/mutalyzer.py @@ -5,8 +5,12 @@ Mutalyzer command-line name checker. """ +from __future__ import unicode_literals + import argparse +import sys +from . import _cli_string from .. import describe from .. import output from .. import variantchecker @@ -114,7 +118,7 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer command-line name checker.') parser.add_argument( - 'description', metavar='DESCRIPTION', + 'description', metavar='DESCRIPTION', type=_cli_string, help='variant description to run the name checker on') args = parser.parse_args() diff --git a/mutalyzer/entrypoints/service_json.py b/mutalyzer/entrypoints/service_json.py index 25ff8bbfc1d7d3a01a49a375a6caba90b846aed3..5e5d93d01a2e7d48d3acbf4e0014ac2f4ddde60f 100644 --- a/mutalyzer/entrypoints/service_json.py +++ b/mutalyzer/entrypoints/service_json.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . 
import _cli_string, _ReverseProxied from ..config import settings from ..services import json @@ -57,9 +59,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer HTTP/RPC+JSON webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8082, help='port to listen on (default: 8082)') diff --git a/mutalyzer/entrypoints/service_soap.py b/mutalyzer/entrypoints/service_soap.py index 6b630ad6aa1bc885995e099bca91553260854b0d..8179faa358e7e109442cef799b1ffc2f8e4c0128 100644 --- a/mutalyzer/entrypoints/service_soap.py +++ b/mutalyzer/entrypoints/service_soap.py @@ -18,6 +18,8 @@ You can also use the built-in HTTP server by running this file directly. """ +from __future__ import unicode_literals + import argparse import logging import sys @@ -25,7 +27,7 @@ import sys from wsgiref.simple_server import make_server from spyne.server.wsgi import WsgiApplication -from . import _ReverseProxied +from . import _cli_string, _ReverseProxied from ..config import settings from ..services import soap @@ -58,9 +60,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer SOAP webservice.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8081, help='port to listen on (default: 8081)') diff --git a/mutalyzer/entrypoints/website.py b/mutalyzer/entrypoints/website.py index a62e3bb332322312191d4f8eff800d711608037b..f387b70ff4cb5f4a315ca4de9a87c9e0d0033b5a 100644 --- a/mutalyzer/entrypoints/website.py +++ b/mutalyzer/entrypoints/website.py @@ -39,9 +39,12 @@ also serve the static files. """ +from __future__ import unicode_literals + import argparse +import sys -from . import _ReverseProxied +from . import _cli_string, _ReverseProxied from ..config import settings from .. import website @@ -66,9 +69,9 @@ def main(): parser = argparse.ArgumentParser( description='Mutalyzer website.') parser.add_argument( - '-H', '--host', metavar='HOSTNAME', dest='host', default='127.0.0.1', - help='hostname to listen on (default: 127.0.0.1; specify 0.0.0.0 to ' - 'listen on all hostnames)') + '-H', '--host', metavar='HOSTNAME', type=_cli_string, dest='host', + default='127.0.0.1', help='hostname to listen on (default: ' + '127.0.0.1; specify 0.0.0.0 to listen on all hostnames)') parser.add_argument( '-p', '--port', metavar='PORT', dest='port', type=int, default=8089, help='port to listen on (default: 8080)') diff --git a/mutalyzer/grammar.py b/mutalyzer/grammar.py index 0e65ec574822f0182ee4d1eb0abe6accb548161d..8f231bf57cee26ed032a21dbfeb4cb3f7d83f1ce 100644 --- a/mutalyzer/grammar.py +++ b/mutalyzer/grammar.py @@ -19,6 +19,8 @@ The grammar is described in [3]. 
""" +from __future__ import unicode_literals + from pyparsing import * @@ -48,7 +50,7 @@ class Grammar(): ########################################################################## # BNF: Name -> ([a-z] | [a-Z] | [0-9])+ - Name = Word(alphanums, min=1) + Name = Word(unicode(alphanums), min=1) # BNF: Nt -> `a' | `c' | `g' | `u' | `A' | `C' | `G' | `T' | `U' #Nt = Word('acgtuACGTU', exact=1) @@ -66,7 +68,7 @@ class Grammar(): NtString = Combine(OneOrMore(Nt)) # BNF: Number -> [0-9]+ - Number = Word(nums) + Number = Word(unicode(nums)) ########################################################################## # Reference sequences @@ -79,7 +81,7 @@ class Grammar(): ProtIso = Suppress('_i') + Number('ProtIso') # BNF: GeneName -> ([a-Z] | [0-9] | `-')+ - GeneName = Word(alphanums + '-', min=1) + GeneName = Word(unicode(alphanums) + '-', min=1) # BNF: GeneSymbol -> `(' Name (TransVar | ProtIso)? `)' GeneSymbol = Suppress('(') + Group(GeneName('GeneSymbol') + \ @@ -94,11 +96,11 @@ class Grammar(): # BNF: AccNo -> ([a-Z] Number `_')+ Version? AccNo = NotAny('LRG_') + \ - Combine(Word(alphas + '_') + Number)('RefSeqAcc') + \ + Combine(Word(unicode(alphas) + '_') + Number)('RefSeqAcc') + \ Optional(Version) # BNF: UD -> `UD_' [a-Z]+ (`_' Number)+ - UD = Combine('UD_' + Word(alphas) + OneOrMore('_' + Number))('RefSeqAcc') + UD = Combine('UD_' + Word(unicode(alphas)) + OneOrMore('_' + Number))('RefSeqAcc') # BNF: LRGTranscriptID -> `t' [0-9]+ LRGTranscriptID = Suppress('t') + Number('LRGTranscriptID') @@ -467,7 +469,7 @@ class Grammar(): the input where the error occurred (and return None). @arg variant: The input string that needs to be parsed. - @type variant: string + @type variant: unicode @return: The parse tree containing the parse results, or None in case of a parsing error. @@ -480,12 +482,12 @@ class Grammar(): return self.Var.parseString(variant, parseAll=True) # Todo: check .dump() except ParseException as err: - print err.line - print " "*(err.column-1) + "^" - print err + #print err.line + #print " "*(err.column-1) + "^" + #print err # Log parse error and the position where it occurred. - self._output.addMessage(__file__, 4, 'EPARSE', str(err)) - pos = int(str(err).split(':')[-1][:-1]) - 1 + self._output.addMessage(__file__, 4, 'EPARSE', unicode(err)) + pos = int(unicode(err).split(':')[-1][:-1]) - 1 self._output.addOutput('parseError', variant) self._output.addOutput('parseError', pos * ' ' + '^') return None diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 693294d31b5a2a06319c24566d3e98259657882d..e5bd96db14324e8cc9de6f7df6c509f73ae03812 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -10,11 +10,12 @@ update the database with this information. """ +from __future__ import unicode_literals + from collections import defaultdict from itertools import groupby from operator import attrgetter, itemgetter -from Bio.Seq import reverse_complement import MySQLdb from mutalyzer.db import session @@ -24,6 +25,7 @@ from mutalyzer.models import SoapMessage, Mapping, Transcript from mutalyzer.output import Output from mutalyzer import Crossmap from mutalyzer import Retriever +from mutalyzer import util class MapviewSortError(Exception): @@ -40,28 +42,29 @@ def _construct_change(var, reverse=False): @type reverse: bool @return: Description of mutation (without reference and positions). - @rtype: string + @rtype: unicode """ + # Note that the pyparsing parse tree yields `str('')` for nonexisting + # attributes, so we wrap the optional attributes in `unicode()`. 
if reverse: - # todo: if var.Arg1 is unicode, this crashes try: - arg1 = str(int(var.Arg1)) + arg1 = unicode(int(var.Arg1)) except ValueError: - arg1 = reverse_complement(str(var.Arg1) or '') + arg1 = util.reverse_complement(unicode(var.Arg1)) try: - arg2 = str(int(var.Arg2)) + arg2 = unicode(int(var.Arg2)) except ValueError: - arg2 = reverse_complement(str(var.Arg2) or '') + arg2 = util.reverse_complement(unicode(var.Arg2)) else: - arg1 = var.Arg1 - arg2 = var.Arg2 + arg1 = unicode(var.Arg1) + arg2 = unicode(var.Arg2) def parse_sequence(seq): if not seq.Sequence: raise NotImplementedError('Only explicit sequences are supported ' 'for insertions.') if reverse: - return reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if var.MutationType == 'subst': @@ -72,7 +75,7 @@ def _construct_change(var, reverse=False): seqs = reversed(var.SeqList) else: seqs = var.SeqList - insertion = '[' + ';'.join(str(parse_sequence(seq)) + insertion = '[' + ';'.join(parse_sequence(seq) for seq in seqs) + ']' else: insertion = parse_sequence(var.Seq) @@ -161,11 +164,11 @@ class Converter(object) : Get data from database. @arg acc: NM_ accession number (without version) - @type acc: string + @type acc: unicode @arg version: version number @type version: integer @kwarg selector: Optional gene symbol selector. - @type selector: str + @type selector: unicode @kwarg selector_version: Optional transcript version selector. @type selector_version: int """ @@ -269,7 +272,7 @@ class Converter(object) : @arg Loc: A location in either I{g.} or I{c.} notation @type Loc: object @arg Type: The reference type - @type Type: string + @type Type: unicode @returns: triple: 0. Main coordinate in I{c.} notation 1. Offset coordinate in I{c.} notation @@ -359,7 +362,7 @@ class Converter(object) : available. @arg accNo: transcript (NM_) accession number (with or without version) - @type accNo: string + @type accNo: unicode @return: transcription start, transcription end and CDS stop @rtype: triple @@ -381,7 +384,7 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: The full NM accession number (including version) - @type accNo: string + @type accNo: unicode @return: T ; ClassSerializer object with the types trans_start, trans_stop and CDS_stop @@ -404,9 +407,9 @@ class Converter(object) : One of the entry points (called by the HTML publisher). @arg accNo: transcript (NM_) accession number (with version?) - @type accNo: string + @type accNo: unicode @arg mutation: the 'mutation' (e.g. c.123C>T) - @type mutation: string + @type mutation: unicode @return: ClassSerializer object @rtype: object @@ -493,10 +496,10 @@ class Converter(object) : Converts a complete HGVS I{c.} notation into a chromosomal notation. @arg variant: The variant in HGVS I{c.} notation - @type variant: string + @type variant: unicode @return: var_in_g ; The variant in HGVS I{g.} notation - @rtype: string + @rtype: unicode """ if self._parseInput(variant): acc = self.parseTree.RefSeqAcc @@ -528,7 +531,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 3, 'ENOTIMPLEMENTED', - str(e)) + unicode(e)) return None if self.mapping.orientation == 'forward': @@ -568,14 +571,14 @@ class Converter(object) : @arg positions: Positions in c. notation to convert. @type positions: list @arg reference: Transcript reference. 
- @type reference: string + @type reference: unicode @kwarg version: Transcript reference version. If omitted, '0' is assumed. - @type version: string + @type version: unicode @return: Chromosome name, orientation (+ or -), and converted positions. - @rtype: tuple(string, string, list) + @rtype: tuple(unicode, unicode, list) This only works for positions on transcript references in c. notation. """ @@ -617,10 +620,10 @@ class Converter(object) : def correctChrVariant(self, variant) : """ @arg variant: - @type variant: string + @type variant: unicode @return: variant ; - @rtype: string + @rtype: unicode """ #Pre split check @@ -651,12 +654,12 @@ class Converter(object) : def chrom2c(self, variant, rt, gene=None): """ @arg variant: a variant description - @type variant: string + @type variant: unicode @arg rt: the return type - @type rt: string + @type rt: unicode @kwarg gene: Optional gene name. If given, return variant descriptions on all transcripts for this gene. - @type gene: string + @type gene: unicode @return: HGVS_notatations ; @rtype: dictionary or list @@ -751,7 +754,7 @@ class Converter(object) : r_change = _construct_change(variant, reverse=True) except NotImplementedError as e: self.__output.addMessage(__file__, 4, - "ENOTIMPLEMENTEDERROR", str(e)) + "ENOTIMPLEMENTEDERROR", unicode(e)) return None startp = self.crossmap.tuple2string((cmap.startmain, cmap.startoffset)) @@ -786,6 +789,8 @@ class Converter(object) : #Converter +# Todo: This seems broken at the moment. +# Todo: Correct handling of string encodings. def import_from_ucsc_by_gene(assembly, gene): """ Import transcript mappings for a gene from the UCSC. @@ -878,6 +883,7 @@ def import_from_reference(assembly, reference): session.commit() +# Todo: File must be opened with the correct encoding. def import_from_mapview_file(assembly, mapview_file, group_label): """ Import transcript mappings from an NCBI mapview file. diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 24a340fed34dd9a565e3d63c22a23d418607ac1e..bc9bf5a0780a382af267b3973b17b017c6a8ff77 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -8,6 +8,8 @@ from the Spyne model classes. """ +from __future__ import unicode_literals + from spyne.model.primitive import Integer, Boolean, DateTime, Unicode from spyne.model.binary import ByteArray from spyne.model.complex import ComplexModel, Array diff --git a/mutalyzer/mutator.py b/mutalyzer/mutator.py index 8047d932d4bab1ca4fa66b2020e2d69428d97853..4a4b0a2d157460e9ebc5cebbde89f0111090492f 100644 --- a/mutalyzer/mutator.py +++ b/mutalyzer/mutator.py @@ -12,12 +12,11 @@ The original as well as the mutated string are stored here. """ +from __future__ import unicode_literals + from collections import defaultdict from Bio import Restriction -from Bio.Seq import Seq -from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA -from Bio.Seq import reverse_complement from mutalyzer import util @@ -46,7 +45,7 @@ class Mutator(): Initialise the instance with the original sequence. @arg orig: The original sequence before mutation. - @type orig: str + @type orig: Bio.Seq.Seq @arg output: The output object. @type output: mutalyzer.Output.Output """ @@ -57,6 +56,8 @@ class Mutator(): self._output = output self.orig = orig + # Note that we don't need to create a copy here, since mutation + # operations are not in place (`self._mutate`). 
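The comment just added to `Mutator.__init__` states the invariant that makes the sharing safe: `_mutate` always rebuilds the mutated sequence by slicing and concatenation, so `orig` is never modified through `mutated`. In miniature, with a plain string standing in for the `Bio.Seq.Seq` object:

    # Sketch of the non-in-place update pattern used by Mutator._mutate.
    from __future__ import unicode_literals

    orig = 'ATCGATCG'
    mutated = orig                            # shared, as in Mutator.__init__

    # A delins-style change: replace interbase positions 2..4 with 'TT'.
    mutated = mutated[:2] + 'TT' + mutated[4:]

    assert orig == 'ATCGATCG'                 # the original is untouched
    assert mutated == 'ATTTATCG'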
self.mutated = orig #__init__ @@ -72,7 +73,7 @@ class Mutator(): @rtype: dict """ analysis = Restriction.Analysis(self._restriction_batch, sequence) - return dict((str(k), len(v)) for k, v in analysis.with_sites().items()) + return dict((unicode(k), len(v)) for k, v in analysis.with_sites().items()) #_restriction_count def _counts_diff(self, counts1, counts2): @@ -109,10 +110,10 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode @return: Visualisation. - @rtype: str + @rtype: unicode """ loflank = self.orig[max(pos1 - VIS_FLANK_LENGTH, 0):pos1] roflank = self.orig[pos2:pos2 + VIS_FLANK_LENGTH] @@ -338,7 +339,7 @@ class Mutator(): @arg pos2: Second interbase position of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ correct = 1 if pos1 == pos2 else 0 self.mutated = (self.mutated[:self.shift(pos1 + 1) - 1] + @@ -375,7 +376,7 @@ class Mutator(): @arg pos: Interbase position where the insertion should take place. @type pos: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['insertion between %i and %i' % (pos, pos + 1)] visualisation.extend(self._visualise(pos, pos, ins)) @@ -394,7 +395,7 @@ class Mutator(): @arg pos2: Last nucleotide of the deleted sequence. @type pos2: int @arg ins: Inserted sequence. - @type ins: str + @type ins: unicode """ visualisation = ['delins from %i to %i' % (pos1, pos2)] visualisation.extend(self._visualise(pos1 - 1, pos2, ins)) @@ -410,7 +411,7 @@ class Mutator(): @arg pos: Position of the substitution. @type pos: int @arg nuc: Substituted nucleotide. - @type nuc: str + @type nuc: unicode """ visualisation = ['substitution at %i' % pos] visualisation.extend(self._visualise(pos - 1, pos, nuc)) @@ -428,14 +429,13 @@ class Mutator(): @arg pos2: Last nucleotide of the inverted sequence. @type pos2: int """ + sequence = util.reverse_complement(unicode(self.orig[pos1 - 1:pos2])) + visualisation = ['inversion between %i and %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2]))) + visualisation.extend(self._visualise(pos1 - 1, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos2, - reverse_complement(self.orig[pos1 - 1:pos2])) + self._mutate(pos1 - 1, pos2, sequence) #inversion def duplication(self, pos1, pos2): @@ -447,11 +447,12 @@ class Mutator(): @arg pos2: Last nucleotide of the duplicated sequence. @type pos2: int """ + sequence = unicode(self.orig[pos1 - 1:pos2]) + visualisation = ['duplication from %i to %i' % (pos1, pos2)] - visualisation.extend( - self._visualise(pos2, pos2, self.orig[pos1 - 1:pos2])) + visualisation.extend(self._visualise(pos2, pos2, sequence)) self._output.addOutput('visualisation', visualisation) - self._mutate(pos1 - 1, pos1 - 1, self.orig[pos1 - 1:pos2]) + self._mutate(pos1 - 1, pos1 - 1, sequence) #duplication #Mutator diff --git a/mutalyzer/output.py b/mutalyzer/output.py index 3ca1c8a71d8a998463262074fd0fae17f8a5c84c..fbec8418274798c367ba9153a438096284f974ae 100644 --- a/mutalyzer/output.py +++ b/mutalyzer/output.py @@ -23,6 +23,9 @@ Public classes: """ +from __future__ import unicode_literals + +import io import time from mutalyzer import util @@ -71,12 +74,13 @@ class Output() : - _warnings ; Initialised to 0. 
@arg instance: The filename of the module that created this object - @type instance: string + @type instance: unicode """ self._outputData = {} self._messages = [] self._instance = util.nice_filename(instance) - self._loghandle = open(settings.LOG_FILE, "a+") + self._loghandle = io.open(settings.LOG_FILE, mode='a+', + encoding='utf-8') self._errors = 0 self._warnings = 0 #__init__ @@ -147,7 +151,7 @@ class Output() : - _messages ; The messages list. @arg errorcode: The error code to filter on - @type errorcode: string + @type errorcode: unicode @return: A filtered list @rtype: list @@ -194,7 +198,7 @@ class Output() : - _outputData ; The output dictionary. @arg name: Name of a node in the output dictionary - @type name: string + @type name: unicode @arg data: The data to be stored at this node @type data: object """ @@ -258,7 +262,7 @@ class Output() : - Number of errors - Number of warnings - Summary - @rtype: integer, integer, string + @rtype: integer, integer, unicode """ e_s = 's' w_s = 's' @@ -297,13 +301,13 @@ class Message() : - description ; A description of the message. @arg origin: Name of the module creating this object - @type origin: string + @type origin: unicode @arg level: Importance of the message @type level: integer @arg code: The error code of the message - @type code: string + @type code: unicode @arg description: A description of the message - @type description: string + @type description: unicode """ self.origin = origin self.level = level @@ -316,17 +320,17 @@ class Message() : (self.origin, self.level, self.code, self.description) #__repr__ - def __str__(self): + def __unicode__(self): return '%s (%s): %s' % \ (self.named_level(), self.origin, self.description) - #__str__ + #__unicode__ def named_level(self): """ Get message log level as readable string. @return: A readable description of the log level. - @rtype: string + @rtype: unicode """ if self.level == 0: return "Debug" diff --git a/mutalyzer/parsers/__init__.py b/mutalyzer/parsers/__init__.py index 3e1bd90dd08aa288d05a8c342e2bbae9218a730c..6b3f43347bc55d1518e6aaeb0279b5fa3bac9871 100644 --- a/mutalyzer/parsers/__init__.py +++ b/mutalyzer/parsers/__init__.py @@ -1,3 +1,6 @@ """ Parsers for GenRecord objects. """ + + +from __future__ import unicode_literals diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 867fa78f7b2d838d9076fc460eb5fd02282aee58..247545989e105702211e0c796a88b256edd40f3d 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -4,6 +4,9 @@ mutalyzer GenRecord. Record populated with data from a GenBank file. """ +from __future__ import unicode_literals + +import codecs import re import bz2 from itertools import izip_longest @@ -41,7 +44,7 @@ class tempGene(): - cdsList ; CDS list (including internal splice sites). 
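With the change to `Output.__init__` above, the log file is opened through `io.open` with an explicit UTF-8 encoding, so unicode messages (including anything produced via the new `Message.__unicode__`) can be written without manual `.encode()` calls. Roughly, with a throwaway log path invented for the example:

    # Sketch: io.open with an encoding accepts unicode text directly.
    from __future__ import unicode_literals

    import io
    import os
    import tempfile

    log_file = os.path.join(tempfile.mkdtemp(), 'mutalyzer.log')

    handle = io.open(log_file, mode='a+', encoding='utf-8')
    handle.write('Error (grammar): unexpected character \u2192 here\n')
    handle.seek(0)
    assert '\u2192' in handle.read()
    handle.close()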
@arg name: Gene name - @type name: string + @type name: unicode """ self.name = name @@ -75,8 +78,8 @@ class GBparser(): ret = [] - if not str(location.start).isdigit() or \ - not str(location.end).isdigit() : + if not unicode(location.start).isdigit() or \ + not unicode(location.end).isdigit() : return None #if @@ -99,8 +102,8 @@ class GBparser(): ret = [] - if not str(locationList.location.start).isdigit() or \ - not str(locationList.location.end).isdigit() : + if not unicode(locationList.location.start).isdigit() or \ + not unicode(locationList.location.end).isdigit() : return None #if @@ -128,10 +131,10 @@ class GBparser(): @arg transcriptAcc: Accession number of the transcript for which we want to find the protein - @type transcriptAcc: string + @type transcriptAcc: unicode @return: Accession number of a protein or None if nothing can be found - @rtype: string + @rtype: unicode """ link = queries.get_transcript_protein_link(transcriptAcc) if link is not None: @@ -146,7 +149,7 @@ class GBparser(): finally: handle.close() - transcriptGI = result["IdList"][0] + transcriptGI = unicode(result["IdList"][0]) handle = Entrez.elink(dbfrom = "nucleotide", db = "protein", id = transcriptGI) @@ -162,11 +165,11 @@ class GBparser(): queries.update_transcript_protein_link(transcriptAcc) return None - proteinGI = result[0]["LinkSetDb"][0]["Link"][0]["Id"] + proteinGI = unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"]) handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text') - proteinAcc = handle.read().split('.')[0] + proteinAcc = unicode(handle.read()).split('.')[0] handle.close() queries.update_transcript_protein_link(transcriptAcc, proteinAcc) @@ -179,7 +182,7 @@ class GBparser(): sentence from another. The index of the last word is counted backwards. @arg sentences: A list of sentences. - @type sentences: list of strings + @type sentences: list of unicode strings @return: The indices of the words where sentences start to differ, both are -1 when no mismatches are found. @@ -217,7 +220,7 @@ class GBparser(): [-1:1] yields the empty list. """ # Create lists of words - lists = map(str.split, sentences) + lists = [s.split() for s in sentences] try: forward, reverse = [next(i for i, v in @@ -239,7 +242,7 @@ class GBparser(): @arg locus: The locus object on which the transfer should be performed @type locus: locus object @arg key: The name of the variable that should be transferred - @type key: string + @type key: unicode """ if locus.qualifiers.has_key(key) : @@ -315,7 +318,7 @@ class GBparser(): @arg locusList: A list of loci @type locusList: list @arg tagName: Name of the tag to be checked - @type tagName: string + @type tagName: unicode """ tags = [] @@ -476,13 +479,14 @@ class GBparser(): Create a GenRecord.Record from a GenBank file @arg filename: The full path to the compressed GenBank file - @type filename: string + @type filename: unicode @return: A GenRecord.Record instance @rtype: object (record) """ # first create an intermediate genbank record with BioPython file_handle = bz2.BZ2File(filename, "r") + file_handle = codecs.getreader('utf-8')(file_handle) biorecord = SeqIO.read(file_handle, "genbank") file_handle.close() diff --git a/mutalyzer/parsers/lrg.py b/mutalyzer/parsers/lrg.py index d3624360291b5035fc3f5e6a323de4a59a08bfdc..0336d1062589508447edc2891c19fd261a21dbe3 100644 --- a/mutalyzer/parsers/lrg.py +++ b/mutalyzer/parsers/lrg.py @@ -21,6 +21,8 @@ added in python2.5. Its main strengths are speed and readability [pythonesque]. 
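In `create_record` above, the bz2-compressed GenBank file is wrapped with `codecs.getreader('utf-8')` before being handed to Biopython's `SeqIO.read`, so the parser works on unicode text instead of raw bytes. The layering can be sketched in isolation; the file name and record content below are invented:

    # Sketch: layer a UTF-8 text reader over a binary bz2 handle.
    from __future__ import unicode_literals

    import bz2
    import codecs
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), 'example.gb.bz2')
    with open(path, 'wb') as handle:
        handle.write(bz2.compress(b'LOCUS       EXAMPLE    12 bp\n'))

    file_handle = bz2.BZ2File(path, 'r')
    file_handle = codecs.getreader('utf-8')(file_handle)
    line = file_handle.readline()
    file_handle.close()

    assert isinstance(line, unicode)
    assert line.startswith('LOCUS')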
""" +from __future__ import unicode_literals + import xml.dom.minidom from Bio.Seq import Seq from Bio.Alphabet import IUPAC @@ -54,14 +56,14 @@ def _get_content(data, refname): @arg data: a minidom object @type data: object @arg refname: the name of a member of the minidom object - @type refname: string + @type refname: unicode - @return: The UTF-8 content of the textnode or an emtpy string + @return: The content of the textnode or an emtpy string @rtype: string """ temp = data.getElementsByTagName(refname) if temp: - return temp[0].lastChild.data.encode("utf8") + return temp[0].lastChild.data else: return "" #_get_content @@ -75,14 +77,14 @@ def _attr2dict(attr): @type attr: object @return: A dictionary with pairing of node-attribute names and values. - Integer string values are converted to integers. String values are converted - to UTF-8 + Integer string values are converted to integers. @rtype: dictionary """ ret = {} for key, value in attr.items(): - value = value.isdigit() and int(value) or value.encode("utf-8") - ret[key.encode("utf-8")] = value + if value.isdigit(): + value = int(value) + ret[key] = value return ret #_attr2dict @@ -166,7 +168,7 @@ def create_record(data): for tData in fixed.getElementsByTagName("transcript"): # iterate over the transcripts in the fixed section. # get the transcript from the updatable section and combine results - transcriptName = tData.getAttribute("name").encode("utf8")[1:] + transcriptName = tData.getAttribute("name")[1:] transcription = [t for t in gene.transcriptList if t.name == transcriptName][0] #TODO?: swap with gene.findLocus diff --git a/mutalyzer/redisclient.py b/mutalyzer/redisclient.py index ec9e6050548a85d04dced7489fbd8de195a5c6fc..58acd7cacdda8818dcf902150f361ce8d7342313 100644 --- a/mutalyzer/redisclient.py +++ b/mutalyzer/redisclient.py @@ -18,6 +18,8 @@ simple and just use one global connection pool as created by `StrictRedis`. """ +from __future__ import unicode_literals + import redis from mutalyzer.config import settings @@ -37,7 +39,9 @@ class LazyClient(util.LazyObject): import mockredis self._wrapped = mockredis.MockRedis(strict=True) else: - self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI) + self._wrapped = redis.StrictRedis.from_url(settings.REDIS_URI, + decode_responses=True, + charset='utf-8') #: Global :class:`LazyClient` instance. Use this for all communication with diff --git a/mutalyzer/services/__init__.py b/mutalyzer/services/__init__.py index 05b3d031865b91b2a3ebd2ead081592a52a119e2..81887d7c05baaf74a3ef836f34fdb9dbe9c25336 100644 --- a/mutalyzer/services/__init__.py +++ b/mutalyzer/services/__init__.py @@ -1,3 +1,6 @@ """ Services (RPC) for Mutalyzer. """ + + +from __future__ import unicode_literals diff --git a/mutalyzer/services/json.py b/mutalyzer/services/json.py index c35b79293c1a790209185a9efa37772155acf07e..89c6a26e11cca3e0f2c64f2c96621cbaeffb236b 100644 --- a/mutalyzer/services/json.py +++ b/mutalyzer/services/json.py @@ -3,6 +3,8 @@ Mutalyzer web service HTTP/RPC with JSON response payloads. """ +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.http import HttpRpc from spyne.protocol.json import JsonDocument diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index ba29c0083360ed1e85c02553c587fbfdeecde370..004fa869bcfeed611b63847cc01a4309c19be58e 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -9,6 +9,8 @@ Mutalyzer RPC services. 
""" +from __future__ import unicode_literals + from spyne.decorator import srpc from spyne.service import ServiceBase from spyne.model.primitive import Integer, Boolean, DateTime, Unicode @@ -16,16 +18,15 @@ from spyne.model.complex import Array from spyne.model.fault import Fault import os import socket -from cStringIO import StringIO -import tempfile -from operator import itemgetter, attrgetter +from io import BytesIO +from operator import attrgetter from sqlalchemy.orm.exc import NoResultFound import mutalyzer from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import (Assembly, Chromosome, BatchJob, - BatchQueueItem, TranscriptMapping) +from mutalyzer.db.models import (Assembly, BatchJob, BatchQueueItem, + TranscriptMapping) from mutalyzer.output import Output from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync @@ -103,7 +104,9 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - batch_file = StringIO(''.join(data)) + batch_file = BytesIO() + for d in data: + batch_file.write(d) job, columns = file_instance.parseBatchFile(batch_file) batch_file.close() @@ -144,7 +147,7 @@ class MutalyzerService(ServiceBase): @arg job_id: Batch job identifier. - @return: Batch job result file. + @return: Batch job result file (UTF-8, base64 encoded). """ left = BatchQueueItem.query.join(BatchJob).filter_by(result_id=job_id).count() @@ -152,7 +155,7 @@ class MutalyzerService(ServiceBase): raise Fault('EBATCHNOTREADY', 'Batch job result is not yet ready.') filename = 'batch-job-%s.txt' % job_id - handle = open(os.path.join(settings.CACHE_DIR, filename)) + handle = open(os.path.join(settings.CACHE_DIR, filename), 'rb') return handle @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, Boolean, @@ -804,23 +807,18 @@ class MutalyzerService(ServiceBase): result.sourceGi = O.getIndexedOutput('source_gi', 0) result.molecule = O.getIndexedOutput('molecule', 0) - # We force the results to strings here, because some results - # may be of type Bio.Seq.Seq which spyne doesn't like. - # - # todo: We might have to also do this elsewhere. 
- - result.original = str(O.getIndexedOutput("original", 0)) - result.mutated = str(O.getIndexedOutput("mutated", 0)) + result.original = O.getIndexedOutput("original", 0) + result.mutated = O.getIndexedOutput("mutated", 0) - result.origMRNA = str(O.getIndexedOutput("origMRNA", 0)) - result.mutatedMRNA = str(O.getIndexedOutput("mutatedMRNA", 0)) + result.origMRNA = O.getIndexedOutput("origMRNA", 0) + result.mutatedMRNA = O.getIndexedOutput("mutatedMRNA", 0) - result.origCDS = str(O.getIndexedOutput("origCDS", 0)) - result.newCDS = str(O.getIndexedOutput("newCDS", 0)) + result.origCDS = O.getIndexedOutput("origCDS", 0) + result.newCDS = O.getIndexedOutput("newCDS", 0) - result.origProtein = str(O.getIndexedOutput("oldprotein", 0)) - result.newProtein = str(O.getIndexedOutput("newprotein", 0)) - result.altProtein = str(O.getIndexedOutput("altProtein", 0)) + result.origProtein = O.getIndexedOutput("oldprotein", 0) + result.newProtein = O.getIndexedOutput("newprotein", 0) + result.altProtein = O.getIndexedOutput("altProtein", 0) result.chromDescription = \ O.getIndexedOutput("genomicChromDescription", 0) @@ -995,7 +993,7 @@ class MutalyzerService(ServiceBase): transcript.CM.info() cds_start = 1 - t.cTransEnd = str(t.exons[-1].cStop) + t.cTransEnd = unicode(t.exons[-1].cStop) t.gTransEnd = t.exons[-1].gStop t.chromTransEnd = GenRecordInstance.record.toChromPos( t.gTransEnd) @@ -1009,15 +1007,15 @@ class MutalyzerService(ServiceBase): t.name = '%s_v%s' % (gene.name, transcript.name) t.id = transcript.transcriptID t.product = transcript.transcriptProduct - t.cTransStart = str(trans_start) + t.cTransStart = unicode(trans_start) t.gTransStart = transcript.CM.x2g(trans_start, 0) t.chromTransStart = GenRecordInstance.record.toChromPos( t.gTransStart) - t.cCDSStart = str(cds_start) + t.cCDSStart = unicode(cds_start) t.gCDSStart = transcript.CM.x2g(cds_start, 0) t.chromCDSStart = GenRecordInstance.record.toChromPos( t.gCDSStart) - t.cCDSStop = str(cds_stop) + t.cCDSStop = unicode(cds_stop) t.gCDSStop = transcript.CM.x2g(cds_stop, 0) t.chromCDSStop = GenRecordInstance.record.toChromPos(t.gCDSStop) t.locusTag = transcript.locusTag @@ -1045,7 +1043,7 @@ class MutalyzerService(ServiceBase): """ Upload a genbank file. - @arg data: Genbank file (base64 encoded). + @arg data: Genbank file (UTF-8, base64 encoded). @return: UD accession number for the uploaded genbank file. """ output = Output(__file__) @@ -1067,7 +1065,7 @@ class MutalyzerService(ServiceBase): 'Only files up to %d megabytes are accepted.' % (settings.MAX_FILE_SIZE // 1048576)) - ud = retriever.uploadrecord(''.join(data)) + ud = retriever.uploadrecord(b''.join(data)) output.addMessage(__file__, -1, 'INFO', 'Finished processing uploadGenBankLocalFile()') @@ -1075,7 +1073,7 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). if not ud: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), output.getMessages())) + + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return ud @@ -1112,7 +1110,7 @@ class MutalyzerService(ServiceBase): # Todo: use SOAP Fault object here (see Trac issue #41). 
if not UD: error = 'The request could not be completed\n' \ - + '\n'.join(map(lambda m: str(m), O.getMessages())) + + '\n'.join(map(lambda m: unicode(m), O.getMessages())) raise Exception(error) return UD @@ -1281,7 +1279,7 @@ class MutalyzerService(ServiceBase): messages = output.getMessages() if messages: error = 'The request could not be completed\n' + \ - '\n'.join(map(lambda m: str(m), output.getMessages())) + '\n'.join(map(lambda m: unicode(m), output.getMessages())) raise Exception(error) return descriptions diff --git a/mutalyzer/services/soap.py b/mutalyzer/services/soap.py index a7d7b001868705b65807f27edba653c1050e6fda..d8f28407bd29afbedc37be52020122a96c2c8490 100644 --- a/mutalyzer/services/soap.py +++ b/mutalyzer/services/soap.py @@ -3,6 +3,8 @@ Mutalyzer SOAP/1.1 web service. """ +from __future__ import unicode_literals + from spyne.application import Application from spyne.protocol.soap import Soap11 diff --git a/mutalyzer/stats.py b/mutalyzer/stats.py index bb1dec573161b469af85f52f8862d5883b45f4a7..e7228cdfb4e8dbb34a1a59ebcc07654f42679a8c 100644 --- a/mutalyzer/stats.py +++ b/mutalyzer/stats.py @@ -17,6 +17,8 @@ module much more. """ +from __future__ import unicode_literals + import time from mutalyzer.redisclient import client @@ -36,7 +38,8 @@ def increment_counter(counter): pipe.incr('counter:%s:total' % counter) for label, bucket, expire in INTERVALS: - key = 'counter:%s:%s:%s' % (counter, label, time.strftime(bucket)) + key = 'counter:%s:%s:%s' % (counter, label, + unicode(time.strftime(bucket))) pipe.incr(key) # It's safe to just keep on expiring the counter, even if it already diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index e5465e1e35e6f5a1cbc1556b8f5f817520947a2a..a1a1b7f90a3e687ef17aa833f3613b7895054d9c 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -3,6 +3,8 @@ Synchronizing the reference file cache with other Mutalyzer instances. """ +from __future__ import unicode_literals + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() from datetime import datetime, timedelta @@ -86,7 +88,7 @@ class CacheSync(object): or later. :arg remote_wsdl: The url of the remote SOAP WSDL description. - :type remote_wsdl: str + :type remote_wsdl: unicode :arg created_since: Only entries with this creation date or later are returned. :type created_since: datatime.datetime @@ -111,11 +113,11 @@ class CacheSync(object): 1: 'forward', 2: 'reverse'} - entry_dict = {'name': str(entry.name), - 'hash': str(entry.hash), + entry_dict = {'name': entry.name, + 'hash': entry.hash, 'created': entry.created} for attribute in ('gi', 'chromosomeName', 'url', 'cached'): - entry_dict[attribute] = str(entry[attribute]) \ + entry_dict[attribute] = entry[attribute] \ if attribute in entry else None for attribute in ('chromosomeStart', 'chromosomeStop'): entry_dict[attribute] = int(entry[attribute]) \ @@ -131,9 +133,9 @@ class CacheSync(object): Download a remote file located at `url` and store it as `name`. :arg name: Name to store the file under. - :type name: str + :type name: unicode :arg url: Url to the remote file. - :type url: str + :type url: unicode """ if not re.match('^[\da-zA-Z\._-]+$', name): return @@ -160,10 +162,10 @@ class CacheSync(object): (14, 3) :arg remote_wsdl: The url of the remote SOAP WSDL description. - :type remote_wsdl: str + :type remote_wsdl: unicode :arg url_template: Formatting string containing a ``{file}`` occurence, see example usage above. 
- :string url_template: str + :string url_template: unicode :arg days: Only remote entries added this number of days ago or later are considered. :type days: int diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 936f0812b6abb077cb17dcb252a146cb3a5285f5..4017b57a3a37bcd17731928a8b595593ae7d5eaf 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -19,20 +19,80 @@ General utility functions. """ +from __future__ import unicode_literals + from functools import wraps import inspect from itertools import izip_longest import math import operator -import os import sys import time -from Bio.Alphabet import IUPAC -import Bio.Seq from Bio.SeqUtils import seq3 +# Taken from BioPython. +AMBIGUOUS_DNA_COMPLEMENT = { + 'A': 'T', + 'C': 'G', + 'G': 'C', + 'T': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} +AMBIGUOUS_RNA_COMPLEMENT = { + 'A': 'U', + 'C': 'G', + 'G': 'C', + 'U': 'A', + 'M': 'K', + 'R': 'Y', + 'W': 'W', + 'S': 'S', + 'Y': 'R', + 'K': 'M', + 'V': 'B', + 'H': 'D', + 'D': 'H', + 'B': 'V', + 'X': 'X', + 'N': 'N'} + + +def _make_translation_table(complement_mapping): + before = complement_mapping.keys() + before += [b.lower() for b in before] + after = complement_mapping.values() + after += [b.lower() for b in after] + return {ord(k): v for k, v in zip(before, after)} + + +_dna_complement_table = _make_translation_table(AMBIGUOUS_DNA_COMPLEMENT) +_rna_complement_table = _make_translation_table(AMBIGUOUS_RNA_COMPLEMENT) + + +def reverse_complement(sequence): + """ + Reverse complement of a sequence represented as unicode string. + """ + if 'U' in sequence or 'u' in sequence: + table = _rna_complement_table + else: + table = _dna_complement_table + + return ''.join(reversed(sequence.translate(table))) + + def grouper(iterable, n=2, fillvalue=None): """ Make an iterator that takes {n} elements at a time from {iterable}, using @@ -115,17 +175,17 @@ def splice(s, splice_sites): 'bcdghijklmnoptuvw' @arg s: A DNA sequence. - @type s: string + @type s: any sequence type @arg splice_sites: A list of even length of integers. @type splice_sites: list @return: The concatenation of slices from the sequence that is present in the GenBank record. - @rtype: string + @rtype: type(s) @todo: Assert length of splice_sites is even. """ - transcript = '' + transcript = s[:0] for acceptor, donor in grouper(splice_sites): transcript += s[acceptor - 1:donor] @@ -146,7 +206,7 @@ def __nsplice(string, splice_sites, CDS, orientation) : @todo: documentation """ - transcript = "" + transcript = string[:0] if orientation == 1 : for i in range(0, len(splice_sites), 2) : if CDS[0] >= splice_sites[i] and CDS[0] <= splice_sites[i + 1] : @@ -212,14 +272,15 @@ def format_range(first, last): @type last: integer @return: {first}_{last} in case of a real range, {first} otherwise. - @rtype: string + @rtype: unicode """ if first == last: - return str(first) + return unicode(first) return '%i_%i' % (first, last) #format_range + def roll_(s, start, end) : """ Different (and easier) way of finding the variability of a substring. @@ -239,6 +300,7 @@ def roll_(s, start, end) : return j, i #roll + def roll(s, first, last): """ Determine the variability of a variant by looking at cyclic @@ -254,7 +316,7 @@ def roll(s, first, last): (1, 3) @arg s: A reference sequence. - @type s: string + @type s: any sequence type @arg first: First position of the pattern in the reference sequence. 
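The new `reverse_complement` above drops the Biopython dependency in favour of a plain `unicode.translate` call over the tables built by `_make_translation_table`; in Python 2, `unicode.translate` expects a mapping from code points to replacement characters, which is exactly what the helper produces. A compact check of the idea, with a reduced complement table and invented sequences (the real helper also covers lowercase letters, the full ambiguity codes, and RNA):

    # Sketch: complement via a code point -> character mapping, then reverse.
    from __future__ import unicode_literals

    COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    TABLE = {ord(k): v for k, v in COMPLEMENT.items()}


    def reverse_complement(sequence):
        return ''.join(reversed(sequence.translate(TABLE)))


    assert reverse_complement('ATCG') == 'CGAT'
    assert reverse_complement('AANN') == 'NNTT'
    assert isinstance(reverse_complement('ATCG'), unicode)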
@type first: int @arg last: Last position of the pattern in the reference sequence. @@ -302,13 +364,13 @@ def palinsnoop(s): 0 @arg s: A nucleotide sequence. - @type s: string + @type s: unicode @return: The number of elements that are palindromic or -1 if the string is a 'palindrome'. - @rtype: string + @rtype: int """ - s_revcomp = Bio.Seq.reverse_complement(s) + s_revcomp = reverse_complement(s) for i in range(int(math.ceil(len(s) / 2.0))): if s[i] != s_revcomp[i]: @@ -330,12 +392,12 @@ def longest_common_prefix(s1, s2): 'abcdefg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common prefix of s1 and s2. - @rtype: string + @rtype: unicode @todo: This is mostly used just for the length of the returned string, and we could also return that directly. @@ -359,9 +421,9 @@ def longest_common_suffix(s1, s2): 'efg' @arg s1: The first string. - @type s1: string + @type s1: unicode @arg s2: The second string. - @type s2: string + @type s2: unicode @return: The longest common suffix of s1 and s2. @rtype: string @@ -380,15 +442,15 @@ def trim_common(s1, s2): ('xyzef', 'abc', 3, 1) @arg s1: A string. - @type s1: string + @type s1: unicode @arg s2: Another string. - @type s2: string + @type s2: unicode @return: A tuple of: - - string: Trimmed version of s1. - - string: Trimmed version of s2. - - int: Length of longest common prefix. - - int: Length of longest common suffix. + - unicode: Trimmed version of s1. + - unicode: Trimmed version of s2. + - int: Length of longest common prefix. + - int: Length of longest common suffix. @todo: More intelligently handle longest_common_prefix(). """ @@ -407,14 +469,14 @@ def is_dna(s): >>> is_dna('TACUGT') False - @arg s: Any string or Bio.Seq.Seq instance. - @type s: string + @arg s: Any string. + @type s: unicode @return: True if the string is a DNA string, False otherwise. @rtype: boolean """ - for i in str(s): - if not i in IUPAC.unambiguous_dna.letters: + for i in s: + if i not in 'ATCG': return False return True @@ -435,16 +497,16 @@ def in_frame_description(s1, s2) : ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). @todo: Refactor this code (too many return statements). @@ -528,16 +590,16 @@ def out_of_frame_description(s1, s2): ('p.(Pro4Glnfs*5)', 3, 7, 7) @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the first protein. - - int ; Last position of the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. 
+ - int ; Last position of the first protein. + - int ; Last position of the second protein. + @rtype: tuple(unicode, int, int, int) @todo: More intelligently handle longest_common_prefix(). """ @@ -573,23 +635,23 @@ def protein_description(cds_stop, s1, s2) : @arg cds_stop: Position of the stop codon in c. notation (CDS length). @type cds_stop: int @arg s1: The original protein. - @type s1: string + @type s1: unicode @arg s2: The mutated protein. - @type s2: string + @type s2: unicode @return: A tuple of: - - string ; Protein description of the change. - - int ; First position of the change. - - int ; Last position of the change in the first protein. - - int ; Last position of the change in the second protein. - @rtype: tuple(string, int, int, int) + - unicode ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(unicode, int, int, int) """ if cds_stop % 3: - description = out_of_frame_description(str(s1), str(s2)) + description = out_of_frame_description(s1, s2) else: - description = in_frame_description(str(s1), str(s2)) + description = in_frame_description(s1, s2) - if not s2 or str(s1[0]) != str(s2[0]): + if not s2 or s1[0] != s2[0]: # Mutation in start codon. return 'p.?', description[1], description[2], description[3] @@ -603,7 +665,7 @@ def visualise_sequence(sequence, max_length=25, flank_size=6): string is clipped; otherwise the string is just returned. @arg sequence: DNA sequence. - @type sequence: str + @type sequence: unicode @arg max_length: Maximum length of visualised sequence. @type max_length: int @arg flank_size: Length of the flanks in clipped visualised sequence. @@ -629,19 +691,19 @@ def _insert_tag(s, pos1, pos2, tag1, tag2): anything either. @arg s: A sequence. - @type s: + @type s: unicode @arg pos1: Position of tag1. @type pos1: int @arg pos2: Position of tag2. @type pos2: int @arg tag1: Content of tag1. - @type tag1: string + @type tag1: unicode @arg tag2: Content of tag2. - @type tag2: string + @type tag2: unicode @return: The original sequence, or a sequence with eiter tag1, tag2 or both tags inserted. - @rtype: string + @rtype: unicode @todo: Cleanup (note: only used in print_protein_html). """ @@ -670,7 +732,7 @@ def print_protein_html(s, first, last, O, where, text=False): and is suitable for viewing in a monospaced font. @arg s: A protein sequence. - @type s: string + @type s: unicode @arg first: First position to highlight. @type first: int @arg last: Last position to highlight. @@ -678,7 +740,7 @@ def print_protein_html(s, first, last, O, where, text=False): @arg O: The Output object. @type O: Modules.Output.Output @arg where: Location in the {O} object to store the representation. - @type where: string + @type where: unicode @todo: Cleanup. """ @@ -701,7 +763,7 @@ def print_protein_html(s, first, last, O, where, text=False): o = 1 # Add the first position. - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) for i in range(0, len(s), block): # Add the blocks. @@ -714,13 +776,13 @@ def print_protein_html(s, first, last, O, where, text=False): # Add the position (while escaping any potential highlighting). 
if text: if first < o < last: - output = '%s%s%s ' % (tag2, str(o).rjust(m), tag1) + output = '%s%s%s ' % (tag2, unicode(o).rjust(m), tag1) else: - output = '%s ' % str(o).rjust(m) + output = '%s ' % unicode(o).rjust(m) else: output = \ '<tt style="color:000000;font-weight:normal">%s</tt> ' % \ - str(o).rjust(m) + unicode(o).rjust(m) # Add last line. O.addOutput(where, output) @@ -748,10 +810,10 @@ def nice_filename(filename): Strip the path and the extention from a filename. @arg filename: A complete path plus extention. - @type filename: string + @type filename: unicode @return: The bare filename without a path and extention. - @rtype: string + @rtype: unicode """ return filename.split('/')[-1].split('.')[0] #nice_filename @@ -788,16 +850,16 @@ def format_usage(usage=None, keywords={}): @kwarg usage: The string to format. If omitted, the calling module's docstring is used. - @type usage: string + @type usage: unicode @kwarg keywords: A dictionary of (keyword, value) pairs used to format the usage string. If it does not contain the key 'command', it is added with the value of sys.argv[0]. - @type keywords: dictionary(string, string) + @type keywords: dictionary(unicode, unicode) @return: Formatted usage string. This is {usage} with any entries from {keywords} replaced and cut-off at the first occurence of two consecutive empty lines. - @rtype: string + @rtype: unicode """ if not usage: caller = inspect.stack()[1] diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 65dd70564a727e88eb38ddac39baf38e5befb286..3f0ee4220d8d38451ab7cf4067f287c31ae29383 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -9,17 +9,22 @@ Notes about naming positions: * translation -> begin/end * any range of bases -> first/last * interbase position (if two numbers are used) -> before/after + +Notes about string representations: +* All variant descriptions and their parts are unicode strings +* All reference sequences (and their mutated version) are Bio.Seq.Seq objects """ -from operator import itemgetter, attrgetter +from __future__ import unicode_literals + +from operator import attrgetter -import Bio -import Bio.Seq -from Bio.Seq import Seq +from Bio.Data import CodonTable from Bio.Alphabet import IUPAC from Bio.Alphabet import DNAAlphabet from Bio.Alphabet import ProteinAlphabet +from Bio.Alphabet import _verify_alphabet from mutalyzer import util from mutalyzer.db.models import Assembly @@ -126,14 +131,14 @@ def _check_argument(argument, reference, first, last, output): Do several checks for the optional argument of a variant. Raise a _RawVariantError exception if the checks fail. + @arg argument: The optional argument. + @type argument: unicode @arg reference: The reference sequence. - @type reference: string + @type reference: Bio.Seq.Seq @arg first: Start position of the variant. @type first: int @arg last: End position of the variant. @type last: int - @arg argument: The optional argument. - @type argument: string @arg output: The Output object. @type output: mutalyzer.Output.Output @@ -164,8 +169,8 @@ def _check_argument(argument, reference, first, last, output): 'Invalid letters in argument.') raise _NotDNAError() # And the DNA must match the reference sequence. - reference_slice = str(reference[first - 1:last]) - if reference_slice != str(argument): + reference_slice = unicode(reference[first - 1:last]) + if reference_slice != argument: # Todo: Be more informative. 
output.addMessage(__file__, 3, 'EREF', '%s not found at position %s, found %s ' \ @@ -286,9 +291,9 @@ def apply_substitution(position, original, substitute, mutator, record, O): @arg position: Genomic location of the substitution. @type position: int @arg original: Nucleotide in the reference sequence. - @type original: string + @type original: unicode @arg substitute: Nucleotide in the mutated sequence. - @type substitute: string + @type substitute: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -310,7 +315,7 @@ def apply_substitution(position, original, substitute, mutator, record, O): mutator.substitution(position, substitute) - record.name(position, position, 'subst', mutator.orig[position - 1], + record.name(position, position, 'subst', unicode(mutator.orig[position - 1]), substitute, None) #apply_substitution @@ -326,7 +331,7 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, @arg last: Genomic end position of the del/dup. @type last: int @arg type: The variant type (del or dup). - @type type: string + @type type: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -376,9 +381,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the forward strand ' \ 'it should be "%s" at position %s.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) if forward_roll != original_forward_roll and not reverse_strand: @@ -388,9 +393,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, O.addMessage(__file__, 1, 'IROLLBACK', 'Sequence "%s" at position %s was not corrected to "%s" at ' \ 'position %s, since they reside in different exons.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[incorrect_first - 1:incorrect_stop])), + util.visualise_sequence(unicode(mutator.orig[incorrect_first - 1:incorrect_stop])), util.format_range(incorrect_first, incorrect_stop))) if reverse_roll and reverse_strand: @@ -400,9 +405,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O, 'Sequence "%s" at position %s was given, however, ' \ 'the HGVS notation prescribes that on the reverse strand ' \ 'it should be "%s" at position %s.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), util.format_range(first, last), - util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop])), + util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])), util.format_range(new_first, new_stop))) # We don't go through the trouble of visualising the *corrected* variant @@ -434,7 +439,7 @@ def apply_inversion(first, last, mutator, record, O): @arg O: The Output object. @type O: Modules.Output.Output """ - snoop = util.palinsnoop(mutator.orig[first - 1:last]) + snoop = util.palinsnoop(unicode(mutator.orig[first - 1:last])) if snoop: # We have a reverse-complement-palindromic prefix. 
@@ -444,7 +449,7 @@ def apply_inversion(first, last, mutator, record, O): O.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is a palindrome ' \ '(its own reverse complement).' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last)) return else: @@ -453,10 +458,10 @@ def apply_inversion(first, last, mutator, record, O): 'palindrome (the first %i nucleotide(s) are the reverse ' \ 'complement of the last one(s)), the HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, snoop, util.visualise_sequence( - str(mutator.orig[first + snoop - 1: last - snoop])), + unicode(mutator.orig[first + snoop - 1: last - snoop])), first + snoop, last - snoop)) first += snoop last -= snoop @@ -466,8 +471,8 @@ def apply_inversion(first, last, mutator, record, O): if first == last: O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \ '%i is actually a substitution.' % first) - record.name(first, first, 'subst', mutator.orig[first - 1], - Bio.Seq.reverse_complement(mutator.orig[first - 1]), None) + record.name(first, first, 'subst', unicode(mutator.orig[first - 1]), + util.reverse_complement(unicode(mutator.orig[first - 1])), None) else : record.name(first, last, 'inv', '', '', None) #apply_inversion @@ -483,7 +488,7 @@ def apply_insertion(before, after, s, mutator, record, O): @arg after: Genomic position after the insertion. @type after: int @arg s: Nucleotides to be inserted. - @type s: string + @type s: nucleotide @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -547,7 +552,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'however, the HGVS notation prescribes that it should be a ' \ 'duplication of %s at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), before + forward_roll, before + forward_roll + insertion_length - 1)) after += forward_roll - 1 @@ -566,7 +571,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the forward strand it should be an insertion of %s ' \ 'at position %i_%i.' % ( s, before, before + 1, - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), new_before + forward_roll, new_before + forward_roll + 1)) if forward_roll != original_forward_roll and not reverse_strand: @@ -576,7 +581,7 @@ def apply_insertion(before, after, s, mutator, record, O): 'insertion of %s at position %i_%i, since they reside in ' \ 'different exons.' % ( s, before, before + 1, - mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll], + unicode(mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll]), new_before + original_forward_roll, new_before + original_forward_roll + 1)) if reverse_roll and reverse_strand: @@ -585,13 +590,13 @@ def apply_insertion(before, after, s, mutator, record, O): 'that on the reverse strand it should be an insertion of %s ' \ 'at position %i_%i.' 
% ( s, before, before + 1, - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll], + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]), new_before - reverse_roll, (new_before - reverse_roll) + 1)) record.name(before, before + 1, 'ins', - mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]), '', (reverse_roll, forward_roll), - mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]) + unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll])) #apply_insertion @@ -605,7 +610,7 @@ def apply_delins(first, last, insert, mutator, record, output): @arg last: Genomic end position of the delins. @type last: int @arg insert: Sequence to insert. - @type insert: string + @type insert: unicode @arg mutator: A Mutator instance. @type mutator: mutalyzer.mutator.Mutator @arg record: A GenRecord object. @@ -613,14 +618,13 @@ def apply_delins(first, last, insert, mutator, record, output): @arg output: The Output object. @type output: Modules.Output.Output """ - delete = mutator.orig[first - 1:last] + delete = unicode(mutator.orig[first - 1:last]) - if str(delete) == str(insert): + if delete == insert: output.addMessage(__file__, 2, 'WNOCHANGE', 'Sequence "%s" at position %i_%i is identical to ' \ 'the variant.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), - first, last)) + util.visualise_sequence(delete), first, last)) return delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert) @@ -646,7 +650,7 @@ def apply_delins(first, last, insert, mutator, record, output): mutator, record, output) return - if str(Bio.Seq.reverse_complement(delete_trimmed)) == insert_trimmed: + if util.reverse_complement(delete_trimmed) == insert_trimmed: output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ 'is actually an inversion.') apply_inversion(first + lcp, last - lcs, mutator, @@ -658,7 +662,7 @@ def apply_delins(first, last, insert, mutator, record, output): 'Sequence "%s" at position %i_%i has the same prefix or ' \ 'suffix as the inserted sequence "%s". The HGVS notation ' \ 'prescribes that it should be "%s" at position %i_%i.' % ( - util.visualise_sequence(str(mutator.orig[first - 1:last])), + util.visualise_sequence(unicode(mutator.orig[first - 1:last])), first, last, insert, insert_trimmed, first + lcp, last - lcs)) mutator.delins(first + lcp, last - lcs, insert_trimmed) @@ -952,17 +956,19 @@ def process_raw_variant(mutator, variant, record, transcript, output): """ variant, original_description = variant.RawVar, variant[-1] - # {argument} may be a number, or a subsequence of the reference. - # {sequence} is the variant subsequence. - argument = variant.Arg1 - sequence = variant.Arg2 + # `argument` may be a number, or a subsequence of the reference. + # `sequence` is the variant subsequence. + # Note that pyparsing will return `str('')` if the attribute does not + # exist, so we explicitely convert the result to unicode. + argument = unicode(variant.Arg1) + sequence = unicode(variant.Arg2) # If we are on the reverse strand, subsequences must be in reverse # complement. if transcript and transcript.CM.orientation == -1: - sequence = Bio.Seq.reverse_complement(sequence) + sequence = util.reverse_complement(sequence) if util.is_dna(argument): - argument = Bio.Seq.reverse_complement(argument) + argument = util.reverse_complement(argument) # Get genomic first and last positions for this variant. 
Below we handle # the different ways of describing these positions. @@ -1189,7 +1195,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): def parse_sequence(seq): if seq.Sequence: if transcript and transcript.CM.orientation == -1: - return Bio.Seq.reverse_complement(str(seq.Sequence)) + return util.reverse_complement(seq.Sequence) return seq.Sequence if seq.StartLoc and seq.EndLoc: @@ -1228,9 +1234,9 @@ def process_raw_variant(mutator, variant, record, transcript, output): 'Position %s is out of range.' % range_last) raise _RawVariantError() - insertion = mutator.orig[range_first - 1:range_last] + insertion = unicode(mutator.orig[range_first - 1:range_last]) if seq.Inv: - insertion = Bio.Seq.reverse_complement(str(insertion)) + insertion = util.reverse_complement(insertion) return insertion @@ -1245,7 +1251,7 @@ def process_raw_variant(mutator, variant, record, transcript, output): seqs = reversed(variant.SeqList) else: seqs = variant.SeqList - insertion = ''.join(str(parse_sequence(seq)) + insertion = ''.join(parse_sequence(seq) for seq in seqs) else: insertion = parse_sequence(variant.Seq) @@ -1316,32 +1322,33 @@ def _add_transcript_info(mutator, transcript, output): if transcript.transcribe: output.addOutput('myTranscriptDescription', transcript.description or '=') output.addOutput('origMRNA', - str(util.splice(mutator.orig, transcript.mRNA.positionList))) + unicode(util.splice(mutator.orig, transcript.mRNA.positionList))) output.addOutput('mutatedMRNA', - str(util.splice(mutator.mutated, + unicode(util.splice(mutator.mutated, mutator.shift_sites(transcript.mRNA.positionList)))) # Add protein prediction to output. if transcript.translate: - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna - #output.addOutput('origCDS', cds_original) - - if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) - - if not util.is_dna(cds_original): + if not _verify_alphabet(cds_original): output.addMessage(__file__, 4, 'ENODNA', 'Invalid letters in reference sequence.') return + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna + + #output.addOutput('origCDS', cds_original) + + if transcript.CM.orientation == -1: + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() + if '*' in cds_original.translate(table=transcript.txTable)[:-1]: output.addMessage(__file__, 3, 'ESTOP', 'In frame stop codon found.') @@ -1354,36 +1361,35 @@ def _add_transcript_info(mutator, transcript, output): # Note: addOutput('origCDS', ...) was first before the possible # reverse complement operation above. 
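In the _add_transcript_info hunk above, the spliced CDS now stays a Seq object: it is given an explicit DNA alphabet and validated with Bio.Alphabet._verify_alphabet instead of util.is_dna, then translated. A minimal sketch of that sequence of calls, assuming the pre-1.78 Biopython alphabet API imported by this patch and an illustrative CDS:

    from __future__ import unicode_literals

    from Bio.Alphabet import IUPAC, _verify_alphabet
    from Bio.Seq import Seq

    cds = Seq('ATGGCCTGA')                 # stands in for the spliced CDS
    cds.alphabet = IUPAC.unambiguous_dna   # tag it as unambiguous DNA

    if not _verify_alphabet(cds):
        raise ValueError('Invalid letters in reference sequence.')

    # Same translate call as in the patch; table 1 is the standard code.
    protein = cds.translate(table=1, cds=True, to_stop=True)
    assert unicode(protein) == 'MA'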
- output.addOutput('origCDS', cds_original) - output.addOutput("newCDS", cds_variant[:(len(str(protein_variant)) + 1) * 3]) + output.addOutput('origCDS', unicode(cds_original)) + output.addOutput("newCDS", unicode(cds_variant[:(len(protein_variant) + 1) * 3])) - output.addOutput('oldprotein', protein_original + '*') + output.addOutput('oldprotein', unicode(protein_original) + '*') # Todo: Don't generate the fancy HTML protein views here, do this in # website.py. # I think it would also be nice to include the mutated list of splice # sites. - if not protein_variant or protein_variant[0] != 'M': + if not protein_variant or unicode(protein_variant[0]) != 'M': # Todo: Protein differences are not color-coded, # use something like below in protein_description(). - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancy') - util.print_protein_html(protein_original + '*', 0, 0, output, - 'oldProteinFancyText', text=True) - if str(cds_variant[0:3]) in \ - Bio.Data.CodonTable.unambiguous_dna_by_id \ - [transcript.txTable].start_codons: + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancy') + util.print_protein_html(unicode(protein_original) + '*', 0, 0, + output, 'oldProteinFancyText', text=True) + if unicode(cds_variant[0:3]) in \ + CodonTable.unambiguous_dna_by_id[transcript.txTable].start_codons: output.addOutput('newprotein', '?') util.print_protein_html('?', 0, 0, output, 'newProteinFancy') util.print_protein_html('?', 0, 0, output, 'newProteinFancyText', text=True) - output.addOutput('altStart', str(cds_variant[0:3])) - if str(protein_original[1:]) != str(protein_variant[1:]): + output.addOutput('altStart', unicode(cds_variant[0:3])) + if unicode(protein_original[1:]) != unicode(protein_variant[1:]): output.addOutput('altProtein', - 'M' + protein_variant[1:] + '*') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + 'M' + unicode(protein_variant[1:]) + '*') + util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancy') - util.print_protein_html('M' + protein_variant[1:] + '*', 0, + util.print_protein_html('M' + unicode(protein_variant[1:]) + '*', 0, 0, output, 'altProteinFancyText', text=True) else : output.addOutput('newprotein', '?') @@ -1395,21 +1401,22 @@ def _add_transcript_info(mutator, transcript, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) descr, first, last_original, last_variant = \ - util.protein_description(cds_length, protein_original, - protein_variant) + util.protein_description(cds_length, + unicode(protein_original), + unicode(protein_variant)) # This is never used. 
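The alternative-start branch above now looks the first codon of the variant CDS up via Bio.Data.CodonTable directly. Roughly, with a hypothetical variant CDS and the standard translation table (id 1):

    from __future__ import unicode_literals

    from Bio.Data import CodonTable
    from Bio.Seq import Seq

    cds_variant = Seq('TTGGCCTGA')   # hypothetical CDS whose start codon changed
    start_codons = CodonTable.unambiguous_dna_by_id[1].start_codons

    # TTG is a valid (alternative) start codon in the standard table, so the
    # protein prediction becomes '?' and the codon itself is reported as altStart.
    if unicode(cds_variant[0:3]) in start_codons:
        alt_start = unicode(cds_variant[0:3])
        assert alt_start == 'TTG'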
output.addOutput('myProteinDescription', descr) - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancy') - util.print_protein_html(protein_original + '*', first, + util.print_protein_html(unicode(protein_original) + '*', first, last_original, output, 'oldProteinFancyText', text=True) - if str(protein_original) != str(protein_variant): - output.addOutput('newprotein', protein_variant + '*') - util.print_protein_html(protein_variant + '*', first, + if unicode(protein_original) != unicode(protein_variant): + output.addOutput('newprotein', unicode(protein_variant) + '*') + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancy') - util.print_protein_html(protein_variant + '*', first, + util.print_protein_html(unicode(protein_variant) + '*', first, last_variant, output, 'newProteinFancyText', text=True) #_add_transcript_info @@ -1473,6 +1480,7 @@ def process_variant(mutator, description, record, output): if description.LrgAcc: # LRG case, pick the top gene. gene = record.record.geneList[0] + if transcript_id: transcript = gene.findLocus(transcript_id) if not transcript: @@ -1481,7 +1489,7 @@ def process_variant(mutator, description, record, output): # NG_012772.1). output.addMessage(__file__, 4, "ENOTRANSCRIPT", "Multiple transcripts found for gene %s. Please " \ - "choose from: %s" %(gene.name, + "choose from: %s" % (gene.name, ", ".join(gene.listLoci()))) else: # No transcript id given. @@ -1563,10 +1571,10 @@ def process_variant(mutator, description, record, output): 'Protein level descriptions can only be done on a protein or transcript reference.') raise _VariantError() else: - cds = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) + cds = util.splice(mutator.orig, transcript.CDS.positionList) + cds.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds = Bio.Seq.reverse_complement(cds) + cds = cds.reverse_complement() protein = cds.translate(table=transcript.txTable, cds=True, to_stop=True) mutator.orig = protein mutator.mutated = protein @@ -1644,12 +1652,12 @@ def check_variant(description, output): if parsed_description.LrgAcc: record_id = parsed_description.LrgAcc - elif parsed_description.Version: - record_id = parsed_description.RefSeqAcc + '.' + parsed_description.Version + elif parsed_description.RefSeqAcc: + if parsed_description.Version: + record_id = parsed_description.RefSeqAcc + '.' + parsed_description.Version + else: + record_id = parsed_description.RefSeqAcc else: - record_id = parsed_description.RefSeqAcc - - if not record_id: output.addMessage(__file__, 4, 'ENOREF', 'No reference sequence given.') return @@ -1657,7 +1665,7 @@ def check_variant(description, output): if parsed_description.LrgAcc: filetype = 'LRG' - transcript_id = parsed_description.LRGTranscriptID + transcript_id = parsed_description.LRGTranscriptID or '' retriever = Retriever.LRGRetriever(output) else: filetype = 'GB' @@ -1732,8 +1740,8 @@ def check_variant(description, output): except _VariantError: return - output.addOutput('original', str(mutator.orig)) - output.addOutput('mutated', str(mutator.mutated)) + output.addOutput('original', unicode(mutator.orig)) + output.addOutput('mutated', unicode(mutator.mutated)) # Chromosomal region (only for GenBank human transcript references). 
# This is still quite ugly code, and should be cleaned up once we have @@ -1775,17 +1783,18 @@ def check_variant(description, output): transcript.proteinDescription = 'p.?' continue - cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), - IUPAC.unambiguous_dna) - cds_variant = Seq(str(util.__nsplice(mutator.mutated, - mutator.shift_sites(transcript.mRNA.positionList), - mutator.shift_sites(transcript.CDS.location), - transcript.CM.orientation)), - IUPAC.unambiguous_dna) + cds_original = util.splice(mutator.orig, transcript.CDS.positionList) + cds_original.alphabet = IUPAC.unambiguous_dna + + cds_variant = util.__nsplice(mutator.mutated, + mutator.shift_sites(transcript.mRNA.positionList), + mutator.shift_sites(transcript.CDS.location), + transcript.CM.orientation) + cds_variant.alphabet = IUPAC.unambiguous_dna if transcript.CM.orientation == -1: - cds_original = Bio.Seq.reverse_complement(cds_original) - cds_variant = Bio.Seq.reverse_complement(cds_variant) + cds_original = cds_original.reverse_complement() + cds_variant = cds_variant.reverse_complement() #if '*' in cds_original.translate()[:-1]: # output.addMessage(__file__, 3, "ESTOP", @@ -1801,7 +1810,7 @@ def check_variant(description, output): # FIXME this is a bit of a rancid fix. protein_original = cds_original.translate( table=transcript.txTable, cds=True, to_stop=True) - except Bio.Data.CodonTable.TranslationError: + except CodonTable.TranslationError: if transcript.current: output.addMessage( __file__, 2, "WTRANS", @@ -1822,7 +1831,7 @@ def check_variant(description, output): cds_length = util.cds_length( mutator.shift_sites(transcript.CDS.positionList)) transcript.proteinDescription = util.protein_description( - cds_length, protein_original, protein_variant)[0] + cds_length, unicode(protein_original), unicode(protein_variant))[0] except IndexError: # Todo: Probably CDS start was hit by removal of exon.. transcript.proteinDescription = 'p.?' diff --git a/mutalyzer/website/__init__.py b/mutalyzer/website/__init__.py index 730c33e86f6ee5be9edd5afcb13166d4c58d907d..2ce0450bf8765e9197c37545aeef9b3281315c43 100644 --- a/mutalyzer/website/__init__.py +++ b/mutalyzer/website/__init__.py @@ -3,6 +3,8 @@ Mutalyzer website interface using the Flask framework. """ +from __future__ import unicode_literals + import logging import os import pkg_resources diff --git a/mutalyzer/website/templates/base.html b/mutalyzer/website/templates/base.html index 2f45caf9f0a7a4be3f98721736c785861b620dad..270e3bdfd75d1bd69e692de91f876300b26f4066 100644 --- a/mutalyzer/website/templates/base.html +++ b/mutalyzer/website/templates/base.html @@ -22,7 +22,7 @@ src="{{ url_for('static', filename='js/generator.js') }}"> </script> <meta http-equiv="Content-Type" - content="text/html; charset=iso-8859-1"> + content="text/html; charset=utf-8"> <title>Mutalyzer {{ mutalyzer_version }} — {{ page_title }}</title> </head> <body diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 475330b7ce084273bc61627e5a24221d2b0ccff0..03644dcc6c77663bc779f1eebadc939dff86e15e 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -3,6 +3,8 @@ Mutalyzer website views. 
""" +from __future__ import unicode_literals + import bz2 import os import pkg_resources @@ -144,7 +146,7 @@ def soap_api(): xsl_doc = etree.parse(xsl_handle) transform = etree.XSLT(xsl_doc) - return make_response(str(transform(wsdl_doc))) + return make_response(unicode(transform(wsdl_doc))) @website.route('/downloads/<string:filename>') @@ -159,7 +161,7 @@ def downloads(filename): except jinja2.exceptions.TemplateNotFound: abort(404) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -233,10 +235,7 @@ def name_checker(): % (description, request.remote_addr)) stats.increment_counter('name-checker/website') - # Todo: The following is probably a problem elsewhere too. We stringify - # the variant, because a unicode string crashes BioPython's - # `reverse_complement`. - variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) errors, warnings, summary = output.Summary() parse_error = output.getOutput('parseError') @@ -272,18 +271,20 @@ def name_checker(): # Experimental description extractor. if (output.getIndexedOutput('original', 0) and output.getIndexedOutput('mutated', 0)): + extracted = extractedProt = '(skipped)' + allele = describe.describe(output.getIndexedOutput('original', 0), output.getIndexedOutput('mutated', 0)) - prot_allele = describe.describe( - output.getIndexedOutput('oldprotein', 0), - output.getIndexedOutput('newprotein', 0, default=''), - DNA=False) - - extracted = extractedProt = '(skipped)' if allele: extracted = describe.alleleDescription(allele) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + + if output.getIndexedOutput('oldprotein', 0): + prot_allele = describe.describe( + output.getIndexedOutput('oldprotein', 0), + output.getIndexedOutput('newprotein', 0, default=''), + DNA=False) + if prot_allele: + extractedProt = describe.alleleDescription(prot_allele) else: extracted = extractedProt = '' @@ -350,11 +351,10 @@ def bed(): if not description: abort(404) - return render_template('name-checker.html') output = Output(__file__) - variantchecker.check_variant(str(description), output) + variantchecker.check_variant(description, output) raw_variants = output.getIndexedOutput('rawVariantsChromosomal', 0) if not raw_variants: @@ -376,14 +376,14 @@ def bed(): for descr, positions in raw_variants[2]: bed += '\t'.join([raw_variants[0], - str(min(positions) - 1), - str(max(positions)), + unicode(min(positions) - 1), + unicode(max(positions)), descr, '0', raw_variants[1]]) + '\n' response = make_response(bed) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -579,7 +579,7 @@ def reference_loader_submit(): output = Output(__file__) output.addMessage(__file__, -1, 'INFO', 'Received request upload(%s) with arguments %s from %s' - % (method, str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) assemblies = Assembly.query \ .order_by(Assembly.taxonomy_common_name.asc(), @@ -668,11 +668,11 @@ def reference_loader_submit(): if not ud: errors.append('The request could not be completed') - errors.extend(str(m) for m in output.getMessages()) + errors.extend(unicode(m) for m in output.getMessages()) output.addMessage(__file__, -1, 'INFO', 'Finished request upload(%s) with arguments %s from %s' - % (method, 
str(request.form), request.remote_addr)) + % (method, unicode(request.form), request.remote_addr)) return render_template('reference-loader.html', assemblies=assemblies, @@ -737,7 +737,7 @@ def reference(filename): response = make_response(bz2.BZ2File(file_path, 'r').read()) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' response.headers['Content-Disposition'] = ('attachment; filename="%s"' % filename) return response @@ -894,7 +894,7 @@ def batch_job_result(result_id): return send_from_directory(settings.CACHE_DIR, 'batch-job-%s.txt' % result_id, - mimetype='text/plain', + mimetype='text/plain; charset=utf-8', as_attachment=True) @@ -933,10 +933,7 @@ def lovd_get_gs(): % (mutation_name, variant_record, forward, request.remote_addr)) - # Todo: The following is probably a problem elsewhere too. - # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in mapping.py:607. - variantchecker.check_variant(str(mutation_name), output) + variantchecker.check_variant(mutation_name, output) output.addMessage(__file__, -1, 'INFO', 'Finished request getGS(%s, %s, %s)' @@ -955,11 +952,11 @@ def lovd_get_gs(): standalone=1)) else: response = make_response(l[0]) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response response = make_response('Transcript not found') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response @@ -1041,7 +1038,7 @@ def lovd_variant_info(): assembly = Assembly.by_name_or_alias(build) except NoResultFound: response = make_response('invalid build') - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response converter = Converter(assembly, output) @@ -1079,7 +1076,7 @@ def lovd_variant_info(): response = re.sub('^Error \(.*\):', 'Error:', result) response = make_response(result) - response.headers['Content-Type'] = 'text/plain' + response.headers['Content-Type'] = 'text/plain; charset=utf-8' return response diff --git a/tests/fixtures.py b/tests/fixtures.py index 595d72a663e3ec06a6df748f3d21e6aa4a8019ee..71b1ae1bfc7bba9bc17a56f8c1431f56b2eddde7 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,6 +7,8 @@ as :func:`hg19` must be called after the :func:`database` fixture). 
""" +from __future__ import unicode_literals + import os import shutil diff --git a/tests/old/lrgtest.py b/tests/old/lrgtest.py index afeefc3324596c39bf3723f40d119a4f0df90d0c..d2dae2bca774fad39d9cdeb7bc8e888db7083075 100644 --- a/tests/old/lrgtest.py +++ b/tests/old/lrgtest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/maptest.py b/tests/old/maptest.py index 7f3105a46eac5c4af99025728b736fa08c49b4e1..40dc1d15dfee1df5b4cfcc46c2d6948dd6796423 100644 --- a/tests/old/maptest.py +++ b/tests/old/maptest.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals import sys, os, unittest, types #make it possible to import the Modules diff --git a/tests/old/recordtest.py b/tests/old/recordtest.py index d55bd58c0df440a4774ceaf59a44444f388740dd..a9cc9354557e3ba32f299bd23718cddab3ff7b46 100644 --- a/tests/old/recordtest.py +++ b/tests/old/recordtest.py @@ -2,6 +2,7 @@ recordtest.py contains TestRecord - a BaseClass for testing GenRecord.Record instances """ +from __future__ import unicode_literals import unittest, types from Modules import GenRecord #test class-types @@ -56,7 +57,7 @@ class TestRecord(unittest.TestCase): self.assertTrue(isinstance(plist, (types.NoneType, GenRecord.PList))) - #self.assertTrue(any(map(isinstance, + #self.assertTrue(any(map(isinstance, def _test_if_loc(self, loc): @@ -76,7 +77,5 @@ class TestRecord(unittest.TestCase): if __name__ == "__main__": - # This file should be imported + # This file should be imported pass - - diff --git a/tests/test_crossmap.py b/tests/test_crossmap.py index ff9d6d75928918b19d01b769b5a099d864408b11..990f93fe877dfcc6d8945187a4d559238f7f9a45 100644 --- a/tests/test_crossmap.py +++ b/tests/test_crossmap.py @@ -3,6 +3,8 @@ Tests for the Crossmap module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.Crossmap import Crossmap diff --git a/tests/test_describe.py b/tests/test_describe.py index 8315213eb49cc5c688d1d4816841dcc5c7dcb02b..e81c7ce45bf6dbb5776326d75e3f7f410179db6d 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.describe module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 1ebaa399e372155291f33ce1c6de21b22682c5ad..dad9a9c64c959cd91433b0324f0c7eb346f3e58c 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.grammar module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import os diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 5ebdc60e667cc3ec46cd46cda7c71e29561061ee..620f9d757f388579381edbf0eb3c64d032db51a3 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -3,6 +3,8 @@ Tests for the mapping module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from sqlalchemy import or_ diff --git a/tests/test_mutator.py b/tests/test_mutator.py index 36c5b8d152ebfa553e859b9ef11dae3e3a40bd43..05e2c685fb33f29978839b17236c933c4b232016 100644 --- a/tests/test_mutator.py +++ b/tests/test_mutator.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.mutator module. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() import re import os @@ -666,7 +668,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('ACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACGATCG')) def test_largedel(self): """ @@ -674,7 +676,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 7) - assert str(m.mutated) == str(Seq('AG')) + assert unicode(m.mutated) == unicode(Seq('AG')) def test_ins(self): """ @@ -682,7 +684,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATACGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATACGATCG')) def test_largeins(self): """ @@ -690,7 +692,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') - assert str(m.mutated) == str(Seq('ATATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGCGATCG')) def test_sub(self): """ @@ -698,7 +700,7 @@ class TestMutator(MutalyzerTest): """ m = self._mutator(Seq('ATCGATCG')) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGATCG')) def test_adjecent_del_sub_1(self): """ @@ -709,7 +711,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_sub_2(self): """ @@ -718,7 +720,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_near_adjecent_del_sub_1(self): """ @@ -727,7 +729,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ACTATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTATCG')) def test_near_adjecent_del_sub_2(self): """ @@ -736,7 +738,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 4) m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGCATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCATCG')) def test_adjecent_largedel_sub_1(self): """ @@ -746,7 +748,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 6) m.substitution(7, 'T') - assert str(m.mutated) == str(Seq('ATG')) + assert unicode(m.mutated) == unicode(Seq('ATG')) def test_adjecent_largedel_sub_2(self): """ @@ -756,7 +758,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACG')) + assert unicode(m.mutated) == unicode(Seq('ACG')) def test_near_adjecent_largedel_sub_1(self): """ @@ -765,7 +767,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 5) m.substitution(7, 'T') - assert str(m.mutated) == str(Seq('ATTG')) + assert unicode(m.mutated) == unicode(Seq('ATTG')) def test_near_adjecent_largedel_sub_2(self): """ @@ -774,7 +776,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(4, 7) m.substitution(2, 'C') - assert str(m.mutated) == str(Seq('ACCG')) + assert unicode(m.mutated) == unicode(Seq('ACCG')) def test_adjectent_del_ins_1(self): """ @@ -783,7 +785,7 @@ class 
TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGCGATCG')) def test_adjectent_del_ins_2(self): """ @@ -792,7 +794,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.insertion(2, 'A') - assert str(m.mutated) == str(Seq('ATAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATAGATCG')) def test_near_adjectent_del_ins(self): """ @@ -801,7 +803,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.insertion(3, 'T') - assert str(m.mutated) == str(Seq('ACTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACTGATCG')) def test_adjecent_ins_sub_1(self): """ @@ -811,7 +813,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATAGGATCG')) def test_adjecent_ins_sub_2(self): """ @@ -821,7 +823,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGACGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGACGATCG')) def test_near_adjecent_ins_sub(self): """ @@ -831,7 +833,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'A') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATACTATCG')) + assert unicode(m.mutated) == unicode(Seq('ATACTATCG')) def test_adjecent_largeins_sub_1(self): """ @@ -841,7 +843,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('ATATCGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGGGATCG')) def test_adjecent_largeins_sub_2(self): """ @@ -851,7 +853,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGATCGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCGCGATCG')) def test_near_adjecent_largeins_sub(self): """ @@ -861,7 +863,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'ATCG') m.substitution(4, 'T') - assert str(m.mutated) == str(Seq('ATATCGCTATCG')) + assert unicode(m.mutated) == unicode(Seq('ATATCGCTATCG')) def test_adjecent_del_del_1(self): """ @@ -870,7 +872,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_del_del_2(self): """ @@ -879,7 +881,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGATCG')) def test_adjecent_delins_snp_1(self): """ @@ -888,7 +890,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.substitution(3, 'G') - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_delins_snp_2(self): """ @@ -897,7 +899,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGAGATCG')) def 
test_adjecent_largedelins_eq_snp_1(self): """ @@ -907,7 +909,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGG')) def test_adjecent_largedelins_min_snp_1(self): """ @@ -917,7 +919,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGG')) def test_adjecent_largedelins_plus_snp_1(self): """ @@ -927,7 +929,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.substitution(7, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGG')) def test_adjecent_largedelins_eq_snp_2(self): """ @@ -937,7 +939,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAG')) def test_adjecent_largedelins_min_snp_2(self): """ @@ -947,7 +949,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAG')) def test_adjecent_largedelins_plus_snp_2(self): """ @@ -957,7 +959,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.substitution(2, 'G') - assert str(m.mutated) == str(Seq('AGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AGAAAAAAAG')) def test_adjecent_delins_del_1(self): """ @@ -966,7 +968,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.deletion(3, 3) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_delins_del_2(self): """ @@ -975,7 +977,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_largedelins_eq_del_1(self): """ @@ -985,7 +987,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_1(self): """ @@ -995,7 +997,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def test_adjecent_largedelins_plus_del_1(self): """ @@ -1005,7 +1007,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.deletion(7, 7) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) def test_adjecent_largedelins_eq_del_2(self): """ @@ -1015,7 +1017,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAG')) def test_adjecent_largedelins_min_del_2(self): """ @@ -1025,7 +1027,7 @@ class TestMutator(MutalyzerTest): m = 
self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAG')) def test_adjecent_largedelins_plus_del_2(self): """ @@ -1035,7 +1037,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.deletion(2, 2) - assert str(m.mutated) == str(Seq('AAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAG')) def test_adjectent_delins_ins_1(self): """ @@ -1044,7 +1046,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 2, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) def test_adjectent_delins_ins_2(self): """ @@ -1053,7 +1055,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGAGATCG')) def test_adjectent_largedelins_eq_ins_1(self): """ @@ -1062,7 +1064,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAGCG')) def test_adjectent_largedelins_min_ins_1(self): """ @@ -1071,7 +1073,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAGCG')) def test_adjectent_largedelins_plus_ins_1(self): """ @@ -1080,7 +1082,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.insertion(6, 'G') - assert str(m.mutated) == str(Seq('AAAAAAAAGCG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAAGCG')) def test_adjectent_largedelins_eq_ins_2(self): """ @@ -1089,7 +1091,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAG')) def test_adjectent_largedelins_min_ins_2(self): """ @@ -1098,7 +1100,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAG')) def test_adjectent_largedelins_plus_ins_2(self): """ @@ -1107,7 +1109,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ATGAAAAAAAG')) def test_adjectent_delins_del_delins(self): """ @@ -1116,7 +1118,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 3, 'A') m.delins(4, 4, 'T') - assert str(m.mutated) == str(Seq('AATATCG')) + assert unicode(m.mutated) == unicode(Seq('AATATCG')) def test_adjectent_largedelins_plus_delins_1(self): """ @@ -1125,7 +1127,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAAAAAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAAAAAATG')) + assert unicode(m.mutated) == unicode(Seq('AAAAAAAATG')) def test_adjectent_largedelins_plus_delins_2(self): """ @@ -1134,7 +1136,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAAAAAA') m.delins(2, 2, 'C') - 
assert str(m.mutated) == str(Seq('ACAAAAAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAAAAAG')) def test_adjectent_largedelins_min_delins_1(self): """ @@ -1143,7 +1145,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(2, 6, 'AAA') m.delins(7, 7, 'T') - assert str(m.mutated) == str(Seq('AAAATG')) + assert unicode(m.mutated) == unicode(Seq('AAAATG')) def test_adjectent_largedelins_min_delins_2(self): """ @@ -1152,7 +1154,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.delins(3, 7, 'AAA') m.delins(2, 2, 'C') - assert str(m.mutated) == str(Seq('ACAAAG')) + assert unicode(m.mutated) == unicode(Seq('ACAAAG')) def test_adjectent_del_dup_1(self): """ @@ -1161,7 +1163,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ACCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ACCGATCG')) def test_adjectent_del_dup_2(self): """ @@ -1170,7 +1172,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTGATCG')) def test_adjectent_ins_dup_1(self): """ @@ -1179,7 +1181,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATGCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCCGATCG')) def test_adjectent_ins_dup_2(self): """ @@ -1188,7 +1190,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTGCGATCG')) def test_adjectent_ins_ins_1(self): """ @@ -1197,7 +1199,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(3, 'A') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_adjectent_ins_ins_2(self): """ @@ -1206,7 +1208,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(3, 'A') m.insertion(2, 'G') - assert str(m.mutated) == str(Seq('ATGCAGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGCAGATCG')) def test_ins_ins(self): """ @@ -1215,7 +1217,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.insertion(2, 'A') - assert str(m.mutated) in (str(Seq('ATGACGATCG')), str(Seq('ATAGCGATCG'))) + assert unicode(m.mutated) in (unicode(Seq('ATGACGATCG')), unicode(Seq('ATAGCGATCG'))) def test_adjecent_inv_inv_1(self): """ @@ -1224,7 +1226,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_inv_inv_2(self): """ @@ -1233,7 +1235,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.inversion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGGATCG')) def test_adjecent_dup_dup_1(self): """ @@ -1242,7 +1244,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(2, 2) m.duplication(3, 3) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTCCGATCG')) def test_adjecent_dup_dup_2(self): """ @@ -1251,7 +1253,7 @@ class 
TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.duplication(3, 3) m.duplication(2, 2) - assert str(m.mutated) == str(Seq('ATTCCGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATTCCGATCG')) def test_adjecent_del_inv_1(self): """ @@ -1260,7 +1262,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(2, 2) m.inversion(3, 3) - assert str(m.mutated) == str(Seq('AGGATCG')) + assert unicode(m.mutated) == unicode(Seq('AGGATCG')) def test_adjecent_del_inv_2(self): """ @@ -1269,7 +1271,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.deletion(3, 3) m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGATCG')) def test_adjecent_ins_inv_1(self): """ @@ -1278,7 +1280,7 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(3, 3) - assert str(m.mutated) == str(Seq('ATGGGATCG')) + assert unicode(m.mutated) == unicode(Seq('ATGGGATCG')) def test_adjecent_ins_inv_2(self): """ @@ -1287,4 +1289,4 @@ class TestMutator(MutalyzerTest): m = self._mutator(Seq('ATCGATCG')) m.insertion(2, 'G') m.inversion(2, 2) - assert str(m.mutated) == str(Seq('AAGCGATCG')) + assert unicode(m.mutated) == unicode(Seq('AAGCGATCG')) diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index 7640c496af9aef1c871fa313b69e2ef836d1aace..f04b883971617ee9885ba3478bd667c674189650 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -3,6 +3,8 @@ Tests for the mutalyzer.parsers.genbank module. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.parsers import genbank diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index fc5e4abe498469e83f4986b40fb68967fd165d86..6f0b4c4ef123aad85abf8af919ecb792d216386c 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,9 +3,11 @@ Tests for the Scheduler module. """ +from __future__ import unicode_literals + import bz2 import os -import StringIO +import io #import logging; logging.basicConfig() from Bio import Entrez @@ -33,7 +35,7 @@ class TestScheduler(MutalyzerTest): file_instance = File.File(output.Output('test')) scheduler = Scheduler.Scheduler() - batch_file = StringIO.StringIO('\n'.join(variants) + '\n') + batch_file = io.BytesIO(('\n'.join(variants) + '\n').encode('utf-8')) job, columns = file_instance.parseBatchFile(batch_file) result_id = scheduler.addJob('test@test.test', job, columns, job_type, argument=argument) diff --git a/tests/test_services_json.py b/tests/test_services_json.py index ce029ba764fab2c7cadd84ea730671abca41cca4..8df9b7485ae9c642a0615453ba7e79e882797cb4 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -3,6 +3,8 @@ Tests for the JSON interface to Mutalyzer. """ +from __future__ import unicode_literals + import simplejson as json from spyne.server.null import NullServer import mutalyzer @@ -77,7 +79,7 @@ class TestServicesJson(MutalyzerTest): Running the info method should give us some version information. 
""" r = self._call('info') - assert type(r['versionParts']) == list + assert isinstance(r['versionParts'], list) assert r['version'] == mutalyzer.__version__ def test_info_announcement(self): @@ -86,12 +88,12 @@ class TestServicesJson(MutalyzerTest): """ announce.set_announcement('Test announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'Test announcement' announce.set_announcement('New announcement') r = self._call('info') - assert type(r['announcement']) == str + assert isinstance(r['announcement'], unicode) assert r['announcement'] == 'New announcement' announce.unset_announcement() diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index cc1ce8c00320164293fb03ac66f662b6e454941c..0882c9fbce7e09365b0c235805ac03e519e75dd9 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -3,6 +3,8 @@ Tests for the SOAP interface to Mutalyzer. """ +from __future__ import unicode_literals + import bz2 import datetime import logging @@ -539,8 +541,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' #.encode('base64') - result = self._call('submitBatchJob', data, 'NameChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -564,8 +566,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\n'.join(variants) + '\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -586,8 +588,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r'.join(variants) + '\r' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -608,8 +610,8 @@ class TestServicesSoap(MutalyzerTest): 'AL449423.14(CDKN2A_v002):c.5_400del'] data = '\r\n'.join(variants) + '\r\n' - result = self._call('submitBatchJob', data, 'SyntaxChecker') - job_id = str(result) + result = self._call('submitBatchJob', data.encode('utf-8'), 'SyntaxChecker') + job_id = unicode(result) result = self._call('monitorBatchJob', job_id) assert int(result) == len(variants) @@ -640,7 +642,7 @@ facilisi.""" data += data try: - self._call('submitBatchJob', data.encode('base64'), 'NameChecker') + self._call('submitBatchJob', data.encode('utf-8'), 'NameChecker') assert False except Fault as e: # - senv:Client.RequestTooLong: Raised by Spyne, depending on @@ -661,7 +663,7 @@ facilisi.""" data = f.read() result = self._call('uploadGenBankLocalFile', data) - ud = str(result) + ud = unicode(result) r = self._call('runMutalyzer', ud + '(SDHD):g.7872G>T') assert r.errors == 0 diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 1b30786b27730bdc91ac5b39785c0f6fa9625d28..8c19421a9f0b8c891908b316d162a007b3d2733b 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -3,6 +3,8 @@ Tests for the variantchecker module. 
""" +from __future__ import unicode_literals + #import logging; logging.basicConfig() from mutalyzer.output import Output diff --git a/tests/test_website.py b/tests/test_website.py index e579433a18f321a2fb2784530b8381111bc9b3e6..c649925e411dc091e172fddc82b5b64550b0f0b2 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -5,6 +5,8 @@ Tests for the WSGI interface to Mutalyzer. """ +from __future__ import unicode_literals + #import logging; logging.basicConfig() import bz2 import cgi @@ -12,7 +14,7 @@ import logging from mock import patch import os import re -from StringIO import StringIO +from io import BytesIO import time import urllib import urllib2 @@ -264,7 +266,7 @@ class TestWebsite(MutalyzerTest): """ data = {'job_type': job_type, 'email': 'test@test.test', - 'file': (StringIO(file), 'test.txt')} + 'file': (BytesIO(file.encode('utf-8')), 'test.txt')} if assembly_name_or_alias is not None: data['assembly_name_or_alias'] = assembly_name_or_alias @@ -510,7 +512,7 @@ class TestWebsite(MutalyzerTest): Download a C# example client for the web service. """ r = self.app.get('/downloads/client-mono.cs') - assert r.headers['Content-Type'] == 'text/plain' + assert 'text/plain' in r.headers['Content-Type'] assert 'public static void Main(String [] args) {' in r.data def test_download_php(self): @@ -634,7 +636,7 @@ class TestWebsite(MutalyzerTest): 'build': 'hg19', 'acc': 'NM_203473.1'}) assert 'text/plain' in r.headers['Content-Type'] - assert r.content_type == 'text/plain' + assert 'text/plain' in r.content_type expected = '\n'.join(['-158', '1709', '1371']) assert r.data == expected @@ -678,7 +680,7 @@ class TestWebsite(MutalyzerTest): """ r = self.app.post('/reference-loader', data={'method': 'upload', - 'file': (StringIO('this is not a genbank file'), 'AB026906.1.gb')}) + 'file': (BytesIO('this is not a genbank file'.encode('utf-8')), 'AB026906.1.gb')}) assert 'Your reference sequence was loaded successfully.' not in r.data assert 'The file could not be parsed.' in r.data diff --git a/tests/utils.py b/tests/utils.py index befa5d72859279140211ad412fa2920fce8961d6..f9cfce8bb44a2ce0e7bd09d9951e92d6b8ea1c34 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,6 +3,8 @@ Utilities for unit tests. """ +from __future__ import unicode_literals + from functools import wraps import os import shutil