From ebca9a785d64f5c48f7ae463f1622e165ed752b7 Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Sat, 28 Mar 2015 22:12:06 +0100
Subject: [PATCH] Unicode fixes

---
 mutalyzer/describe.py              | 12 ++++----
 mutalyzer/entrypoints/mutalyzer.py |  2 +-
 mutalyzer/extractor_loader.py      |  2 ++
 mutalyzer/test.py                  | 48 ++++++++++++++++--------------
 mutalyzer/util.py                  |  4 +--
 mutalyzer/variant.py               | 38 ++++++++++++-----------
 mutalyzer/website/views.py         |  6 ++--
 7 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py
index 82c8d6c4..09c133d8 100644
--- a/mutalyzer/describe.py
+++ b/mutalyzer/describe.py
@@ -10,7 +10,6 @@ from __future__ import unicode_literals
 
 import collections
 
-from Bio.SeqUtils import seq3
 from Bio.Data import CodonTable
 
 from mutalyzer.util import palinsnoop, roll
@@ -302,7 +301,8 @@ def describe_dna(s1, s2):
     description = Allele()
     in_transposition = 0
 
-    extracted = extractor.extract(unicode(s1), len(s1), unicode(s2), len(s2), 0)
+    extracted = extractor.extract(s1.encode('utf-8'), len(s1),
+                                  s2.encode('utf-8'), len(s2), 0)
     for variant in extracted.variants:
        # print (variant.type, variant.reference_start,
        #     variant.reference_end, variant.sample_start,
@@ -354,9 +354,9 @@ def describe_protein(s1, s2):
     Give an allele description of the change from {s1} to {s2}.
 
     :arg s1: Sequence 1.
-    :type s1: str
+    :type s1: unicode
     :arg s2: Sequence 2.
-    :type s2: str
+    :type s2: unicode
 
     :returns: A list of RawVar objects, representing the allele.
     :rtype: list(RawVar)
@@ -384,8 +384,8 @@ def describe_protein(s1, s2):
 
     s1_part = s1
     s2_part = s2
-    for variant in extractor.extract(str(s1_part), len(s1_part),
-            str(s2_part), len(s2_part), 1):
+    for variant in extractor.extract(s1_part.encode('utf-8'), len(s1_part),
+                                     s2_part.encode('utf-8'), len(s2_part), 1):
         description.append(var_to_rawvar(s1, s2, variant,
             container=ProteinVar))
 
diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py
index a548376e..08864546 100644
--- a/mutalyzer/entrypoints/mutalyzer.py
+++ b/mutalyzer/entrypoints/mutalyzer.py
@@ -19,7 +19,7 @@ from .. import variantchecker
 class MyEncoder(json.JSONEncoder):
     def default(self, o):
         json_object = o.__dict__
-        json_object.update({"hgvs": str(o), "weight": o.weight()})
+        json_object.update({"hgvs": unicode(o), "weight": o.weight()})
 
         return json_object
     #default
diff --git a/mutalyzer/extractor_loader.py b/mutalyzer/extractor_loader.py
index 81f43cce..ee8b097e 100644
--- a/mutalyzer/extractor_loader.py
+++ b/mutalyzer/extractor_loader.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 
+from __future__ import unicode_literals
+
 import sys
 
 import json
diff --git a/mutalyzer/test.py b/mutalyzer/test.py
index acdca95d..f733aec8 100644
--- a/mutalyzer/test.py
+++ b/mutalyzer/test.py
@@ -1,23 +1,25 @@
-#!/usr/bin/env python
-
-import json
-
-import describe
-
-class MyEncoder(json.JSONEncoder):
-    def default(self, o):
-        return o.__dict__
-
-def main():
-    ref = "ACGTCGATTCGCTAGCTTCGGGGGATAGATAGAGATATAGAGATATTTTT"
-    alt = "ACGTCGGTTCGCTAGCTTCGGGGGATAGATAGATATATAGAGATATTTTT"
-
-    extracted_allele = describe.describe_dna(ref, alt)
-
-    print extracted_allele
-    print json.dumps({"reference_sequence": ref, "sample_sequence": alt,
-        "allele_description": extracted_allele}, cls=MyEncoder)
-#main
-
-if __name__ == "__main__":
-    main()
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+import json
+
+import describe
+
+class MyEncoder(json.JSONEncoder):  # JSON encoder with an attribute-dict fallback
+    def default(self, o):
+        return o.__dict__  # represent the object by its attribute dictionary
+
+def main():  # Smoke test: describe one hard-coded DNA pair and print the result.
+    ref = "ACGTCGATTCGCTAGCTTCGGGGGATAGATAGAGATATAGAGATATTTTT"
+    alt = "ACGTCGGTTCGCTAGCTTCGGGGGATAGATAGATATATAGAGATATTTTT"
+
+    extracted_allele = describe.describe_dna(ref, alt)
+    # The description is rendered by __unicode__ (no __str__), so use unicode().
+    print unicode(extracted_allele)
+    print json.dumps({"reference_sequence": ref, "sample_sequence": alt,
+        "allele_description": extracted_allele}, cls=MyEncoder)
+#main
+
+if __name__ == "__main__":
+    main()
diff --git a/mutalyzer/util.py b/mutalyzer/util.py
index 6b7987b3..63f916d3 100644
--- a/mutalyzer/util.py
+++ b/mutalyzer/util.py
@@ -434,7 +434,7 @@ def longest_common_suffix(s1, s2):
     @type s2: unicode
 
     @return: The longest common suffix of s1 and s2.
-    @rtype: string
+    @rtype: unicode
     """
     return longest_common_prefix(s1[::-1], s2[::-1])[::-1]
 #longest_common_suffix
@@ -680,7 +680,7 @@ def visualise_sequence(sequence, max_length=25, flank_size=6):
     @type flank_size: int
 
     @return: Either the original sequence, or an abbreviation of it.
-    @rtype: str
+    @rtype: unicode
     """
     if len(sequence) > max_length:
         return '%s [%ibp] %s' % (sequence[:flank_size],
diff --git a/mutalyzer/variant.py b/mutalyzer/variant.py
index a8e512a2..ab7a5e85 100644
--- a/mutalyzer/variant.py
+++ b/mutalyzer/variant.py
@@ -1,6 +1,10 @@
 """
 """
 
+from __future__ import unicode_literals
+
+from Bio.SeqUtils import seq3
+
 from extractor import extractor
 
 from mutalyzer import models
@@ -18,11 +22,11 @@ class HGVSList(list):
     """
     Container for a list of sequences or variants.
     """
-    def __str__(self):
+    def __unicode__(self):
         if len(self) > 1:
-            return "[{}]".format(';'.join(map(str, self)))
-        return str(self[0])
-    #__str__
+            return "[{}]".format(';'.join(map(unicode, self)))
+        return unicode(self[0])
+    #__unicode__
 
     def weight(self):
         weight = sum(map(lambda x: x.weight(), self))
@@ -47,7 +51,7 @@ class ISeq(object):
             weight_position=1):
         """
         :arg sequence: Literal inserted sequence.
-        :type sequence: str
+        :type sequence: unicode
         :arg start: Start position for a transposed sequence.
         :type start: int
         :arg end: End position for a transposed sequence.
@@ -66,7 +70,7 @@ class ISeq(object):
             self.type = "ins"
     #__init__
 
-    def __str__(self):
+    def __unicode__(self):
         if self.type == "ins":
             return self.sequence
 
@@ -75,7 +79,7 @@ class ISeq(object):
 
         inverted = "inv" if self.reverse else ""
         return "{}_{}{}".format(self.start, self.end, inverted)
-    #__str__
+    #__unicode__
 
     def __nonzero__(self):
          return bool(self.sequence)
@@ -118,9 +122,9 @@ class DNAVar(models.DNAVar):
         :arg sample_end_offset:
         :type sample_end_offset: int
         :arg type: Variant type.
-        :type type: str
+        :type type: unicode
         :arg deleted: Deleted part of the reference sequence.
-        :type deleted: str
+        :type deleted: unicode
         :arg inserted: Inserted part.
         :type inserted: object
         :arg shift: Amount of freedom.
@@ -143,12 +147,12 @@ class DNAVar(models.DNAVar):
         self.shift = shift
     #__init__
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Give the HGVS description of the raw variant stored in this class.
 
         :returns: The HGVS description of the raw variant stored in this class.
-        :rtype: str
+        :rtype: unicode
         """
         if self.type == "unknown":
             return "?"
@@ -169,7 +173,7 @@ class DNAVar(models.DNAVar):
         #if
 
         return description + "{}>{}".format(self.deleted, self.inserted)
-    #__str__
+    #__unicode__
 
     def weight(self):
         if self.type == "unknown":
@@ -204,9 +208,9 @@ class ProteinVar(models.ProteinVar):
         :arg sample_end: End position.
         :type sample_end: int
         :arg type: Variant type.
-        :type type: str
+        :type type: unicode
         :arg deleted: Deleted part of the reference sequence.
-        :type deleted: str
+        :type deleted: unicode
         :arg inserted: Inserted part.
         :type inserted: object
         :arg shift: Amount of freedom.
@@ -225,7 +229,7 @@ class ProteinVar(models.ProteinVar):
         self.term = term
     #__init__
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Give the HGVS description of the raw variant stored in this class.
 
@@ -233,7 +237,7 @@ class ProteinVar(models.ProteinVar):
         correct description. Also see the comment in the class definition.
 
         :returns: The HGVS description of the raw variant stored in this class.
-        :rtype: str
+        :rtype: unicode
         """
         if self.type == "unknown":
             return "?"
@@ -262,5 +266,5 @@ class ProteinVar(models.ProteinVar):
         if self.term:
             return description + "fs*{}".format(self.term)
         return description
-    #__str__
+    #__unicode__
 #ProteinVar
diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py
index 5bb2ae35..41db5354 100644
--- a/mutalyzer/website/views.py
+++ b/mutalyzer/website/views.py
@@ -278,9 +278,9 @@ def name_checker():
 
         extracted = extractedProt = '(skipped)'
         if allele:
-            extracted = str(allele) #describe.allele_description(allele)
+            extracted = unicode(allele) #describe.allele_description(allele)
         if prot_allele:
-            extractedProt = str(prot_allele) #describe.allele_description(prot_allele)
+            extractedProt = unicode(prot_allele) #describe.allele_description(prot_allele)
 
     else:
         extracted = extractedProt = ''
@@ -700,7 +700,7 @@ def description_extractor():
                           'Variant sequence is not DNA.')
 
     raw_vars = describe.describe_dna(reference_sequence, variant_sequence)
-    description = str(raw_vars) #describe.allele_description(raw_vars)
+    description = unicode(raw_vars) #describe.allele_description(raw_vars)
 
     errors, warnings, summary = output.Summary()
     messages = map(util.message_info, output.getMessages())
-- 
GitLab