Commit 3d731525 authored by Vermaat's avatar Vermaat
Browse files

Support Python 3.3 and 3.4

This fixes issues related to different SWIG handling of unicode strings
between Python 2 and Python 3.

We also provide convenient package-level imports for the public API
functions and clean up packaging.
parent 396973f5
......@@ -2,6 +2,8 @@
language: python
python:
- "2.7"
- "3.3"
- "3.4"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y swig
......
......@@ -8,34 +8,22 @@ Copyright (c) 2013 Jonathan K. Vis <jvis@liacs.nl>
Licensed under the MIT license, see the LICENSE file.
"""
# On the event of a new release, we update the __version_info__ package
# global and set RELEASE to True.
# Before a release, a development version is denoted by a __version_info__
# ending with a 'dev' item and RELEASE is set to False.
#
# We follow a versioning scheme compatible with setuptools [1] where the
# __version_info__ variable always contains the version of the upcoming
# release (and not that of the previous release), post-fixed with a 'dev'
# item. Only in a release commit, this 'dev' item is removed (and added
# again in the next commit).
#
# [1] http://peak.telecommunity.com/DevCenter/setuptools#specifying-your-project-s-version
RELEASE = False
from __future__ import (absolute_import, division, print_function,
unicode_literals)
__version_info__ = ('0', '1', 'dev')
from . import describe, extractor
__version__ = '.'.join(__version_info__)
__version_info__ = tuple(extractor.VERSION.split('.'))
__version__ = extractor.VERSION
__author__ = 'LUMC, Jonathan K. Vis'
__contact__ = 'jvis@liacs.nl'
__homepage__ = 'https://humgenprojects.lumc.nl/trac/extractor'
usage = __doc__.split("\n\n\n")
__homepage__ = 'https://github.com/LUMC/extractor'
def docSplit(func):
return func.__doc__.split("\n\n")[0]
def version(name):
return "%s version %s\n\nAuthor : %s <%s>\nHomepage : %s" % (name,
__version__, __author__, __contact__, __homepage__)
describe_dna = describe.describe_dna
describe_protein = describe.describe_protein
extract = extractor.extract
......@@ -4,12 +4,13 @@ other.
"""
from __future__ import unicode_literals
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import math
from .variant import ISeq, ISeqList, DNAVar, ProteinVar, Allele
from . import extractor
from . import extractor, util
# Taken from BioPython.
......@@ -272,14 +273,17 @@ def describe_dna(s1, s2):
description = Allele()
in_transposition = 0
extracted = extractor.extract(s1.encode('utf-8'), len(s1),
s2.encode('utf-8'), len(s2), 0)
s1_swig = util.swig_str(s1)
s2_swig = util.swig_str(s2)
extracted = extractor.extract(s1_swig[0], s1_swig[1],
s2_swig[0], s2_swig[1], 0)
for variant in extracted.variants:
# print (variant.type, variant.reference_start,
# print(variant.type, variant.reference_start,
# variant.reference_end, variant.sample_start,
# variant.sample_end, variant.transposition_start,
# variant.transposition_end)
# print (variant.type & extractor.TRANSPOSITION_OPEN, variant.type &
# print(variant.type & extractor.TRANSPOSITION_OPEN, variant.type &
# extractor.TRANSPOSITION_CLOSE)
if variant.type & extractor.TRANSPOSITION_OPEN:
......
"""
General utility definitions.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
PY2 = sys.version_info[0] == 2
# From BioPython.
# One-letter -> three-letter amino acid codes (20 entries).
protein_letters_1to3 = {
    'A': 'Ala',
    'C': 'Cys',
    'D': 'Asp',
    'E': 'Glu',
    'F': 'Phe',
    'G': 'Gly',
    'H': 'His',
    'I': 'Ile',
    'K': 'Lys',
    'L': 'Leu',
    'M': 'Met',
    'N': 'Asn',
    'P': 'Pro',
    'Q': 'Gln',
    'R': 'Arg',
    'S': 'Ser',
    'T': 'Thr',
    'V': 'Val',
    'W': 'Trp',
    'Y': 'Tyr',
}

# The table above plus the additional codes B, X, Z, J, U and O. A copy is
# extended so that `protein_letters_1to3` itself is left untouched.
protein_letters_1to3_extended = dict(protein_letters_1to3)
protein_letters_1to3_extended.update({
    'B': 'Asx',
    'X': 'Xaa',
    'Z': 'Glx',
    'J': 'Xle',
    'U': 'Sel',
    'O': 'Pyl',
})
# From BioPython.
def seq3(seq, custom_map=None, undef_code='Xaa'):
    """Turn a one letter code protein sequence into one with three letter codes.

    Every item obtained by iterating over `seq` is looked up in the extended
    one-letter to three-letter table (which includes B for "Asx", J for
    "Xle", X for "Xaa", Z for "Glx", U for "Sel" and O for "Pyl"), overlaid
    with `custom_map`. Anything not found there (including possible gap
    characters) is replaced by `undef_code`.

    :arg seq: Protein sequence in one-letter codes; any iterable yielding
        single letters, e.g. a string.
    :arg custom_map: Extra one-letter to replacement entries that take
        precedence over the built-in table. Defaults to ``{'*': 'Ter'}``,
        i.e. a terminator given as an asterisk becomes "Ter". A map passed
        explicitly replaces this default as a whole, so it must contain
        ``'*'`` itself if the terminator should still be translated.
    :arg undef_code: Replacement for letters not present in the table.

    :return: The concatenated three-letter codes as a single string.

    e.g.

    >>> from extractor.util import seq3
    >>> seq3("MAIVMGRWKGAR*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'

    You can set a custom translation of the codon termination code using the
    "custom_map" argument, e.g.

    >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'

    You can also set a custom translation for non-amino acid characters, such
    as '-', using the "undef_code" argument, e.g.

    >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
    'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'

    If not given, "undef_code" defaults to "Xaa", e.g.

    >>> seq3("MAIVMGRWKGA--R*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'

    This function was inspired by BioPerl's seq3.
    """
    # A None sentinel instead of a dict literal in the signature, so no
    # mutable object is shared between calls.
    if custom_map is None:
        custom_map = {'*': 'Ter'}

    # Work on a copy rather than calling .update() on the module-level table,
    # to preserve its initial state (it may be imported in other modules).
    threecode = dict(protein_letters_1to3_extended)
    threecode.update(custom_map)

    # Note: with the default undef_code this maps '-' to 'Xaa', which may be
    # undesirable.
    return ''.join(threecode.get(aa, undef_code) for aa in seq)
def swig_str(s, ascii_only=True):
    """
    Given a unicode string, return a `(value, length)` pair where `value` is
    the representation to hand to SWIG and `length` is the length to pass
    along with it.

    On Python 2 the string is encoded as UTF-8 and the pair is the encoded
    byte string with its length. On Python 3 the string itself is returned;
    the length is `len(s)` by default, or the length of the UTF-8 encoding
    when `ascii_only=False`.

    Unless `ascii_only=False`, the string must contain only characters in
    the ASCII range. This is assumed, not checked.

    Background (as noted by the original author): SWIG encodes unicode
    strings automatically as UTF-8 on Python 3 (the `str` type), while it
    doesn't do so on Python 2 (the `unicode` type). So we have to encode
    ourselves on Python 2. The SWIG documentation doesn't really discuss
    this.

    To correctly calculate the length of the resulting *char value we would
    also have to encode to UTF-8 on Python 3. Since that means encoding is
    done twice (once here for the length and once by SWIG), we instead
    assume all characters are in the ASCII range so the length of the
    unicode string can be used directly. Pass `ascii_only=False` to drop
    this assumption.

    http://www.swig.org/Doc2.0/SWIGDocumentation.html#Python_nn49
    https://github.com/swig/swig/blob/master/Lib/python/pystrings.swg
    http://comments.gmane.org/gmane.comp.programming.swig.devel/23268
    """
    if PY2:
        # SWIG will not encode for us here, so pass the bytes.
        encoded = s.encode('utf-8')
        return encoded, len(encoded)

    if ascii_only:
        # One character is one byte, so no encoding pass is needed.
        return s, len(s)

    # SWIG encodes `s` itself; we only encode to learn the byte length.
    return s, len(s.encode('utf-8'))
#: Python 3 behaviour for `str` on both Python 2 and 3: this name is bound to
#: `unicode` on Python 2 and to the builtin `str` otherwise. The name
#: `unicode` is only evaluated when `PY2` is true.
str = unicode if PY2 else str
def python_2_unicode_compatible(cls):
    """
    Class decorator for classes whose `__str__` method returns unicode text.

    Under Python 2 the existing `__str__` is installed as `__unicode__`, and
    `__str__` is replaced by a method returning the UTF-8 encoding of
    `__unicode__()`. Under Python 3 the class is returned unchanged.

    To support Python 2 and 3 with a single code base, define a `__str__`
    method returning unicode text and apply this decorator to the class.

    The implementation comes from django.utils.encoding.
    """
    if not PY2:
        return cls

    def _encoded_str(self):
        # Looked up through the instance at call time.
        return self.__unicode__().encode('utf-8')

    cls.__unicode__ = cls.__str__
    cls.__str__ = _encoded_str
    return cls
......@@ -2,71 +2,12 @@
Models for the description extractor.
"""
from __future__ import unicode_literals
from . import extractor
# From BioPython.
protein_letters_1to3 = {
'A': 'Ala', 'C': 'Cys', 'D': 'Asp',
'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His',
'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met',
'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg',
'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp',
'Y': 'Tyr',
}
protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({
'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle',
'U': 'Sel', 'O': 'Pyl',
}.items()))
def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
"""Turn a one letter code protein sequence into one with three letter codes.
The single input argument 'seq' should be a protein sequence using single
letter codes, either as a python string or as a Seq or MutableSeq object.
This function returns the amino acid sequence as a string using the three
letter amino acid codes. Output follows the IUPAC standard (including
ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
Any unknown character (including possible gap characters), is changed into
'Xaa'.
e.g.
from __future__ import (absolute_import, division, print_function,
unicode_literals)
>>> from Bio.SeqUtils import seq3
>>> seq3("MAIVMGRWKGAR*")
'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
You can set a custom translation of the codon termination code using the
"custom_map" argument, e.g.
>>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'
You can also set a custom translation for non-amino acid characters, such
as '-', using the "undef_code" argument, e.g.
>>> seq3("MAIVMGRWKGA--R*", undef_code='---')
'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'
If not given, "undef_code" defaults to "Xaa", e.g.
>>> seq3("MAIVMGRWKGA--R*")
'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'
This function was inspired by BioPerl's seq3.
"""
# not doing .update() on IUPACData dict with custom_map dict
# to preserve its initial state (may be imported in other modules)
threecode = dict(list(protein_letters_1to3_extended.items()) +
list(custom_map.items()))
#We use a default of 'Xaa' for undefined letters
#Note this will map '-' to 'Xaa' which may be undesirable!
return ''.join(threecode.get(aa, undef_code) for aa in seq)
from . import extractor
from extractor.util import python_2_unicode_compatible, seq3, str
WEIGHTS = {
......@@ -79,6 +20,7 @@ WEIGHTS = {
}
@python_2_unicode_compatible
class HGVSList(object):
"""
Container for a list of sequences or variants.
......@@ -99,10 +41,10 @@ class HGVSList(object):
return self.__bool__()
def __unicode__(self):
def __str__(self):
if len(self.items) > 1:
return '[{}]'.format(';'.join(map(unicode, self.items)))
return unicode(self.items[0])
return '[{}]'.format(';'.join(map(str, self.items)))
return str(self.items[0])
def append(self, item):
......@@ -125,6 +67,7 @@ class ISeqList(HGVSList):
pass
@python_2_unicode_compatible
class ISeq(object):
"""
Container for an inserted sequence.
......@@ -150,7 +93,7 @@ class ISeq(object):
self.type = 'ins'
def __unicode__(self):
def __str__(self):
if self.type == 'ins':
return self.sequence
......@@ -178,6 +121,7 @@ class ISeq(object):
inverse_weight)
@python_2_unicode_compatible
class DNAVar(object):
"""
Container for a DNA variant.
......@@ -219,7 +163,7 @@ class DNAVar(object):
self.shift = shift
def __unicode__(self):
def __str__(self):
"""
Give the HGVS description of the raw variant stored in this class.
......@@ -259,6 +203,7 @@ class DNAVar(object):
return weight + WEIGHTS[self.type] + self.inserted.weight()
@python_2_unicode_compatible
class ProteinVar(object):
"""
Container for a protein variant.
......@@ -292,7 +237,7 @@ class ProteinVar(object):
self.term = term
def __unicode__(self):
def __str__(self):
"""
Give the HGVS description of the raw variant stored in this class.
......
......@@ -5,7 +5,7 @@ _build.sub_commands = [
('build_py', _build.has_pure_modules),
('build_clib', _build.has_c_libraries),
('build_scripts', _build.has_scripts)
]
]
#from setuptools.command.bdist_egg import bdist_egg
#old_run = bdist_egg.run
......@@ -16,29 +16,54 @@ _build.sub_commands = [
#
#bdist_egg.run = run
import os
import sys
from setuptools import setup
from distutils.core import Extension
if sys.version_info < (2, 6):
raise Exception('extractor requires Python 2.6 or higher.')
if sys.version_info < (2, 7):
raise Exception('extractor requires Python 2.7 or higher.')
# Todo: How does this play with pip freeze requirement files?
requires = []
import extractor as distmeta
# This is quite the hack, but we don't want to import our package from here
# since that's a recipe for disaster (it might have some uninstalled
# dependencies, or we might import another already installed version).
#
# Instead, collect simple `name = value` assignments from the package's
# __init__.py. Lines that do not split into exactly two parts on '=' are
# skipped; surrounding quotes are removed from the value.
distmeta = {}
with open(os.path.join('extractor', '__init__.py')) as init_file:
    for line in init_file:
        try:
            field, value = (x.strip() for x in line.split('='))
        except ValueError:
            continue
        distmeta[field] = value.strip('\'"')

# The __version__ value is actually defined in extractor.h.
with open(os.path.join('extractor', 'extractor.h')) as header_file:
    for line in header_file:
        if ' VERSION = ' in line:
            # Lines read from a file keep their line ending, so whitespace
            # characters must be in the strip set too; otherwise the trailing
            # newline shields the closing quote and semicolon from strip().
            version = line.split('=')[-1].strip('\'"; \t\r\n')
            distmeta['__version__'] = version
            distmeta['__version_info__'] = tuple(version.split('.'))
            break

# Use the readme as long description, falling back to a pointer to the
# homepage when the readme cannot be read.
try:
    with open('readme.md') as readme:
        long_description = readme.read()
except IOError:
    long_description = 'See ' + distmeta['__homepage__']
setup(
name='extractor',
ext_modules=[Extension('_extractor', ['extractor/extractor.i',
'extractor/extractor.cc'], swig_opts=['-c++'])],
py_modules=['extractor.extractor'],
version=distmeta.__version__,
description=distmeta.usage[0],
long_description=distmeta.__doc__,
author=distmeta.__author__,
author_email=distmeta.__contact__,
url=distmeta.__homepage__,
version=distmeta['__version__'],
description='Extract a list of differences between two sequences',
long_description=long_description,
author=distmeta['__author__'],
author_email=distmeta['__contact__'],
url=distmeta['__homepage__'],
license='MIT License',
platforms=['any'],
packages=['extractor'],
......@@ -53,6 +78,12 @@ setup(
'Intended Audience :: Developers',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: C++',
'Topic :: Scientific/Engineering',
],
......
......@@ -3,13 +3,11 @@ Tests for the mutalyzer.describe module.
"""
from __future__ import unicode_literals
from __future__ import (absolute_import, division, print_function,
unicode_literals)
#import logging; logging.basicConfig()
import os
import extractor
from extractor import describe
from extractor import describe_dna
from extractor.util import str
class TestDescribe:
......@@ -22,7 +20,7 @@ class TestDescribe:
"""
reference = 'ACGTCGATTCGCTAGCTTCGGGGGATAGATAGAGATATAGAGAT'
result = describe.describe_dna(reference, sample)
result = describe_dna(reference, sample)
assert result[0].type == expected[0]
assert result[0].start == expected[1]
assert result[0].end == expected[2]
......@@ -30,47 +28,47 @@ class TestDescribe:
assert result[0].sample_end == expected[4]
assert result[0].deleted[0].sequence == expected[5]
assert result[0].inserted[0].sequence == expected[6]
assert unicode(result[0]) == expected[7]
assert str(result[0]) == expected[7]
def test1(self):
"""
Test 1.
"""
result = describe.describe_dna(
result = describe_dna(
'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA',
'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA')
assert unicode(result) == '[5_6insTT;17del;26A>C;35dup]'
assert str(result) == '[5_6insTT;17del;26A>C;35dup]'
def test2(self):
"""
Test 2.
"""
result = describe.describe_dna(
result = describe_dna(
'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTACTGTG',
'TAAGCACCAGGAGTCCATGAAGAAGCTGGATCCTCCCATGGAATCCCCTACTCTACTGTG')
assert unicode(result) == '[26A>C;30C>A;35G>C]'
assert str(result) == '[26A>C;30C>A;35G>C]'
def test3(self):
"""
Test 3.
"""
result = describe.describe_dna(
result = describe_dna(
'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA',
'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGGAATCCCCTACTCTA')
assert unicode(result) == '[26_29inv;30C>G]'
assert str(result) == '[26_29inv;30C>G]'
def test4(self):
"""
Test 4.
"""
result = describe.describe_dna(
result = describe_dna(
'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA',
'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGAATCCCCTACTCTA')
assert unicode(result) == '[26_29inv;30C>G;41del]'
assert str(result) == '[26_29inv;30C>G;41del]'
def test5(self):
......
......@@ -3,12 +3,17 @@ Unit tests for the extractor Python interface.
"""
from extractor import extractor
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from extractor import extractor, util
class TestExtractor:
def _test_dna(self, s1, s2, expected_variants):
extracted = extractor.extract(s1, len(s1), s2, len(s2), 0)
s1_swig = util.swig_str(s1)
s2_swig = util.swig_str(s2)
extracted = extractor.extract(s1_swig[0], s1_swig[1], s2_swig[0], s2_swig[1], 0)
assert len(extracted.variants) == len(expected_variants)
for variant, expected_variant in zip(extracted.variants, expected_variants):
for attribute, expected_value in expected_variant.items():
......
[tox]
envlist = py26,py27,py33,py34
envlist = py27,py33,py34
[testenv]
deps = pytest
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment