diff --git a/mutalyzer/entrypoints/__init__.py b/mutalyzer/entrypoints/__init__.py index 5c6d2cf615d3f891a1404e3ff3326f0424928f0d..7d95d01efa1575d1a4896bbc15e3a4f2e972b4cf 100644 --- a/mutalyzer/entrypoints/__init__.py +++ b/mutalyzer/entrypoints/__init__.py @@ -5,6 +5,7 @@ Entry points to Mutalyzer. from __future__ import unicode_literals +import locale import sys @@ -48,6 +49,8 @@ def _cli_string(argument): Decode a command line argument byte string to unicode using our best guess for the encoding (noop on unicode strings). """ + encoding = sys.stdin.encoding or locale.getpreferredencoding() + if isinstance(argument, unicode): return argument - return unicode(argument, encoding=sys.stdin.encoding) + return unicode(argument, encoding=encoding) diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py index 9b06920d96e34194303169ab793553cd2d38ce7d..e7c74178ea0560a133ce39e4d04c8ee1e647d58d 100644 --- a/mutalyzer/entrypoints/admin.py +++ b/mutalyzer/entrypoints/admin.py @@ -6,9 +6,10 @@ Command line interface to Mutalyzer administrative tools. from __future__ import unicode_literals import argparse +import codecs import json +import locale import os -import sys import alembic.command import alembic.config @@ -29,10 +30,12 @@ class UserError(Exception): pass -def add_assembly(assembly_file): +def add_assembly(assembly_file, encoding): """ Add genome assembly definition from a JSON file. """ + assembly_file = codecs.getreader(encoding)(assembly_file) + try: definition = json.load(assembly_file) except ValueError: @@ -87,10 +90,13 @@ def list_assemblies(): assembly.taxonomy_id) -def import_mapview(assembly_name_or_alias, mapview_file, group_label): +def import_mapview(assembly_name_or_alias, mapview_file, encoding, + group_label): """ Import transcript mappings from an NCBI mapview file. """ + mapview_file = codecs.getreader(encoding)(mapview_file) + try: assembly = Assembly.by_name_or_alias(assembly_name_or_alias) except NoResultFound: @@ -185,6 +191,8 @@ def main(): """ Command-line interface to Mutalyzer administrative tools. """ + default_encoding = locale.getpreferredencoding() + assembly_parser = argparse.ArgumentParser(add_help=False) assembly_parser.add_argument( '-a', '--assembly', metavar='ASSEMBLY', type=_cli_string, @@ -214,9 +222,13 @@ def main(): description=add_assembly.__doc__.split('\n\n')[0]) p.set_defaults(func=add_assembly) p.add_argument( - 'assembly_file', metavar='FILE', type=argparse.FileType('r'), + 'assembly_file', metavar='FILE', type=argparse.FileType('rb'), help='genome assembly definition JSON file (example: ' 'extras/assemblies/GRCh37.json)') + p.add_argument( + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) # Subparser 'assemblies import-mapview'. p = s.add_parser( @@ -228,8 +240,12 @@ def main(): '`sort -t $\'\\t\' -k 11,11 -k 2,2` command.') p.set_defaults(func=import_mapview) p.add_argument( - 'mapview_file', metavar='FILE', type=argparse.FileType('r'), + 'mapview_file', metavar='FILE', type=argparse.FileType('rb'), help='file from NCBI mapview (example: seq_gene.md), see note below') + p.add_argument( + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) p.add_argument( 'group_label', metavar='GROUP_LABEL', type=_cli_string, help='use only entries with this group label (example: ' diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index e5bd96db14324e8cc9de6f7df6c509f73ae03812..11e058997182252b01d75ea1b0586cb555347d18 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -883,7 +883,6 @@ def import_from_reference(assembly, reference): session.commit() -# Todo: File must be opened with the correct encoding. def import_from_mapview_file(assembly, mapview_file, group_label): """ Import transcript mappings from an NCBI mapview file.