From 666299144f811c4157ba2994dd7409956ef3dafa Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Mon, 20 Oct 2014 11:24:21 +0200
Subject: [PATCH] Unit tests for unicode strings

---
 mutalyzer/website/views.py  | 20 ++++----
 requirements.txt            |  1 +
 tests/test_scheduler.py     | 17 ++++++-
 tests/test_services_json.py | 43 +++++++++++++++++
 tests/test_services_soap.py | 42 +++++++++++++++++
 tests/test_website.py       | 94 ++++++++++++++++++++++++++++++++++---
 6 files changed, 197 insertions(+), 20 deletions(-)

diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py
index f6f228b1..84b5cf85 100644
--- a/mutalyzer/website/views.py
+++ b/mutalyzer/website/views.py
@@ -9,12 +9,11 @@ import bz2
 import os
 import pkg_resources
 import re
-from cStringIO import StringIO
 import urllib
 
 from flask import Blueprint
-from flask import (abort, current_app, jsonify, make_response, redirect,
-                   render_template, request, send_from_directory, url_for)
+from flask import (abort, jsonify, make_response, redirect, render_template,
+                   request, send_from_directory, url_for)
 import jinja2
 from lxml import etree
 from spyne.server.http import HttpBase
@@ -24,9 +23,8 @@ import mutalyzer
 from mutalyzer import (announce, describe, File, Retriever, Scheduler, stats,
                        util, variantchecker)
 from mutalyzer.config import settings
-from mutalyzer.db import session
 from mutalyzer.db.models import BATCH_JOB_TYPES
-from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem
+from mutalyzer.db.models import Assembly, BatchJob
 from mutalyzer.grammar import Grammar
 from mutalyzer.mapping import Converter
 from mutalyzer.output import Output
@@ -137,13 +135,13 @@ def soap_api():
     """
     soap_server = HttpBase(soap.application)
     soap_server.doc.wsdl11.build_interface_document(settings.SOAP_WSDL_URL)
-    wsdl_handle = StringIO(soap_server.doc.wsdl11.get_interface_document())
+    wsdl_string = soap_server.doc.wsdl11.get_interface_document()
 
-    xsl_handle = open(os.path.join(
-            pkg_resources.resource_filename('mutalyzer', 'website/templates'),
-            'wsdl-viewer.xsl'), 'r')
-    wsdl_doc = etree.parse(wsdl_handle)
-    xsl_doc = etree.parse(xsl_handle)
+    xsl_file = os.path.join(
+        pkg_resources.resource_filename('mutalyzer', 'website/templates'),
+        'wsdl-viewer.xsl')
+    wsdl_doc = etree.fromstring(wsdl_string)
+    xsl_doc = etree.parse(xsl_file)
     transform = etree.XSLT(xsl_doc)
 
     return make_response(unicode(transform(wsdl_doc)))
diff --git a/requirements.txt b/requirements.txt
index c79b98aa..63d953ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,3 +22,4 @@ alembic==0.6.3
 Sphinx==1.2.1
 sphinx-rtd-theme==0.1.5
 cchardet==0.3.5
+Werkzeug==0.9.6
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 6470eb46..791f867d 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -14,7 +14,7 @@ from Bio import Entrez
 from mock import patch
 
 from mutalyzer.config import settings
-from mutalyzer.db.models import BatchJob, BatchQueueItem
+from mutalyzer.db.models import BatchJob
 from mutalyzer import File
 from mutalyzer import output
 from mutalyzer import Scheduler
@@ -49,7 +49,8 @@ class TestScheduler(MutalyzerTest):
         assert left == 0
 
         filename = 'batch-job-%s.txt' % result_id
-        result = open(os.path.join(settings.CACHE_DIR, filename))
+        result = io.open(os.path.join(settings.CACHE_DIR, filename),
+                         encoding='utf-8')
 
         next(result) # Header.
         assert expected == [line.strip().split('\t') for line in result]
@@ -320,3 +321,15 @@ class TestScheduler(MutalyzerTest):
         file_instance = File.File(output.Output('test'))
         job, columns = file_instance.parseBatchFile(batch_file)
         assert job is None
+
+    def test_unicode_input(self):
+        """
+        Syntax-check a plain text batch with non-ASCII unicode characters.
+        """
+        error = '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'
+        descriptions = ['\u2026AB026906.1:c.274G>T',
+                        '\u2026AL449423.14(CDKN2A_v002):c.5_400del']
+        # Each expected row is the input description followed by the parse
+        # error reported for its first character (char 0).
+        rows = [[description, error] for description in descriptions]
+        self._batch_job_plain_text(descriptions, rows, 'syntax-checker')
diff --git a/tests/test_services_json.py b/tests/test_services_json.py
index 8df9b748..81833505 100644
--- a/tests/test_services_json.py
+++ b/tests/test_services_json.py
@@ -9,6 +9,7 @@ import simplejson as json
 from spyne.server.null import NullServer
 import mutalyzer
 from mutalyzer import announce
+from mutalyzer import Scheduler
 from mutalyzer.services.json import application
 
 from fixtures import database, hg19, hg19_transcript_mappings
@@ -99,3 +100,45 @@ class TestServicesJson(MutalyzerTest):
         announce.unset_announcement()
         r = self._call('info')
         assert not r.get('announcement')
+
+    def test_checksyntax_unicode(self):
+        """
+        Run checkSyntax with an invalid variant description containing
+        non-ASCII unicode characters and expect a single EPARSE message.
+        """
+        r = self._call('checkSyntax', 'La Pe\xf1a')
+        assert not r['valid']
+        assert len(r['messages']) == 1
+        assert r['messages'][0]['errorcode'] == 'EPARSE'
+        assert r['messages'][0]['message'] == 'Expected W:(0123...) (at char 2), (line:1, col:3)'
+
+    @fix(database)
+    def test_batchjob_unicode(self):
+        """
+        Run a SyntaxChecker batch job on input with non-ASCII unicode
+        characters and compare the decoded result with the expected rows.
+        """
+        error = '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'
+        variants = ['\u2026AB026906.1:c.274G>T',
+                    '\u2026AL449423.14(CDKN2A_v002):c.5_400del']
+        expected = [[variant, error] for variant in variants]
+
+        # One description per line, submitted as UTF-8 encoded bytes.
+        batch_file = ('\n'.join(variants) + '\n').encode('utf-8')
+        job_id = unicode(self._call('submitBatchJob', batch_file, 'SyntaxChecker'))
+
+        # Before the scheduler runs, every entry is expected to be pending.
+        items_left = self._call('monitorBatchJob', job_id)
+        assert int(items_left) == len(variants)
+
+        Scheduler.Scheduler().process()
+
+        # After the scheduler has run, no entries are expected to be left.
+        items_left = self._call('monitorBatchJob', job_id)
+        assert int(items_left) == 0
+
+        # The result is base64 encoded UTF-8; its first line is skipped.
+        encoded = self._call('getBatchJob', job_id)
+        report = encoded.decode('base64').decode('utf-8')
+        rows = report.strip().split('\n')[1:]
+        assert expected == [row.split('\t') for row in rows]
diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py
index 0882c9fb..0a85844d 100644
--- a/tests/test_services_soap.py
+++ b/tests/test_services_soap.py
@@ -669,3 +669,45 @@ facilisi."""
         assert r.errors == 0
         assert r.genomicDescription == ud + ':g.7872G>T'
         assert ud + '(SDHD_v001):c.274G>T' in r.transcriptDescriptions.string
+
+    def test_checksyntax_unicode(self):
+        """
+        Run checkSyntax with an invalid variant description containing
+        non-ASCII unicode characters and expect a single EPARSE message.
+        """
+        r = self._call('checkSyntax', 'La Pe\xf1a')
+        assert not r.valid
+        assert len(r.messages.SoapMessage) == 1
+        assert r.messages.SoapMessage[0]['errorcode'] == 'EPARSE'
+        assert r.messages.SoapMessage[0]['message'] == 'Expected W:(0123...) (at char 2), (line:1, col:3)'
+
+    @fix(database)
+    def test_batchjob_unicode(self):
+        """
+        Run a SyntaxChecker batch job on input with non-ASCII unicode
+        characters and compare the decoded result with the expected rows.
+        """
+        error = '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'
+        variants = ['\u2026AB026906.1:c.274G>T',
+                    '\u2026AL449423.14(CDKN2A_v002):c.5_400del']
+        expected = [[variant, error] for variant in variants]
+
+        # One description per line, submitted as UTF-8 encoded bytes.
+        batch_file = ('\n'.join(variants) + '\n').encode('utf-8')
+        job_id = unicode(self._call('submitBatchJob', batch_file, 'SyntaxChecker'))
+
+        # Before the scheduler runs, every entry is expected to be pending.
+        items_left = self._call('monitorBatchJob', job_id)
+        assert int(items_left) == len(variants)
+
+        Scheduler.Scheduler().process()
+
+        # After the scheduler has run, no entries are expected to be left.
+        items_left = self._call('monitorBatchJob', job_id)
+        assert int(items_left) == 0
+
+        # The result is base64 encoded UTF-8; its first line is skipped.
+        encoded = self._call('getBatchJob', job_id)
+        report = encoded.decode('base64').decode('utf-8')
+        rows = report.strip().split('\n')[1:]
+        assert expected == [row.split('\t') for row in rows]
diff --git a/tests/test_website.py b/tests/test_website.py
index c649925e..fd0f02e7 100644
--- a/tests/test_website.py
+++ b/tests/test_website.py
@@ -9,21 +9,15 @@ from __future__ import unicode_literals
 
 #import logging; logging.basicConfig()
 import bz2
-import cgi
-import logging
 from mock import patch
 import os
-import re
 from io import BytesIO
-import time
-import urllib
-import urllib2
 
 from Bio import Entrez
 import lxml.html
 
-import mutalyzer
 from mutalyzer import announce, Scheduler
+from mutalyzer.db import models
 from mutalyzer.website import create_app
 
 from fixtures import cache, database, hg19, hg19_transcript_mappings
@@ -739,3 +733,89 @@ class TestWebsite(MutalyzerTest):
         assert 'text/plain' in r.headers['Content-Type']
         assert '\t'.join(['chrX', '154157690', '154157691', '4374A>T', '0', '-']) in r.data
         assert '\t'.join(['chrX', '154157683', '154157685', '4380_4381del', '0', '-']) in r.data
+
+    def test_checksyntax_unicode(self):
+        """
+        Request the syntax checker page for an invalid description with a
+        non-ASCII unicode character and check that the parse error is shown.
+        """
+        query = {'description': 'La Pe\xf1a'}
+        page = self.app.get('/syntax-checker', query_string=query)
+        text = page.get_data(as_text=True)
+        for fragment in ('Fatal', 'Details of the parse error',
+                         'Expected W:(0123...) (at char 2), (line:1, col:3)'):
+            assert fragment in text
+
+    @fix(database)
+    def test_batch_unicode(self):
+        """
+        Submit a batch form with non-ASCII unicode characters in the input
+        file and check the result file after the scheduler has run.
+        """
+        error = '(grammar): Expected W:(0123...) (at char 0), (line:1, col:1)'
+        variants = ['\u2026AB026906.1:c.274G>T',
+                    '\u2026AL449423.14(CDKN2A_v002):c.5_400del']
+        expected = [[variant, error] for variant in variants]
+
+        # One description per line, without a trailing newline.
+        batch_file = '\n'.join(variants)
+        data = {'job_type': 'syntax-checker',
+                'email': 'test@test.test',
+                'file': (BytesIO(batch_file.encode('utf-8')), 'test.txt')}
+
+        # The last path segment of the Location header is the progress page.
+        r = self.app.post('/batch-jobs', data=data)
+        progress_url = '/' + r.location.split('/')[-1]
+
+        assert models.BatchJob.query.first().email == 'test@test.test'
+
+        scheduler = Scheduler.Scheduler()
+        scheduler.process()
+
+        r = self.app.get(progress_url)
+
+        dom = lxml.html.fromstring(r.data)
+        result_url = dom.cssselect('#ifnot_items_left a')[0].attrib['href']
+
+        r = self.app.get(result_url)
+        assert 'text/plain' in r.headers['Content-Type']
+
+        result = r.get_data(as_text=True).strip().split('\n')[1:]
+        assert expected == [line.split('\t') for line in result]
+
+    @fix(database)
+    def test_batch_unicode_email(self):
+        """
+        Submit a batch form with non-ASCII unicode characters in the email
+        address and check that the address is stored and the job completes.
+        """
+        variants = ['AB026906.1:c.274G>T',
+                    'AL449423.14(CDKN2A_v002):c.5_400del']
+        expected = [[variant, 'OK'] for variant in variants]
+
+        # One description per line, without a trailing newline.
+        batch_file = '\n'.join(variants)
+        email = 'pe\xf1a@test.test'
+        data = {'job_type': 'syntax-checker',
+                'email': email,
+                'file': (BytesIO(batch_file.encode('utf-8')), 'test.txt')}
+
+        # The last path segment of the Location header is the progress page.
+        r = self.app.post('/batch-jobs', data=data)
+        progress_url = '/' + r.location.split('/')[-1]
+
+        assert models.BatchJob.query.first().email == email
+
+        scheduler = Scheduler.Scheduler()
+        scheduler.process()
+
+        r = self.app.get(progress_url)
+
+        dom = lxml.html.fromstring(r.data)
+        result_url = dom.cssselect('#ifnot_items_left a')[0].attrib['href']
+
+        r = self.app.get(result_url)
+        assert 'text/plain' in r.headers['Content-Type']
+
+        result = r.get_data(as_text=True).strip().split('\n')[1:]
+        assert expected == [line.split('\t') for line in result]
-- 
GitLab