diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..7236e8939d41be4157e063796c475fd62341e96c --- /dev/null +++ b/AUTHORS @@ -0,0 +1,5 @@ +Leiden University Medical Center department of Human Genetics <humgen@lumc.nl> +Jeroen Laros <j.f.j.laros@lumc.nl> +Gerben Stouten +Gerard Schaafsma <g.c.p.schaafsma@lumc.nl> +Martijn Vermaat <m.vermaat.hg@lumc.nl> diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000000000000000000000000000000000000..870d2b4afb597306ae488812a2bb4f1e792757bb --- /dev/null +++ b/INSTALL @@ -0,0 +1,149 @@ +Mutalyzer installation instructions +=================================== + + +Default configuration notes +--------------------------- + +The instructions in this file are quite specific to the standard Mutalyzer +environment. This consists of a Debian stable (Squeeze) system with Apache +and Mutalyzer using its mod_wsgi module. Debian conventions are used +throughout. + +The following is an overview of default locations used by Mutalyzer: + + Package files /usr/local/lib/python2.6/dist-packages/... + Configuration /etc/mutalyzer/config + Log file /var/log/mutalyzer.log + Cache directory /var/cache/mutalyzer + Batchd init script /etc/init.d/mutalyzer-batchd + Mapping update crontab /etc/cron.d/mutalyzer-mapping-update + Apache configuration /etc/apache2/conf.d/mutalyzer.conf + Static website files /var/www/mutalyzer/base + +The default database user is 'mutalyzer' with no password and the database +names are 'mutalyzer', 'hg18', and 'hg19'. + +By default, Mutalyzer is exposed under the '/mutalyzer' url by Apache. + +All Mutalyzer processes run under the www-data user and files created and/or +modified by Mutalyzer are owned by this user. + +If you have a different environment, or want to customize the default +locations, you can read through these instructions and modify them to your +needs. 
+ + +Short version +------------- + +Run the following commands: + + svn checkout https://www.mutalyzer.nl/svn/mutalyzer2/trunk . + sudo bash extras/pre-install.sh + sudo python setup.py install + sudo bash extras/post-install.sh + sensible-browser http://localhost/mutalyzer + +Or follow the more detailed instructions below. + + +Automated deployment on a remote host +------------------------------------- + +For deploying Mutalyzer on a remote (production or testing) host, we recommend +automating the steps described below by using Fabric and the included +fabfile. You need Fabric installed on your local machine: + + easy_install fabric + +To do a deployment on a server with an existing configured Mutalyzer +installation: + + fab deploy -H server1.mutalyzer.nl + +To do a fresh deployment on a new server: + + fab deploy:bootstrap=yes -H server1.mutalyzer.nl + + +Get Mutalyzer +------------- + +Since you are reading this, you can probably skip this step. Otherwise, get +your hands on a tarball and: + + tar -zxvf mutalyzer-XXX.tar.gz + cd mutalyzer-XXX + +Or get the source from SVN directly: + + svn checkout https://www.mutalyzer.nl/svn/mutalyzer2/trunk mutalyzer + cd mutalyzer + + +Install dependencies +-------------------- + +If you are on Debian or Ubuntu, you can use the following command to install +all dependencies: + + sudo bash extras/pre-install.sh + +Otherwise, install them manually (perhaps have a look in the above script for +a useful dependency list). + + +Install Mutalyzer +----------------- + +Mutalyzer can be installed using Python setuptools. For a production +environment: + + sudo python setup.py install + +Alternatively, if you want to have a development environment, use: + + sudo python setup.py develop + +The development environment uses symlinks to this source directory, so you can +develop directly from here. This command should be re-issued whenever the +version number of Mutalyzer is updated. 
+ + +Setup Mutalyzer +--------------- + +This step creates configuration files and populates the database: + + sudo bash extras/post-install.sh + +You can now edit /etc/mutalyzer/config and /etc/apache2/conf.d/mutalyzer.conf +to your liking. + + +Test the installation +--------------------- + +You should always test the installation. The tests (for now at least) need +the batch daemon and the webserver (the SOAP part) running. + +Now run the tests: + + MUTALYZER_ENV=test nosetests + + +Upgrade Mutalyzer +----------------- + +Unless you installed Mutalyzer in a development environment as described +above, you can upgrade Mutalyzer to a new version by running from the source +directory: + + sudo python setup.py install + sudo bash extras/post-upgrade + +If you installed Mutalyzer in a development environment, you don't have to +do anything to upgrade except for running the automated migration scripts: + + for M in extras/migrations/*.migration; do sudo $M migrate; done diff --git a/Install.txt b/Install.txt deleted file mode 100644 index f80d9fbeb2de47ca1bb538d416ccf3a391cf24b0..0000000000000000000000000000000000000000 --- a/Install.txt +++ /dev/null @@ -1,94 +0,0 @@ -This program depends on the following packages: -- mysql-server >= 5.0.0 -- python >= 2.5.2 -- python-mysqldb >= 1.2.2 -- python-biopython >= 1.54 -- python-pyparsing >= 1.5.0 -- python-configobj >= 4.4.0 -- python-simpletal >= 4.1-6 -- python-soaplib >= 2.0.0-alpha1 -- python-soappy >= 0.12.0-2 -- python-magic >= 5.04-2 -- python-psutil >= 0.1.3-1 -- python-xlrd >= 0.6.1-2 -- python-daemon >= 1.5.5 -- python-webpy >= 0.33 -- python-webtest >= 1.2.3 -- python-suds >= 0.3.9-1 - -The web and SOAP interfaces depend on the following packages: -- apache2 >= 2.2.11 -- libapache2-mod-wsgi >= 2.8 - -Add Apache configuration for Mutalyzer. For example, add the following to -the file /etc/apache2/sites-available/mutalyzer (change the path -/var/www/mutalyzer2 when appropriate) and run 'a2ensite mutalyzer'. 
---- -<VirtualHost *:80> - ServerName mutalyzer.nl - - WSGIScriptAlias /services /var/www/mutalyzer2/src/webservice.py - WSGIScriptAlias / /var/www/mutalyzer2/src/wsgi.py - - Alias /base /var/www/mutalyzer2/templates/base -</VirtualHost> ---- - -After installing MySQL, create a database named ``hg19'', owned by the -user ``mutalyzer'': ---- -cat << EOF | mysql -u root -p - CREATE USER mutalyzer; - CREATE DATABASE hg19; - GRANT ALL PRIVILEGES ON hg19.* TO mutalyzer; - FLUSH PRIVILEGES; -EOF -cat << EOF | mysql -u root -p - CREATE DATABASE hg18; - GRANT ALL PRIVILEGES ON hg18.* TO mutalyzer; - FLUSH PRIVILEGES; -EOF ---- -Then retrieve the refLink table from the UCSC website: -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.txt.gz - -For Variant_info to work, you need the following files too: -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.txt.gz - - -And import the table: ---- -mysql -u mutalyzer -D hg19 < refLink.sql -zcat refLink.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refLink;' - ---- - -Combine the mapping info into one table. 
---- -CREATE TABLE map - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc; ---- - - -For Variant_info: ---- -mysql -u mutalyzer -D hg19 < gbStatus.sql -zgrep mRNA gbStatus.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE gbStatus;' -mysql -u mutalyzer -D hg19 < refGene.sql -zcat refGene.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refGene;' ---- - -Edit the file mutalyzer.conf to finish the installation. - -Take a look at Obsoleted.txt for information about backwards compatibility and -removed features. diff --git a/Obsoleted.txt b/Obsoleted.txt deleted file mode 100644 index a66a3d2d7226b8aabd6bd258b05179bc261d18b2..0000000000000000000000000000000000000000 --- a/Obsoleted.txt +++ /dev/null @@ -1,9 +0,0 @@ -On eu.liacs.nl: /etc/apache2/mods-enabled/rewrite.load -- This file contains a rewrite rule that converts "Variant_info.php" to - "Variant_info". -- When all LOVD versions are above 2.0-23, this rule can be deleted and the - rewrite module can be disabled. - -index.py: -- In the Variant_info() function a substitution on error messages is performed. -- When all LOVD versions are above 2.0-23, this check can be deleted. diff --git a/README b/README new file mode 100644 index 0000000000000000000000000000000000000000..1390caec4c4de2fb6a20c477ef9d13f1e4155c70 --- /dev/null +++ b/README @@ -0,0 +1,163 @@ +Mutalyzer, a HGVS variant nomenclature checker +============================================== + + +Documentation +------------- + +See the doc/ directory for (possibly outdated) developer documentation and +presentation slides related to Mutalyzer. See http://www.mutalyzer.nl for +user documentation. + + +Installation +------------ + +See the INSTALL file for installation instructions. 
+ + +Unit tests +---------- + +The unit tests depend on a running batch daemon, webserver, and SOAP +webservice: + + sudo /etc/init.d/mutalyzer-batchd start + sudo /etc/init.d/apache2 start + +Now run the tests with: + + MUTALYZER_ENV=test nosetests -v + +Or, if you are in a hurry, skip the long-running tests with: + + MUTALYZER_ENV=test MUTALYZER_QUICK_TEST=1 nosetests -v + + +Development notes +----------------- + +Todo list: +- Improve the web interface design :) +- Test all uses of mkstemp(). +- Use naming conventions for modules Crossmap, Db, File, GenRecord, Retriever + and Scheduler. +- Use standard logging module, with rotating functionality. Race conditions + on the log file are probably a problem in the current setup. + Instead of that rotating, we could also use logrotate: + http://serverfault.com/questions/55610/logrotate-and-open-files +- Set up continuous integration. Currently, I'm most impressed with Hudson. + http://hudson-ci.org/ + http://www.rhonabwy.com/wp/2009/11/04/setting-up-a-python-ci-server-with-hudson/ + Or perhaps Jenkins. + http://jenkins-ci.org/ +- Use monit on the production server. + http://mmonit.com/monit/ +- Migrate Javascript to JQuery. +- I think in the long run, the Output object is not really the way to go. It + obscures the control flow. The logging part should use the standard logging + module. The data gathering by the Output object is probably better handled + by explicitly returning data objects from functions. +- Migrate from TAL to a more modern and maintained Python template library, + for example jinja. +- Develop a large test suite. +- Create a web interface url to watch the progress of a batch job. +- Create webservices for the batch jobs (steal ideas from Jeroen's DVD + webservice). +- Use virtualenv? +- Use SQLAlchemy? +- Password for MySQL user. +- In deployment, remove old versions of Mutalyzer package? +- Use https protocol. +- Check for os.path.join vulnerabilities. 
+- Use a standard solution for the database migrations in extras/migrations. +- Use something like Sphinx to generate development documentation from code. +- There are some problems with the batch architecture, especially that there + cannot be multiple workers without synchronisation problems. + Good read: http://news.ycombinator.com/item?id=3002861 + Suggestion: http://celeryproject.org/ +- Have a normal 404 page. + +Code style guide: +- Follow PEP 8 (code) and PEP 257 (docstrings). + http://www.python.org/dev/peps/pep-0008/ + http://www.python.org/dev/peps/pep-0257/ + Read the Google Python Style guide: + http://google-styleguide.googlecode.com/svn/trunk/pyguide.html +- Use Epydoc style documentation in docstrings. +- End class and method definitions with their name as comment. +- Executables are in the bin/ directory. +- For examples, check established Python projects: + http://code.djangoproject.com/browser/django/trunk + http://twistedmatrix.com/trac/browser/trunk + https://github.com/webpy/webpy + https://github.com/mitsuhiko/jinja2 + https://bitbucket.org/mramm/tg-21/src + http://bazaar.launchpad.net/~bzr-pqm/bzr/bzr.dev/files + https://github.com/ask/celery +- A lot of code does not yet adhere to these points, this is an ongoing + effort. + +Obsoleted features: +- On eu.liacs.nl: + /etc/apache2/mods-enabled/rewrite.load contains a rewrite rule that converts + "Variant_info.php" to "Variant_info". + When all LOVD versions are above 2.0-23, this rule can be deleted and the + rewrite module can be disabled. +- In the Variant_info() function a substitution on error messages is + performed. + When all LOVD versions are above 2.0-23, this check can be deleted. 
+ + +Dependencies +------------ + +Mutalyzer depends on the following (Debian/Ubuntu) packages: +- mysql-server >= 5.1 +- python >= 2.6 +- python-mysqldb >= 1.2.2 +- python-biopython >= 1.54 +- python-pyparsing >= 1.5.0 +- python-configobj >= 4.4.0 +- python-magic >= 5.04-2 +- python-psutil >= 0.1.3-1 +- python-xlrd >= 0.6.1-2 +- python-daemon >= 1.5.5 +- python-soappy >= 0.12.0-2 +- python-suds >= 0.3.9-1 + +The web and SOAP interfaces depend on the following packages: +- apache2 >= 2.2.11 +- libapache2-mod-wsgi >= 2.8 +- python-webpy >= 0.33 +- python-soaplib >= 2.0.0-alpha1 +- python-simpletal >= 4.1-6 + +Automatic remote deployment depends on Fabric: +- fabric >= 0.9.0-2 + +The unit tests depend on the following packages: +- python-nose >= 0.11 +- python-webtest >= 1.2.3 + +As of 2011-08-23, snakefood reports the following imports from the Mutalyzer +source code (excluding the standard library imports): + + Bio + MySQLdb + SOAPpy + configobj + daemon + fabric + lockfile + lxml + magic + nose + pyparsing + setuptools + simpletal + soaplib + suds + web + webtest + xlrd diff --git a/bin/mutalyzer b/bin/mutalyzer new file mode 100755 index 0000000000000000000000000000000000000000..68345947cd5c43c2f108c842ea6b7d96d32a2800 --- /dev/null +++ b/bin/mutalyzer @@ -0,0 +1,190 @@ +#!/usr/bin/env python +""" +Command-line interface to the nomenclature checker. + +Usage: + {command} variant + + variant: The variant description to check. + + +@todo: Refactor this file. +""" + + +import sys +import os + +from mutalyzer import variantchecker +from mutalyzer.output import Output +from mutalyzer.config import Config +from mutalyzer.util import format_usage + + +def main(cmd): + """ + Command line interface to the name checker. 
+ + @todo: documentation + """ + C = Config() + O = Output(__file__, C.Output) + + O.addMessage(__file__, -1, "INFO", "Received variant " + cmd) + + RD = variantchecker.check_variant(cmd, C, O) + + O.addMessage(__file__, -1, "INFO", "Finished processing variant " + cmd) + + ### OUTPUT BLOCK ### + gn = O.getOutput("genename") + if gn : + print "Gene Name: " + gn[0] + tv = O.getOutput("transcriptvariant") + if tv : + print "Transcript variant: " + tv[0] + print + #if + + for i in O.getMessages() : + print i + errors, warnings, summary = O.Summary() + print summary + print + + if not errors: + visualisation = O.getOutput("visualisation") + if visualisation : + for i in range(len(visualisation)) : + if i and not i % 3 : + print + print visualisation[i] + #for + print + #if + + reference = O.getOutput("reference")[-1] + for i in O.getOutput("descriptions") : + print i + print + for i in O.getOutput("protDescriptions") : + print i + print + + if RD.record and RD.record._sourcetype == "LRG": #LRG record + from collections import defaultdict + toutput = defaultdict(list) + poutput = defaultdict(list) + for i in RD.record.geneList: + for j in i.transcriptList: + d = j.description + d = ';' in d and '['+d+']' or d + if j.name: + toutput[i.name].append( + "%st%s:%c.%s" % (reference, j.name, j.molType, d)) + else: + pass + if j.molType == 'c': + poutput[i.name].append( + "%sp%s:%s" % (reference, j.name, + j.proteinDescription)) + poutput[i.name].sort() + toutput[i.name].sort() + + #Transcript Notation + print "Following transcripts were affected:" + for key, values in toutput.items(): + print key + for value in values: + print "\t"+value + + #Protein Notation + print "\nFollowing proteins were affected:" + for key, values in poutput.items(): + print key + for value in values: + print "\t"+value + #for + #if + else : + for i in RD.record.geneList : + for j in i.transcriptList : + if ';' in j.description : + print "%s(%s_v%s):%c.[%s]" % (reference, i.name, j.name, + j.molType, 
j.description) + else : + print "%s(%s_v%s):%c.%s" % (reference, i.name, j.name, + j.molType, j.description) + if (j.molType == 'c') : + print "%s(%s_i%s):%s" % (reference, i.name, j.name, + j.proteinDescription) + #else + #for + #for + #else + + #Genomic Notation + rdrd = RD.record.description + gdescr = ';' in rdrd and '['+rdrd+']' or rdrd + print "\nGenomic notation:\n\t%s:g.%s" % (reference, gdescr) + print O.getOutput("genomicChromDescription") + + op = O.getOutput("oldprotein") + if op : + print "\nOld protein:" + #__bprint(op[0], O) + #for i in O.getOutput("oldProteinFancy") : + # print i + print 'Disabled (see how wsgi.py handles this)' + print + #if + np = O.getOutput("newprotein") + if np : + print "\nNew protein:" + #__bprint(np[0], O) + #for i in O.getOutput("newProteinFancy") : + # print i + print 'Disabled (see how wsgi.py handles this)' + print + #if + ap = O.getOutput("altProtein") + if ap : + print "\nAlternative protein using start codon %s:" % \ + O.getOutput("altstart")[0] + #__bprint(ap[0], O) + #for i in O.getOutput("altProteinFancy") : + # print i + print 'Disabled (see how wsgi.py handles this)' + print + #if + + for i in O.getOutput("exonInfo") : + print i + print + print O.getOutput("cdsStart") + print O.getOutput("cdsStop") + print + + for i in O.getOutput("legends") : + print i + + print + print "Restriction sites:" + for i in O.getOutput("restrictionSites") : + print i + + print "+++ %s" % O.getOutput("myTranscriptDescription") + + #if + ### OUTPUT BLOCK ### + del O +#main + + +if __name__ == '__main__': + + if len(sys.argv) < 2: + print format_usage() + sys.exit(1) + + main(sys.argv[1]) diff --git a/bin/mutalyzer-batchd b/bin/mutalyzer-batchd new file mode 100755 index 0000000000000000000000000000000000000000..868eafa933fe36a3e7681c2d9641c6d7840fb0f5 --- /dev/null +++ b/bin/mutalyzer-batchd @@ -0,0 +1,105 @@ +#!/usr/bin/env python +""" +Daemon for processing scheduled batch jobs. 
+ +We use python-daemon [1] for daemonizing the job processing. This file +should be run with the mutalyzer directory as working directory. + +@todo: Check if PID dir is writable. +@todo: Get rid of ugly exception logging. +@todo: Reload configuration without restarting (for example, on SIGHUP). +@todo: Use [2] to set process name (and use that in init script). + +[1] http://pypi.python.org/pypi/python-daemon/ +[2] http://code.google.com/p/py-setproctitle/ +""" + + +import os +import sys +from daemon import pidlockfile, DaemonContext +from lockfile import LockTimeout +import signal +import time +import traceback + +from mutalyzer.config import Config +from mutalyzer.Db import Batch +from mutalyzer.Scheduler import Scheduler + + +def cleanup(signum, stack_frame): + """ + Generate a normal exit signal. + """ + sys.exit(1) + + +def daemonize(): + """ + Write PID file when it is not locked and daemonize a loop processing + scheduled batch jobs. + """ + config = Config() + batch_config = config.Batch + + pidfile = os.path.realpath(batch_config.PIDfile) + + lockfile = pidlockfile.TimeoutPIDLockFile(pidfile, acquire_timeout=1, + threaded=False) + + context = DaemonContext(working_directory=os.getcwd(), + pidfile=lockfile) + + # To preserve stderr and stdout, add these arguments. + #stdin=sys.stdin, + #stdout=sys.stdout, + #files_preserve=[sys.stdin, sys.stdout] + + # Writing the PID file as root before changing user/group does not seem + # to work. + #uid=pwd.getpwnam('www-data').pw_uid + #gid=grp.getgrnam('www-data').gr_gid + + context.signal_map = { + signal.SIGTERM: cleanup, + signal.SIGHUP: 'terminate' + } + + with context: + # Note that any opened files are now closed. This is not a problem for + # the Config instance, since it does not read its file again after + # initialisation. 
+ database = Batch(config.Db) + scheduler = Scheduler(config.Scheduler, database) + + def stop_scheduler(signum, stack_frame): + scheduler.stop() + signal.signal(signal.SIGTERM, stop_scheduler) + + while not scheduler.stopped(): + # Process batch jobs. This process() method runs while there + # exist jobs to run. + try: + scheduler.process() + except Exception as e: + f = open('/tmp/batcherror.log', 'a+') + f.write('Error (%s): %s\n' % (type(e), str(e))) + f.write('%s\n\n' % repr(traceback.format_exc())) + f.flush() + f.close() + pass + if scheduler.stopped(): + break + # Wait a bit and process any possible new jobs. + time.sleep(5) + + +if __name__ == '__main__': + try: + daemonize() + except LockTimeout: + # If we want to see something on stdout, we have to add it to the + # {files_preserve} argument of the DaemonContext. + #print 'Mutalyzer batch daemon is already running.' + sys.exit(1) diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync new file mode 100755 index 0000000000000000000000000000000000000000..405084ea16b373b3e0e161a1cc35cb03cb55432b --- /dev/null +++ b/bin/mutalyzer-cache-sync @@ -0,0 +1,51 @@ +#!/usr/bin/env python +""" +Synchronize the database cache with other Mutalyzer instances. + +Usage: + {command} remote_wsdl url_template days + + remote_wsdl: Location of the remote WSDL description. + url_template: URL to remote downloads, where {{file}} is to be substituted + by the filename. + days: Number of days to go back in the remote cache. + + +This program is intended to be run daily from cron. 
Example: + + 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7 + 55 5 * * * mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' 7 +""" + + +import sys + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.sync import CacheSync +from mutalyzer import Db +from mutalyzer.util import format_usage + + +def cache_sync(remote_wsdl, url_template, days): + """ + Synchronize the database cache with other Mutalyzer instances. + """ + config = Config() + output = Output(__file__, config.Output) + database = Db.Cache(config.Db) + + sync = CacheSync(config.Retriever, output, database) + sync.sync_with_remote(remote_wsdl, url_template, days) + + +if __name__ == '__main__': + if len(sys.argv) < 4: + print format_usage() + sys.exit(1) + try: + days = int(sys.argv[3]) + except ValueError: + print 'Last argument must be an integer.' + sys.exit(1) + cache_sync(sys.argv[1], sys.argv[2], int(sys.argv[3])) diff --git a/bin/mutalyzer-mapping-update b/bin/mutalyzer-mapping-update new file mode 100755 index 0000000000000000000000000000000000000000..cbc88ef27dab3c6f81854a91424605748df7e1dd --- /dev/null +++ b/bin/mutalyzer-mapping-update @@ -0,0 +1,58 @@ +#!/usr/bin/env python +""" +Update the database with mapping information from the NCBI. + +Usage: + {command} database mapping_file assembly + + database: Database to update (i.e. 'hg18' or 'hg19'). + mapping_file: Path to the NCBI mapping information. + assembly: Use only entries from this assembly (this is the 'group_name' + column in the NCBI mapping file). + + +This program is intended to be run daily from cron. 
Example: + + 25 6 * * * mutalyzer-mapping-update hg19 /tmp/seq_gene.md reference +""" + + +import sys + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.mapping import NCBIUpdater +from mutalyzer.util import format_usage + + +def main(database, mapping_file, assembly): + """ + Update the database with information from the NCBI. + + @arg database: Database to update (i.e. 'hg18' or 'hg19'). + @type database: string + @arg mapping_file: Path to NCBI mapping information. + @type mapping_file: string + @arg assembly: Use only entries from this assembly (this is the + 'group_name' column in the NCBI mapping file). + @type assembly: string + + @todo: Also report how much was added/updated. + """ + config = Config() + output = Output(__file__, config.Output) + output.addMessage(__file__, -1, 'INFO', + 'Starting NCBI mapping data update') + + updater = NCBIUpdater(database, config) + updater.load(mapping_file, assembly) + updater.merge() + + output.addMessage(__file__, -1, 'INFO', 'NCBI mapping data update end') + + +if __name__ == '__main__': + if len(sys.argv) != 4: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/bin/mutalyzer-webservice.wsgi b/bin/mutalyzer-webservice.wsgi new file mode 100755 index 0000000000000000000000000000000000000000..bc624da31f7e733ff61b899860ca67f6c2fafb2b --- /dev/null +++ b/bin/mutalyzer-webservice.wsgi @@ -0,0 +1,56 @@ +#!/usr/bin/env python +""" +WSGI interface to the Mutalyzer SOAP webservice. + +The WSGI interface is exposed through the module variable 'application'. + +Example Apache/mod_wsgi configuration: + + WSGIScriptAlias /services /usr/local/bin/mutalyzer-webservice.wsgi + +Be sure to have this line first if you also define a / alias, like this: + + WSGIScriptAlias /services /usr/local/bin/mutalyzer-webservice.wsgi + WSGIScriptAlias / /usr/local/bin/mutalyzer-website.wsgi + +You can also use the built-in HTTP server by running this file directly. 
+ +To start the built-in HTTP server on port 8081: + + /usr/local/bin/mutalyzer-webservice.wsgi 8081 + +@todo: Do we really use namespaces correctly? +@todo: For some reason, the server exposes its location including ?wsdl. +@todo: More thorough input checking. The @soap decorator does not do any + kind of strictness checks on the input. For example, in + transcriptInfo, the build argument must really be present. (Hint: + use __checkBuild.) +@todo: The mutalyzer.config.Config object can just be instantiated once + and we should not create it on every request. +""" + + +import sys +from wsgiref.simple_server import make_server +from mutalyzer import webservice + + +DEFAULT_PORT = 8081 + + +# Unfortunately we cannot instantiate wsgi.Application here, see the note +# near the bottom of mutalyzer/webservice.py. +application = webservice.application + + +if __name__ == '__main__': + port = DEFAULT_PORT + if len(sys.argv) > 1: + try: + port = int(sys.argv[1]) + except ValueError: + print 'Not a valid port number: %s' % sys.argv[1] + sys.exit(1) + print 'Listening to http://localhost:%d/' % port + print 'WDSL file is at http://localhost:%d/?wsdl' % port + make_server('localhost', port, application).serve_forever() diff --git a/bin/mutalyzer-website.wsgi b/bin/mutalyzer-website.wsgi new file mode 100755 index 0000000000000000000000000000000000000000..be05dbb3e67e0bfb8c17762f2e001304a76eba8d --- /dev/null +++ b/bin/mutalyzer-website.wsgi @@ -0,0 +1,50 @@ +#!/usr/bin/env python +""" +WSGI interface to the Mutalyzer website. + +The WSGI interface is exposed through the module variable 'application'. +Static files are not handled by this interface and should be served through +the '/base' url prefix separately. + +Example Apache/mod_wsgi configuration: + + Alias /base /var/www/mutalyzer/base + WSGIScriptAlias / /usr/local/bin/mutalyzer-website.wsgi + +You can also use the built-in HTTP server by running this file directly. 
+Note, however, that static files are not served by this server. A common +pattern is to use Nginx as a proxy and static file server. + +Start the built-in HTTP server on port 8080: + + /usr/local/bin/mutalyzer-website.wsgi 8080 + +Example Nginx configuration: + + server { + listen 80; + location /base/ { + root /var/www/mutalyzer/base; + if (-f $request_filename) { + rewrite ^/base/(.*)$ /base/$1 break; + } + } + location / { + proxy_read_timeout 300; # 5 minutes + proxy_pass http://127.0.0.1:8080; + } + } + +@todo: Integrate webservice.py (http://webpy.org/cookbook/webservice/). +@todo: Move /templates/base to /static for web.py compatibility. +""" + + +from mutalyzer import website + + +application = website.app.wsgifunc() + + +if __name__ == '__main__': + website.app.run() diff --git a/extras/apache/mutalyzer.conf b/extras/apache/mutalyzer.conf new file mode 100644 index 0000000000000000000000000000000000000000..67b8fd172b8db0b7c501204449369ddd5e6e3c5d --- /dev/null +++ b/extras/apache/mutalyzer.conf @@ -0,0 +1,28 @@ +# Static files +Alias /mutalyzer/base /var/www/mutalyzer/base +<Directory /var/www/mutalyzer/base> + Order deny,allow + Allow from all + Options -Indexes + AllowOverride None +</Directory> + +# Use daemon mode of mod_wsgi +WSGIDaemonProcess mutalyzer processes=2 threads=15 maximum-requests=10000 +WSGIProcessGroup mutalyzer + +# Webservice +WSGIScriptAlias /mutalyzer/services <MUTALYZER_BIN_WEBSERVICE> +<Directory /mutalyzer/services> + Order deny,allow + Allow from all + Options -Indexes +</Directory> + +# Website +WSGIScriptAlias /mutalyzer <MUTALYZER_BIN_WEBSITE> +<Directory /mutalyzer> + Order deny,allow + Allow from all + Options -Indexes +</Directory> diff --git a/mutalyzer.conf b/extras/config.example similarity index 79% rename from mutalyzer.conf rename to extras/config.example index 1cd39d230268f803b49e0f8fa55587b8b52a3004..4bbecc30a65e772e202e1d4aa04c499a27bc4580 100644 --- a/mutalyzer.conf +++ b/extras/config.example @@ -1,7 +1,7 @@ # # 
Mutalyzer config file. # - +# Copy this file to /etc/mutalyzer/config and modify to suit your preferences. # # These settings are used by the Retriever module. @@ -11,7 +11,7 @@ email = "mutalyzer@humgen.nl" # The cache directory. -cache = "./var/cache" +cache = "/var/cache/mutalyzer" # The maximum size of the cache in megabytes. cachesize = 50 @@ -42,17 +42,8 @@ LocalMySQLuser = "mutalyzer" # Host name for the local databases. LocalMySQLhost = "localhost" -# MySQL username for the UCSC database. -RemoteMySQLuser = "genome" - -# Host name for the UCSC database. -RemoteMySQLhost = "genome-mysql.cse.ucsc.edu" -# Retrieve all entries modified within a certain number of days. -UpdateInterval = 7 -# Temporary file for updated UCSC mapping information. -TempFile = "./var/UCSC_Update.txt" # @@ -60,7 +51,7 @@ TempFile = "./var/UCSC_Update.txt" # # Name and location of the log file. -log = "./var/mutalyzer.log" +log = "/var/log/mutalyzer.log" # Prefix for each log message. datestring = "%Y-%m-%d %H:%M:%S" @@ -80,6 +71,9 @@ loglevel = 3 # Level of output messages. outputlevel = 1 +# Show debug info in the web interface. +debug = yes + # # These settings are used by the Mutator module. @@ -99,23 +93,17 @@ flankclipsize = 6 # These settings are used by the Scheduler module. # -# Name of the batch process. -processName = "MutalyzerBatch2" - # Return e-mail address. mailFrom = "noreply@humgen.nl" -# Location of the mail template. -mailMessage = "./mail.txt" - # Subject of the message. mailSubject = "Result of Mutalyzer batch check." # Location of the results. -resultsDir = "./var/cache" +resultsDir = "/var/cache/mutalyzer" # Location of the PID file. -PIDfile = "./var/batch.pid" +PIDfile = "/var/run/mutalyzer/mutalyzer-batchd.pid" # Maximum size for uploaded batch input files in megabytes. batchInputMaxSize = 5 @@ -143,9 +131,6 @@ bufSize = 32768 # The obligatory header in batch request files. header = "AccNo", "Genesymbol", "Mutation" -# Directory for temporary files. 
-tempDir = "./var" - # Threshold for Batch Jobs threshold = 0.05 @@ -154,12 +139,6 @@ threshold = 0.05 # These settings are used by the GenRecord module. # -# Number of upstream nucleotides when searching for a transcript. -upstream = 5000 - -# Number of downstream nucleotides when searching for a transcript. -downstream = 2000 - spliceAlarm = 2 spliceWarn = 5 diff --git a/extras/config.user.example b/extras/config.user.example new file mode 100644 index 0000000000000000000000000000000000000000..92dd1d4b39840fd0cce0d30efd2521df74c55e1e --- /dev/null +++ b/extras/config.user.example @@ -0,0 +1,11 @@ +# +# Mutalyzer config file. +# +# Copy this file to ~/.config/mutalyzer/config to override definitions from +# /etc/mutalyzer/config. + +# The cache directory. +cache = "/home/<USERNAME>/.cache/mutalyzer" + +# Name and location of the log file. +log = "/tmp/mutalyzer-<USERNAME>.log" diff --git a/extras/cron.d/mutalyzer-cache-sync b/extras/cron.d/mutalyzer-cache-sync new file mode 100644 index 0000000000000000000000000000000000000000..c58ea7742d54a461e7a93ee102cd6c9a3274a492 --- /dev/null +++ b/extras/cron.d/mutalyzer-cache-sync @@ -0,0 +1,2 @@ +# Synchronize the local cache with the live server every morning at 05:25 +#25 5 * * * www-data <MUTALYZER_BIN_CACHE_SYNC> 'http://www.mutalyzer.nl/2.0/services/?wsdl' 'http://www.mutalyzer.nl/2.0/Reference/{file}' 3 diff --git a/extras/cron.d/mutalyzer-mapping-update b/extras/cron.d/mutalyzer-mapping-update new file mode 100644 index 0000000000000000000000000000000000000000..cac1a9767741c0a42a372408ac65cb7c58d1647d --- /dev/null +++ b/extras/cron.d/mutalyzer-mapping-update @@ -0,0 +1,5 @@ +# Update the mapping database every Sunday morning at 03:25 and 04:25 +#25 3 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg18 /tmp/seq_gene.md reference +#25 4 * * 7 www-data wget 
"ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.2/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' + +##25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' diff --git a/extras/init.d/mutalyzer-batchd b/extras/init.d/mutalyzer-batchd new file mode 100644 index 0000000000000000000000000000000000000000..ea14453ad5cd2a59cb00f3d2af4e13296cf1b062 --- /dev/null +++ b/extras/init.d/mutalyzer-batchd @@ -0,0 +1,132 @@ +#! /bin/sh +### BEGIN INIT INFO +# Provides: mutalyzer-batchd +# Required-Start: $local_fs $remote_fs $network $syslog $mysql +# Required-Stop: $local_fs $remote_fs $network $syslog $mysql +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Start and stop the Mutalyzer batch daemon +# Description: Controls the Mutalyzer batch job processing daemon.. +### END INIT INFO + +PATH=/sbin:/usr/sbin:/bin:/usr/bin +DESC="Mutalyzer batch deamon" +NAME=mutalyzer-batchd +DAEMON=<MUTALYZER_BIN_BATCHD> +DIR=/ +PIDDIR=/var/run/mutalyzer +PIDFILE=$PIDDIR/$NAME.pid +SCRIPTNAME=/etc/init.d/$NAME +USER=www-data + +# Exit if the package is not installed +[ -x "$DAEMON" ] || exit 0 + +# Read configuration variable file if it is present +[ -r /etc/default/$NAME ] && . /etc/default/$NAME + +# Load the VERBOSE setting and other rcS variables +. /lib/init/vars.sh + +# Define LSB log_* functions. +# Depend on lsb-base (>= 3.0-6) to ensure that this file is present. +. 
/lib/lsb/init-functions + +# +# Function that starts the daemon/service +# +do_start() +{ + # Return + # 0 if daemon has been started + # 1 if daemon was already running + # 2 if daemon could not be started + mkdir -p $PIDDIR + chown -R $USER $PIDDIR + start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --chuid $USER --chdir $DIR --test > /dev/null \ + || return 1 + start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --chuid $USER --chdir $DIR -- \ + $DAEMON_ARGS \ + || return 2 + # Add code here, if necessary, that waits for the process to be ready + # to handle requests from services started subsequently which depend + # on this one. As a last resort, sleep for some time. +} + +# +# Function that stops the daemon/service +# +do_stop() +{ + # Return + # 0 if daemon has been stopped + # 1 if daemon was already stopped + # 2 if daemon could not be stopped + # other if a failure occurred + start-stop-daemon --stop --quiet --oknodo --pidfile $PIDFILE + RETVAL="$?" + [ "$RETVAL" = 2 ] && return 2 + # Many daemons don't delete their pidfiles when they exit. + rm -f $PIDFILE + return "$RETVAL" +} + +case "$1" in + start) + [ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC" "$NAME" + do_start + case "$?" in + 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; + 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; + esac + ;; + stop) + [ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME" + do_stop + case "$?" in + 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; + 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; + esac + ;; + status) + status_of_proc "$DAEMON" "$NAME" && exit 0 || exit $? + ;; + #reload|force-reload) + # + # If do_reload() is not implemented then leave this commented out + # and leave 'force-reload' as an alias for 'restart'. + # + #log_daemon_msg "Reloading $DESC" "$NAME" + #do_reload + #log_end_msg $? 
+ #;; + restart|force-reload) + # + # If the "reload" option is implemented then remove the + # 'force-reload' alias + # + log_daemon_msg "Restarting $DESC" "$NAME" + do_stop + case "$?" in + 0|1) + do_start + case "$?" in + 0) log_end_msg 0 ;; + 1) log_end_msg 1 ;; # Old process is still running + *) log_end_msg 1 ;; # Failed to start + esac + ;; + *) + # Failed to stop + log_end_msg 1 + ;; + esac + ;; + *) + #echo "Usage: $SCRIPTNAME {start|stop|restart|reload|force-reload}" >&2 + echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 + exit 3 + ;; +esac + +: diff --git a/extras/log-tools/find-crashes.py b/extras/log-tools/find-crashes.py new file mode 100755 index 0000000000000000000000000000000000000000..cea38c2e667cb8ebf0fa3abd5fcf69845b6923d7 --- /dev/null +++ b/extras/log-tools/find-crashes.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +""" +Search log for bugs. + +Finds occurrences of 'Received' in the log file that are not followed by an +occurrence of 'Finished'. These are probably runs of the namechecker that +crashed. +""" + + +import os +from mutalyzer.config import Config + + +config = Config() +handle = open(config.Output.log, 'r') + +scanning = False +line = handle.readline() + +while line: + if not scanning: + if ' Received ' in line: + message = line + scanning = True + else: + if ' Received ' in line: + print message, + scanning = False + if ' Finished ' in line: + scanning = False + line = handle.readline() + +handle.close() diff --git a/extras/migrations/001-db-gbinfo-add-created.migration b/extras/migrations/001-db-gbinfo-add-created.migration new file mode 100755 index 0000000000000000000000000000000000000000..26fd668a98012159486e81b7f6853f09cbf91e1e --- /dev/null +++ b/extras/migrations/001-db-gbinfo-add-created.migration @@ -0,0 +1,46 @@ +#!/usr/bin/env python +""" +Add a column and index 'created' to the 'GBInfo' table. 
+ +Usage: + ./001-db-gbinfo-add-created.migration [migrate] +""" + + +import migration + + +def check(): + """ + Check if migration is needed. + """ + connection = migration.db_connect('mutalyzer') + cursor = connection.cursor() + cursor.execute('SHOW COLUMNS FROM GBInfo WHERE field = "created";') + has_column = len(cursor.fetchall()) > 0 + cursor.execute('SHOW INDEX FROM GBInfo WHERE Key_name = "created";') + has_index = len(cursor.fetchall()) > 0 + connection.close() + if has_column != has_index: + migration.fatal('Installation is not in a recognizable state. Fix manually.') + return not has_column + + +def migrate(): + """ + Perform migration. + """ + connection = migration.db_connect('mutalyzer') + cursor = connection.cursor() + cursor.execute(""" + ALTER TABLE GBInfo + ADD COLUMN created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + ADD INDEX created (created);""") + connection.commit() + connection.close() + migration.info('Added column mutalyzer.GBInfo.created') + migration.info('Added index on mutalyzer.GBInfo.created') + + +if __name__ == '__main__': + migration.main(check, migrate) diff --git a/extras/migrations/002-db-map-to-mapping.migration b/extras/migrations/002-db-map-to-mapping.migration new file mode 100755 index 0000000000000000000000000000000000000000..0c0f02af6788b653d834764a8103441bc06ac571 --- /dev/null +++ b/extras/migrations/002-db-map-to-mapping.migration @@ -0,0 +1,138 @@ +#!/usr/bin/env python +""" +Convert the old 'map' tables to the new 'Mapping' tables. + +Usage: + ./002-db-map-to-mapping.migration [migrate] + +This is basically just a renaming of columns and +- use NULL for missing values +- add 1 to all chromosomal start positions. + +The following tables on hg18 and hg19 are dropped: +- gbStatus +- map_cdsBackup +- refGene +- refLink + +The map tables are renamed to map_backup. 
+""" + + +import MySQLdb +import migration + + +def _exon_starts(starts): + updated = [] + for start in starts.split(',')[:-1]: + updated.append(str(int(start) + 1)) + return ','.join(updated) + + +def _exon_stops(stops): + if stops[-1] == ',': + return stops[:-1] + + +def _check(db): + # Todo: Also check if 'map' is gone. + connection = migration.db_connect(db) + cursor = connection.cursor() + cursor.execute('SHOW TABLES LIKE "Mapping";') + ok = len(cursor.fetchall()) > 0 + connection.close() + return ok + + +def _migrate(db): + connection = migration.db_connect(db) + cursor = connection.cursor() + cursor.execute(""" + CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6) DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) + );""") + select_cursor = connection.cursor(MySQLdb.cursors.DictCursor) + select_cursor.execute(""" + SELECT + geneName as gene, + acc as transcript, + version as version, + chrom as chromosome, + strand as orientation, + txStart + 1 as start, + txEnd as stop, + NULLIF(cdsStart + 1, cdsEnd + 1) as cds_start, + NULLIF(cdsEnd, cdsStart) as cds_stop, + exonStarts as exon_starts, + exonEnds as exon_stops, + NULLIF(protAcc, '') as protein, + 'UCSC' as source + FROM + map;""") + count = 0 + while True: + r = select_cursor.fetchone() + if r == None: + break + count += 1 + cursor.execute(""" + INSERT INTO Mapping + (gene, transcript, version, chromosome, orientation, start, stop, + cds_start, cds_stop, exon_starts, exon_stops, protein, source) + VALUES + (%s, %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, %s);""", + (r['gene'], 
r['transcript'], r['version'], r['chromosome'], + r['orientation'], r['start'], r['stop'], r['cds_start'], + r['cds_stop'], _exon_starts(r['exon_starts']), _exon_stops(r['exon_stops']), + r['protein'], r['source'])) + + migration.info('Converted table map to table Mapping on %s (%d entries)' % (db, count)) + + cursor.execute('DROP TABLE IF EXISTS gbStatus, map_cdsBackup, refGene, refLink') + cursor.execute('RENAME TABLE map TO map_backup') + + migration.info('Dropped tables gbStatus, map_cdsBackup, refGene, refLink on %s' % db) + migration.info('Renamed table map to map_backup on %s' % db) + + select_cursor.close() + cursor.close() + connection.commit() + connection.close() + + +def check(): + """ + Check if migration is needed. + """ + hg18_ok = _check('hg18') + hg19_ok = _check('hg19') + if hg18_ok != hg19_ok: + migration.fatal('Installation is not in a recognizable state. Fix manually.') + return not hg18_ok + + +def migrate(): + """ + Perform migration. + """ + _migrate('hg18') + _migrate('hg19') + + +if __name__ == '__main__': + migration.main(check, migrate) diff --git a/extras/migrations/003-config-remove-ucsc.migration b/extras/migrations/003-config-remove-ucsc.migration new file mode 100755 index 0000000000000000000000000000000000000000..0578d8ce728294ebb33e1abb699593ec9a46819e --- /dev/null +++ b/extras/migrations/003-config-remove-ucsc.migration @@ -0,0 +1,30 @@ +#!/bin/bash + +# Remove UCSC database values from the configuration file. +# +# Usage: +# ./003-config-remove-ucsc.migration [migrate] + +COLOR_INFO='\033[32m' +COLOR_WARNING='\033[33m' +COLOR_ERROR='\033[31m' +COLOR_END='\033[0m' + +if [ -e /etc/mutalyzer/config ] && $(grep -q 'MySQL username for the UCSC database' /etc/mutalyzer/config); then + echo -e "${COLOR_WARNING}This migration is needed.${COLOR_END}" + if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' 
+ echo -e "${COLOR_INFO}Copying /etc/mutalyzer/config to /etc/mutalyzer/config.backup${COLOR_END}" + cp /etc/mutalyzer/config /etc/mutalyzer/config.backup + sed -i '/MySQL username for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Host name for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Retrieve all entries modified within a certain number of days/d' /etc/mutalyzer/config + sed -i '/RemoteMySQLuser =/d' /etc/mutalyzer/config + sed -i '/^RemoteMySQLhost =/d' /etc/mutalyzer/config + sed -i '/^UpdateInterval =/d' /etc/mutalyzer/config + echo -e "${COLOR_INFO}Removed all UCSC database configuration values from /etc/mutalyzer/config${COLOR_END}" + echo 'Performed migration.' + fi +else + echo -e "${COLOR_INFO}This migration is not needed.${COLOR_END}" +fi diff --git a/extras/migrations/004-cron-ucsc-to-ncbi.migration b/extras/migrations/004-cron-ucsc-to-ncbi.migration new file mode 100755 index 0000000000000000000000000000000000000000..192bbd297468d3d261c2dc79cf9ac144d9720c85 --- /dev/null +++ b/extras/migrations/004-cron-ucsc-to-ncbi.migration @@ -0,0 +1,29 @@ +#!/bin/bash + +# Remove UCSC update from cron and install NCBI update. +# +# Usage: +# ./004-cron-ucsc-to-ncbi.migration [migrate] + +COLOR_INFO='\033[32m' +COLOR_WARNING='\033[33m' +COLOR_ERROR='\033[31m' +COLOR_END='\033[0m' + +if [ -e /etc/cron.d/mutalyzer-ucsc-update ] && $(grep -v -q '^#' /etc/cron.d/mutalyzer-ucsc-update); then + echo -e "${COLOR_WARNING}This migration is needed.${COLOR_END}" + if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' + sed -i 's/^/#/' /etc/cron.d/mutalyzer-ucsc-update + echo -e "${COLOR_INFO}Commented all lines in /etc/cron.d/mutalyzer-ucsc-update${COLOR_END}" + if [ ! 
-e /etc/cron.d/mutalyzer-mapping-update ]; then + BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) + cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update + sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update + echo -e "${COLOR_INFO}Installed /etc/cron.d/mutalyzer-mapping-update${COLOR_END}" + fi + echo 'Performed migration.' + fi +else + echo -e "${COLOR_INFO}This migration is not needed.${COLOR_END}" +fi diff --git a/extras/migrations/README b/extras/migrations/README new file mode 100644 index 0000000000000000000000000000000000000000..e514423f3eda8b60a79e6aa430fa8984bfd9085d --- /dev/null +++ b/extras/migrations/README @@ -0,0 +1,18 @@ +Automated migration scripts +=========================== + +This directory contains scripts to automate the migration of a Mutalyzer +installation to the latest version. Things that might need a migration +include: + +- database schema +- database data +- configuration files +- cache +- ? + +All migration scripts accept a single optional argument, 'migrate'. Running a +script without arguments only checks whether the migration is needed. Running +a script with 'migrate' performs the actual migration (only if needed). + +Performing multiple migrations should be done in the order of their names. diff --git a/extras/migrations/migration.py b/extras/migrations/migration.py new file mode 100644 index 0000000000000000000000000000000000000000..c3e6b3dd935e7bad983f9371e372f9bdf7a2a838 --- /dev/null +++ b/extras/migrations/migration.py @@ -0,0 +1,62 @@ +""" +Some utility functions for our simple migrations. + +@todo: Perhaps this should be moved to the mutalyzer package.
+""" + + +import sys +import MySQLdb + + +COLOR_INFO = '\033[32m' +COLOR_WARNING = '\033[33m' +COLOR_ERROR = '\033[31m' +COLOR_END = '\033[0m' + + +def print_color(message, color=None): + if color is None: + print message + else: + print color + message + COLOR_END + + +def info(message): + print_color(message, COLOR_INFO) + + +def warning(message): + print_color(message, COLOR_WARNING) + + +def error(message): + print_color(message, COLOR_ERROR) + + +def fatal(message): + error(message) + sys.exit(1) + + +def db_connect(database): + try: + connection = MySQLdb.connect(host='localhost', + user='mutalyzer', + passwd='', + db=database) + except MySQLdb.Error as e: + fatal('Error %d: %s' % (e.args[0], e.args[1])) + return connection + + +def main(check, migrate): + needed = check() + if needed: + warning('This migration is needed.') + if len(sys.argv) > 1 and sys.argv[1] == 'migrate': + print 'Performing migration.' + migrate() + print 'Performed migration.' + else: + info('This migration is not needed.') diff --git a/extras/post-install.sh b/extras/post-install.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9dfa1ea798a7e7d8038d1530687275a7b06e909 --- /dev/null +++ b/extras/post-install.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# Post-install script for Mutalyzer. Run after the setuptools installation +# (python setup.py install). +# +# Notice: The definitions in this file are quite specific to the standard +# Mutalyzer environment. This consists of a Debian stable (Squeeze) system +# with Apache and Mutalyzer using its mod_wsgi module. Debian conventions are +# used throughout. See the README file for more information. 
+# +# Usage (from the source root directory): +# sudo bash extras/post-install.sh +# +# Todo: +# - Copy doc to /usr/share/doc +# - General cleanup + +set -e + +COLOR_INFO='\033[32m' +COLOR_WARNING='\033[33m' +COLOR_ERROR='\033[31m' +COLOR_END='\033[0m' + +# The 'cd /' is a hack to prevent the mutalyzer package under the current +# directory to be used. +PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') +BIN_BATCHD=$(which mutalyzer-batchd) +BIN_CACHE_SYNC=$(which mutalyzer-cache-sync) +BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) +BIN_WEBSITE=$(which mutalyzer-website.wsgi) +BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) + +if [ ! -e /etc/mutalyzer/config ]; then + echo -e "${COLOR_INFO}Creating /etc/mutalyzer/config${COLOR_END}" + mkdir -p /etc/mutalyzer + cp extras/config.example /etc/mutalyzer/config + chmod -R u=rwX,go=rX /etc/mutalyzer +else + echo -e "${COLOR_WARNING}Not touching /etc/mutalyzer/config (it exists)${COLOR_END}" +fi + +for USERNAME in $(cut -f 1 -d : /etc/passwd); do + if [ -d "/home/${USERNAME}" ]; then + echo -e "${COLOR_INFO}Creating /home/${USERNAME}/.config/mutalyzer/config${COLOR_END}" + echo -e "${COLOR_INFO}Creating /home/${USERNAME}/.cache/mutalyzer${COLOR_END}" + su $USERNAME -c "mkdir -p /home/$USERNAME/.config/mutalyzer" + su $USERNAME -c "mkdir -p /home/$USERNAME/.cache/mutalyzer" + su $USERNAME -c "cp extras/config.user.example /home/$USERNAME/.config/mutalyzer/config" + su $USERNAME -c "touch /tmp/mutalyzer-$USERNAME.log" + sed -i -e "s@<USERNAME>@${USERNAME}@g" /home/$USERNAME/.config/mutalyzer/config + fi +done + +echo -e "${COLOR_INFO}Touching /var/log/mutalyzer.log${COLOR_END}" +touch /var/log/mutalyzer.log +chown www-data:www-data /var/log/mutalyzer.log +chmod u=rw,go=r /var/log/mutalyzer.log + +echo -e "${COLOR_INFO}Touching /var/cache/mutalyzer${COLOR_END}" +mkdir -p /var/cache/mutalyzer +chown -R www-data:www-data /var/cache/mutalyzer +chmod -R u=rwX,go=rX /var/cache/mutalyzer 
+ +echo -e "${COLOR_INFO}Creating /etc/init.d/mutalyzer-batchd${COLOR_END}" +cp extras/init.d/mutalyzer-batchd /etc/init.d/mutalyzer-batchd +sed -i -e "s@<MUTALYZER_BIN_BATCHD>@${BIN_BATCHD}@g" /etc/init.d/mutalyzer-batchd +chmod u=rwx,go=rx /etc/init.d/mutalyzer-batchd + +echo -e "${COLOR_INFO}Installing init scripts${COLOR_END}" +update-rc.d -f mutalyzer-batchd remove +update-rc.d mutalyzer-batchd defaults 98 02 + +echo -e "${COLOR_INFO}Installing crontab${COLOR_END}" +cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync +sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync +cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update +sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update + +echo -e "${COLOR_INFO}Creating /etc/apache2/conf.d/mutalyzer.conf${COLOR_END}" +cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf +sed -i -e "s@<MUTALYZER_BIN_WEBSITE>@${BIN_WEBSITE}@g" -e "s@<MUTALYZER_BIN_WEBSERVICE>@${BIN_WEBSERVICE}@g" -e "s@<MUTALYZER_BIN_BATCHD>@${BIN_BATCHD}@g" /etc/apache2/conf.d/mutalyzer.conf +chmod u=rw,go=r /etc/apache2/conf.d/mutalyzer.conf + +echo "You will now be asked for the MySQL root password" + +# Create databases +cat << EOF | mysql -u root -p + CREATE USER mutalyzer; + CREATE DATABASE mutalyzer; + CREATE DATABASE hg18; + CREATE DATABASE hg19; + GRANT ALL PRIVILEGES ON mutalyzer.* TO mutalyzer; + GRANT ALL PRIVILEGES ON hg18.* TO mutalyzer; + GRANT ALL PRIVILEGES ON hg19.* TO mutalyzer; + FLUSH PRIVILEGES; +EOF + +echo -e "${COLOR_INFO}Creating tables in hg18 database${COLOR_END}" + +# Create ChrName and Mapping table (hg18) +cat << EOF | mysql -u mutalyzer -D hg18 +CREATE TABLE ChrName ( + AccNo char(20) NOT NULL, + name char(20) NOT NULL, + PRIMARY KEY (AccNo) +); +CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6)
DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) +); +INSERT INTO ChrName (AccNo, name) VALUES +('NC_000001.9', 'chr1'), +('NC_000002.10', 'chr2'), +('NC_000003.10', 'chr3'), +('NC_000004.10', 'chr4'), +('NC_000005.8', 'chr5'), +('NC_000006.10', 'chr6'), +('NC_000007.12', 'chr7'), +('NC_000008.9', 'chr8'), +('NC_000009.10', 'chr9'), +('NC_000010.9', 'chr10'), +('NC_000011.8', 'chr11'), +('NC_000012.10', 'chr12'), +('NC_000013.9', 'chr13'), +('NC_000014.7', 'chr14'), +('NC_000015.8', 'chr15'), +('NC_000016.8', 'chr16'), +('NC_000017.9', 'chr17'), +('NC_000018.8', 'chr18'), +('NC_000019.8', 'chr19'), +('NC_000020.9', 'chr20'), +('NC_000021.7', 'chr21'), +('NC_000022.9', 'chr22'), +('NC_000023.9', 'chrX'), +('NC_000024.8', 'chrY'), +('NC_001807.4', 'chrM'), +('NT_113891.1', 'chr6_cox_hap1'), +('NT_113959.1', 'chr22_h2_hap1'); +EOF + +echo -e "${COLOR_INFO}Populating Mapping table with NCBI data (hg18)${COLOR_END}" + +# Populate Mapping table with NCBI data (hg18) +wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md +echo -e "${COLOR_INFO}Importing NCBI mapping data, this may take a few minutes (hg18)${COLOR_END}" +"$BIN_MAPPING_UPDATE" hg18 /tmp/seq_gene.md reference + +echo -e "${COLOR_INFO}Creating tables in hg19 database${COLOR_END}" + +# Create ChrName and Mapping table (hg19) +cat << EOF | mysql -u mutalyzer -D hg19 +CREATE TABLE ChrName ( + AccNo char(20) NOT NULL, + name char(20) NOT NULL, + PRIMARY KEY (AccNo) +); +CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6)
DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) +); +INSERT INTO ChrName (AccNo, name) VALUES +('NC_000001.10', 'chr1'), +('NC_000002.11', 'chr2'), +('NC_000003.11', 'chr3'), +('NC_000004.11', 'chr4'), +('NC_000005.9', 'chr5'), +('NC_000006.11', 'chr6'), +('NC_000007.13', 'chr7'), +('NC_000008.10', 'chr8'), +('NC_000009.11', 'chr9'), +('NC_000010.10', 'chr10'), +('NC_000011.9', 'chr11'), +('NC_000012.11', 'chr12'), +('NC_000013.10', 'chr13'), +('NC_000014.8', 'chr14'), +('NC_000015.9', 'chr15'), +('NC_000016.9', 'chr16'), +('NC_000017.10', 'chr17'), +('NC_000018.9', 'chr18'), +('NC_000019.9', 'chr19'), +('NC_000020.10', 'chr20'), +('NC_000021.8', 'chr21'), +('NC_000022.10', 'chr22'), +('NC_000023.10', 'chrX'), +('NC_000024.9', 'chrY'), +('NT_167244.1', 'chr6_apd_hap1'), +('NT_113891.2', 'chr6_cox_hap2'), +('NT_167245.1', 'chr6_dbb_hap3'), +('NT_167246.1', 'chr6_mann_hap4'), +('NT_167247.1', 'chr6_mcf_hap5'), +('NT_167248.1', 'chr6_qbl_hap6'), +('NT_167249.1', 'chr6_ssto_hap7'), +('NT_167250.1', 'chr4_ctg9_hap1'), +('NT_167251.1', 'chr17_ctg5_hap1'); +EOF + +echo -e "${COLOR_INFO}Populating Mapping table with NCBI data (hg19)${COLOR_END}" + +# Populate Mapping table with NCBI data (hg19) +#wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md +wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.2/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md +echo -e "${COLOR_INFO}Importing NCBI mapping data, this may take a few minutes (hg19)${COLOR_END}" +"$BIN_MAPPING_UPDATE" hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' + +echo -e
"${COLOR_INFO}Creating tables in mutalyzer database${COLOR_END}" + +# Create mutalyzer tables +cat << EOF | mysql -u mutalyzer -D mutalyzer +CREATE TABLE BatchJob ( + JobID char(20) NOT NULL, + Filter char(20) NOT NULL, + EMail char(255) NOT NULL, + FromHost char(255) NOT NULL, + JobType char(20) DEFAULT NULL, + Arg1 char(20) DEFAULT NULL, + PRIMARY KEY (JobID) +); +CREATE TABLE BatchQueue ( + QueueID int(5) NOT NULL AUTO_INCREMENT, + JobID char(20) NOT NULL, + Input char(255) NOT NULL, + Flags char(20) DEFAULT NULL, + PRIMARY KEY (QueueID), + KEY JobQueue (JobID,QueueID) +); +CREATE TABLE GBInfo ( + AccNo char(20) NOT NULL DEFAULT '', + GI char(13) DEFAULT NULL, + hash char(32) NOT NULL DEFAULT '', + ChrAccVer char(20) DEFAULT NULL, + ChrStart int(12) DEFAULT NULL, + ChrStop int(12) DEFAULT NULL, + orientation int(2) DEFAULT NULL, + url char(255) DEFAULT NULL, + created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (AccNo), + UNIQUE KEY hash (hash), + UNIQUE KEY alias (GI), + INDEX (created) +); +CREATE TABLE Link ( + mrnaAcc char(20) NOT NULL, + protAcc char(20) NOT NULL, + PRIMARY KEY (mrnaAcc), + UNIQUE KEY protAcc (protAcc) +); +EOF + +# The remainder is essentially the same as post-upgrade.sh + +if [ ! 
-e /var/www/mutalyzer ]; then + mkdir -p /var/www/mutalyzer +fi + +if [ -e /var/www/mutalyzer/base ]; then + echo "Removing /var/www/mutalyzer/base" + rm /var/www/mutalyzer/base +fi + +echo -e "${COLOR_INFO}Symlinking /var/www/mutalyzer/base to $PACKAGE_ROOT/templates/base${COLOR_END}" +ln -s $PACKAGE_ROOT/templates/base /var/www/mutalyzer/base + +echo "Restarting Apache" +/etc/init.d/apache2 restart + +echo "Starting Mutalyzer batch daemon" +/etc/init.d/mutalyzer-batchd start diff --git a/extras/post-upgrade.sh b/extras/post-upgrade.sh new file mode 100644 index 0000000000000000000000000000000000000000..37ae527a2e4f1f713cc99736caf6e926a4b4991e --- /dev/null +++ b/extras/post-upgrade.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Post-upgrade script for Mutalyzer. Run after the setuptools installation +# (python setup.py install). +# +# Notice: The definitions in this file are quite specific to the standard +# Mutalyzer environment. This consists of a Debian stable (Squeeze) system +# with Apache and Mutalyzer using its mod_wsgi module. Debian conventions are +# used throughout. See the README file for more information. +# +# Usage (from the source root directory): +# sudo bash extras/post-upgrade.sh + +set -e + +COLOR_INFO='\033[32m' +COLOR_WARNING='\033[33m' +COLOR_ERROR='\033[31m' +COLOR_END='\033[0m' + +# The 'cd /' is a hack to prevent the mutalyzer package under the current +# directory to be used. +PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') +BIN_WEBSITE=$(which mutalyzer-website.wsgi) +BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) + +if [ ! 
-e /var/www/mutalyzer ]; then + mkdir -p /var/www/mutalyzer +fi + +if [ -e /var/www/mutalyzer/base ]; then + echo "Removing /var/www/mutalyzer/base" + rm /var/www/mutalyzer/base +fi + +echo -e "${COLOR_INFO}Symlinking /var/www/mutalyzer/base to $PACKAGE_ROOT/templates/base${COLOR_END}" +ln -s $PACKAGE_ROOT/templates/base /var/www/mutalyzer/base + +echo "Running any needed migrations" +for MIGRATION in extras/migrations/*.migration; do + echo "Checking migration $(basename $MIGRATION)" + $MIGRATION migrate +done + +echo -e "${COLOR_INFO}Assuming mod_wsgi daemon mode, not restarting Apache${COLOR_END}" +#/etc/init.d/apache2 restart + +echo "Touching WSGI entry to reload application" +touch $BIN_WEBSITE +touch $BIN_WEBSERVICE + +echo "Restarting Mutalyzer batch daemon" +/etc/init.d/mutalyzer-batchd restart diff --git a/extras/pre-install.sh b/extras/pre-install.sh new file mode 100644 index 0000000000000000000000000000000000000000..8af0b585beae93da581e28719483aa4324ff0f79 --- /dev/null +++ b/extras/pre-install.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Pre-install script for Mutalyzer on Debian or Debian-like systems. Run +# before the setuptools installation (python setup.py install). +# +# Notice: The definitions in this file are quite specific to the standard +# Mutalyzer environment. This consists of a Debian stable (Squeeze) system +# with Apache and Mutalyzer using its mod_wsgi module. Debian conventions are +# used throughout. See the README file for more information. 
+# +# Usage (from the source root directory): +# sudo bash extras/pre-install.sh + +set -e + +COLOR_INFO='\033[32m' +COLOR_WARNING='\033[33m' +COLOR_ERROR='\033[31m' +COLOR_END='\033[0m' + +echo -e "${COLOR_INFO}Installing packages with apt${COLOR_END}" + +apt-get install -y \ + mysql-server \ + python \ + python-mysqldb \ + python-biopython \ + python-pyparsing \ + python-configobj \ + python-simpletal \ + python-soappy \ + python-magic \ + python-psutil \ + python-xlrd \ + python-daemon \ + python-webpy \ + python-webtest \ + python-nose \ + apache2 \ + libapache2-mod-wsgi \ + python-setuptools \ + git-core + +echo -e "${COLOR_INFO}Installing latest soaplib from git master${COLOR_END}" + +mkdir -p /tmp/mutalyzer-install +pushd /tmp/mutalyzer-install + +git clone https://github.com/soaplib/soaplib.git +cd soaplib +python setup.py install + +popd +rm -Rf /tmp/mutalyzer-install + +echo -e "${COLOR_INFO}Installing suds using easy_install${COLOR_END}" + +easy_install suds diff --git a/extras/soap-tools/getCache.py b/extras/soap-tools/getCache.py new file mode 100755 index 0000000000000000000000000000000000000000..cc8ecc18c20568193e0ae3da681537c4473f261d --- /dev/null +++ b/extras/soap-tools/getCache.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +""" +Get cache entries from a Mutalyzer installation. + +Usage: + {command} days + + days: Retrieve entries of at most this number of days old. + +The cache entries are retrieved from the Mutalyzer SOAP webservice and printed +to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from datetime import datetime, timedelta +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(days): + """ + Get cache entries and print them to standard output. 
+ """ + created_since = datetime.today() - timedelta(days=days) + service = Client(WSDL_LOCATION, cache=None).service + result = service.getCache(created_since) + + if result: + for entry in result.CacheEntry: + print 'Entry %s created at %s' % (entry.name, entry.created) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print format_usage() + sys.exit(1) + try: + main(int(sys.argv[1])) + except ValueError: + print 'First argument must be an integer' + sys.exit(1) diff --git a/extras/soap-tools/getGeneAndTranscript.py b/extras/soap-tools/getGeneAndTranscript.py new file mode 100755 index 0000000000000000000000000000000000000000..979e52d99ac97196b5a729f27bc369ac45590444 --- /dev/null +++ b/extras/soap-tools/getGeneAndTranscript.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" +Get transcript and product name from a Mutalyzer installation. + +Usage: + {command} genomic_reference transcript_reference + + genomic_reference: Genomic reference in which to lookup the transcript. + transcript_reference: Reference of the transcript to lookup. + +The transcript and product name are retrieved from the Mutalyzer SOAP +webservice and printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(genomic_reference, transcript_reference): + """ + Get transcript and product name and print them to standard output.
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.getGeneAndTranscript(genomic_reference, + transcript_reference) + + if result: + print 'Transcript: %s' % result.transcriptName + print 'Product: %s' % result.productName + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/extras/soap-tools/getTranscripts.py b/extras/soap-tools/getTranscripts.py new file mode 100755 index 0000000000000000000000000000000000000000..310fb3fa96ada76b61906940b111d3b8aaa387af --- /dev/null +++ b/extras/soap-tools/getTranscripts.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +""" +Get transcript accession numbers that overlap with a chromosomal position. + +Usage: + {command} chromosome position + + chromosome: Chromosome to lookup transcripts for (e.g. 'chrX'). + position: Position to lookup overlapping transcripts for. + +The transcript accession numbers are retrieved from the Mutalyzer SOAP +webservice and printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(chromosome, position): + """ + Get transcript accession numbers and print them to standard output. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.getTranscripts('hg19', chromosome, position, True) + + if result: + for transcript in result.string: + print transcript + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/extras/soap-tools/getTranscriptsAndInfo.py b/extras/soap-tools/getTranscriptsAndInfo.py new file mode 100755 index 0000000000000000000000000000000000000000..ce3b1a223c562a50595c9e0aca03a3c08eed6650 --- /dev/null +++ b/extras/soap-tools/getTranscriptsAndInfo.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +""" +Get extended information on transcripts contained in a genomic reference. + +Usage: + {command} genomic_reference [gene] + + genomic_reference: Genomic reference to look for transcripts in, for + example 'AL449423.14'. + gene: Optionally restrict results to transcripts for this gene. + +The transcript information is retrieved from the Mutalyzer SOAP webservice and +printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(genomic_reference, gene=None): + """ + Get extended transcript information and print this to standard output. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.getTranscriptsAndInfo(genomic_reference, gene) + + if result: + for t in result.TranscriptInfo: + print """Transcript: %s + ID: %s + Product: %s + Locus tag: %s + Link method: %s + Translation: + Start: %s (c), %s (g) + End: %s (c), %s (g) + Sortable end: %s + CDS: + Start: %s (c), %s (g) + End: %s (c), %s (g)""" % \ + (t.name, t.id, t.product, t.locusTag, t.linkMethod, t.cTransStart, + t.gTransStart, t.cTransEnd, t.gTransEnd, t.sortableTransEnd, + t.cCDSStart, t.gCDSStart, t.cCDSStop, t.gCDSStop) + + if 'proteinTranscript' in t: + print """ Protein: + Name: %s + ID: %s + Product: %s""" % \ + (t.proteinTranscript.name, t.proteinTranscript.id, + t.proteinTranscript.product) + + if 'exons' in t: + print ' Exons:' + for e in t.exons.ExonInfo: + print ' %s - %s (c), %s - %s (g)' % \ + (e.cStart, e.cStop, e.gStart, e.gStop) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/extras/soap-tools/getTranscriptsByGeneName.py b/extras/soap-tools/getTranscriptsByGeneName.py new file mode 100755 index 0000000000000000000000000000000000000000..d757dcf1d3a2fe7f0f24f74a9af9d1efdd5b134e --- /dev/null +++ b/extras/soap-tools/getTranscriptsByGeneName.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +""" +Get transcript accession numbers for a gene. + +Usage: + {command} gene + + gene: Gene name to lookup transcripts for. + +The transcript accession numbers are retrieved from the Mutalyzer SOAP +webservice and printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(gene): + """ + Get transcript accession numbers and print them to standard output. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.getTranscriptsByGeneName('hg19', gene) + + if result: + for transcript in result.string: + print transcript + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print format_usage() + sys.exit(1) + main(sys.argv[1]) diff --git a/extras/soap-tools/getdbSNPDescriptions.py b/extras/soap-tools/getdbSNPDescriptions.py new file mode 100755 index 0000000000000000000000000000000000000000..11cc54a803ec40804dc292e50a29ae74ed9fd745 --- /dev/null +++ b/extras/soap-tools/getdbSNPDescriptions.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +""" +Get HGVS descriptions for a dbSNP rs number. + +Usage: + {command} rs_number + + rs_number: A valid dbSNP rs number, e.g. 'rs9919552'. + +The HGVS descriptions are retrieved from the Mutalyzer SOAP webservice and +printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(rs_number): + """ + Get HGVS descriptions and print them to standard output. + """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.getdbSNPDescriptions(rs_number) + + if result: + for description in result.string: + print description + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print format_usage() + sys.exit(1) + main(sys.argv[1]) diff --git a/extras/soap-tools/info.py b/extras/soap-tools/info.py new file mode 100755 index 0000000000000000000000000000000000000000..e7f80ef343d25a4f123dc70ae3e099173e805722 --- /dev/null +++ b/extras/soap-tools/info.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +""" +Get static version information from a Mutalyzer installation. + +Usage: + {command} + +The version information is retrieved from the Mutalyzer SOAP webservice and +printed to standard output. 
+""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(): + """ + Get static version information and print this to standard output. + """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.info() + + if result: + print 'Version: %s' % result.version + print 'Version parts: %s' % ', '.join( + p for p in result.versionParts.string) + print 'Release date: %s' % result.releaseDate + print 'Nomenclature version: %s' % result.nomenclatureVersion + print 'Nomenclature version parts: %s' % ', '.join( + p for p in result.nomenclatureVersionParts.string) + print 'Server name: %s' % result.serverName + print 'Contact e-mail: %s' % result.contactEmail + + +if __name__ == '__main__': + if len(sys.argv) != 1: + print format_usage() + sys.exit(1) + main() diff --git a/extras/soap-tools/numberConversion.py b/extras/soap-tools/numberConversion.py new file mode 100755 index 0000000000000000000000000000000000000000..f72fb5320b6410f00b2d1d28fff4681b41330f70 --- /dev/null +++ b/extras/soap-tools/numberConversion.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +""" +Convert a variant description from c. to g. notation or vice versa. + +Usage: + {command} build description + + build: Human genome reference build to use, i.e. 'hg18' or 'hg19'. + description: Variant description to convert. + +The converted HGVS description(s) is (are) retrieved from the Mutalyzer SOAP +webservice and printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(build, description): + """ + Convert variant description from c. to g. notation or vice versa. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.numberConversion(build, description) + + if result: + for description in result.string: + print description + else: + print 'No descriptions returned.' + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/extras/soap-tools/runMutalyzer.py b/extras/soap-tools/runMutalyzer.py new file mode 100755 index 0000000000000000000000000000000000000000..3d0c47e90d15f3977fca578d8ec3e6f682088419 --- /dev/null +++ b/extras/soap-tools/runMutalyzer.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +""" +Run the Mutalyzer namechecker on a variant description. + +Usage: + {command} description [verbosity] + + description: Variant description to check. + verbosity: If 'verbose', also output full original and variant sequences. + +The namechecker results are retrieved from the Mutalyzer SOAP webservice and +printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(description, verbosity=None): + """ + Run the Mutalyzer namechecker and print results to standard output. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.runMutalyzer(description) + + if result.rawVariants: + for v in result.rawVariants.RawVariant: + print 'Raw variant: %s' % v.description + print '%s\n' % v.visualisation + + if verbosity == 'verbose': + print 'Original:\n%s\n' % result.original + print 'Mutated:\n%s\n' % result.mutated + print 'origMRNA:\n%s\n' % result.origMRNA + print 'mutatedMRNA:\n%s\n' % result.mutatedMRNA + print 'origCDS:\n%s\n' % result.origCDS + print 'newCDS:\n%s\n' % result.newCDS + print 'origProtein:\n%s\n' % result.origProtein + print 'newProtein:\n%s\n' % result.newProtein + print 'altProtein:\n%s\n' % result.altProtein + + print 'Errors: %s' % result.errors + print 'Warnings: %s' % result.warnings + print 'Summary: %s\n' % result.summary + + if result.messages: + for m in result.messages.SoapMessage: + print 'Error %s: %s\n' % (m.errorcode, m.message) + + if 'chromDescription' in result: + print 'Chromosomal description: %s' % result.chromDescription + print 'Genomic description: %s' % result.genomicDescription + + if result.transcriptDescriptions: + print 'Affected transcripts:' + print '\n'.join(result.transcriptDescriptions.string) + if result.proteinDescriptions: + print 'Affected proteins:' + print '\n'.join(result.proteinDescriptions.string) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/extras/soap-tools/sliceChromosomeByGene.py b/extras/soap-tools/sliceChromosomeByGene.py new file mode 100755 index 0000000000000000000000000000000000000000..698a927cfd5a155c9d7484813c843ffc7b0f381c --- /dev/null +++ b/extras/soap-tools/sliceChromosomeByGene.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" +Create a slice of a chromosome by gene. + +Usage: + {command} gene + + gene: Gene symbol of the gene to slice. 
+ +A slice containing the gene with 5000 upstream bases and 2000 downstream bases +is created with the Mutalyzer SOAP webservice. The resulting UD number is +printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client +from suds import WebFault + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(gene): + """ + Slice chromosome by gene and print UD number to standard output. + """ + service = Client(WSDL_LOCATION, cache=None).service + + try: + print service.sliceChromosomeByGene(gene, 'Human', 5000, 2000) + except WebFault as message: + print message + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print format_usage() + sys.exit(1) + main(sys.argv[1]) diff --git a/extras/soap-tools/sp.py b/extras/soap-tools/sp.py new file mode 100755 index 0000000000000000000000000000000000000000..29fa90198636567eced291eca4534cf2f4de6e7a --- /dev/null +++ b/extras/soap-tools/sp.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +# Example SOAP client for the Mutalyzer webservice in Python using the +# SOAPpy library. +# +# See http://www.mutalyzer.nl/2.0/webservices +# +# Usage: +# python sp.py +# +# This code is in the public domain; it can be used for whatever purpose +# with absolutely no restrictions. + +import sys +from SOAPpy import WSDL + +o = WSDL.Proxy("http://localhost/mutalyzer/service.wsdl") + +# Get all transcripts that are hit when we look at position 159272155 on +# chromosome 1. +print "hg19", "chr1", 159272155 +r = o.getTranscripts(build="hg19", chrom="chr1", pos=159272155) +if r: + # This seems to be a bug in SOAPpy. Arrays of length 1 are + # flattened, so we cannot iterate over them. 
+ if not isinstance(r.string, list): + r.string = [r.string] + for i in r.string: + print i, o.getGeneName(build="hg19", accno=i) + +# Get all transcripts and genes that have (part of) a transcript in the range +# 159272155-159372155 on chromosome 1. +print "\n", "hg19", "chr1", 159272155, 159372155, 1 +r = o.getTranscriptsRange(build="hg19", chrom="chr1", pos1=159272155, + pos2=159372155, method=1) +if r: + # This seems to be a bug in SOAPpy. Arrays of length 1 are + # flattened, so we cannot iterate over them. + if not isinstance(r.string, list): + r.string = [r.string] + for i in r.string: + print i, o.getGeneName(build="hg19", accno=i) + +# Get all transcripts and genes that have the entire transcript in the range +# 159272155-159372155 on chromosome 1. +print "\n", "hg19", "chr1", 159272155, 159372155, 0 +r = o.getTranscriptsRange(build="hg19", chrom="chr1", pos1=159272155, + pos2=159372155, method=0) +if r: + # This seems to be a bug in SOAPpy. Arrays of length 1 are + # flattened, so we cannot iterate over them. + if not isinstance(r.string, list): + r.string = [r.string] + for i in r.string: + print i, o.getGeneName(build="hg19", accno=i) + +print "\n", "hg19", "NM_002001.2", "c.2del" +r = o.mappingInfo(LOVD_ver="123", build="hg19", accNo="NM_002001.2", + variant="c.1del") +print r.mutationType +print r.start_g +print r.end_g + +print "\n", "hg19", "NM_002002.2" +r = o.transcriptInfo(LOVD_ver="123", build="hg19", accNo="NM_002001.2") +print r.CDS_stop +print r.trans_start +print r.trans_stop + +print "\n", "hg19", "NM_002001.2:c.1del" +r = o.numberConversion(build="hg19", variant="NM_002001.2:c.1del") +if r: + # This seems to be a bug in SOAPpy. Arrays of length 1 are + # flattened, so we cannot iterate over them. + if not isinstance(r.string, list): + r.string = [r.string] + for i in r.string: + print i + +print "\n", "hg19", "DMD" +r = o.getTranscriptsByGeneName(build="hg19", name="DMD") +if r: + # This seems to be a bug in SOAPpy. 
Arrays of length 1 are + # flattened, so we cannot iterate over them. + if not isinstance(r.string, list): + r.string = [r.string] + for i in r.string: + print i + +print "\n", "NM_002001.2:g.1del" +r = o.runMutalyzer(variant="NM_002001.2:g.1del") +print r.original +print r.mutated +print r.origMRNA +print r.mutatedMRNA +print r.origCDS +print r.newCDS +print r.origProtein +print r.newProtein +print r.altProtein +print r.errors +print r.warnings +print r.summary diff --git a/extras/soap-tools/transcriptInfo.py b/extras/soap-tools/transcriptInfo.py new file mode 100755 index 0000000000000000000000000000000000000000..eaeb5792285abf981b9b4f3c65f97d061547b7d8 --- /dev/null +++ b/extras/soap-tools/transcriptInfo.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" +Get transcript information from a Mutalyzer installation. + +Usage: + {command} transcript + + transcript: Transcript accession number, e.g. 'NM_002001.2'. + +The transcript information is retrieved from the Mutalyzer SOAP webservice and +printed to standard output. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import sys +from suds.client import Client + +from mutalyzer.util import format_usage + + +WSDL_LOCATION = 'http://localhost/mutalyzer/services/?wsdl' + + +def main(transcript): + """ + Get transcript information and print it to standard output. 
+ """ + service = Client(WSDL_LOCATION, cache=None).service + result = service.transcriptInfo(LOVD_ver='123', build='hg19', + accNo=transcript) + + if result: + print 'Transcript start: %s' % result.trans_start + print 'Transcript stop: %s' % result.trans_stop + print 'CDS stop: %s' % result.CDS_stop + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print format_usage() + sys.exit(1) + main(sys.argv[1]) diff --git a/fabfile.py b/fabfile.py new file mode 100644 index 0000000000000000000000000000000000000000..55a4c5a20fe2f6d69c036b2691fc5c9777a1e6cd --- /dev/null +++ b/fabfile.py @@ -0,0 +1,78 @@ +""" +Fabric fabfile for Mutalyzer. + +Notice: The definitions in this file are quite specific to the standard +Mutalyzer environment. This consists of a Debian stable (Squeeze) system with +Apache and Mutalyzer using its mod_wsgi module. Debian conventions are used +throughout. See the README file for more information. + +To do a deployment on a server with an existing configured Mutalyzer +installation: + + $ fab deploy -H server1.mutalyzer.nl + +For a fresh deployment on a new server: + + $ fab deploy:bootstrap=yes -H server1.mutalyzer.nl +""" + + +from fabric.api import * + + +def deploy(bootstrap='no'): + """ + Deploy Mutalyzer on the remote host. + + Create a source distribution, transfer it to the remote host, and install + from there. After installation, we restart Apache and the Mutalyzer batch + daemon. + + Additionally, if bootstrap=yes, install all dependencies before Mutalyzer + installation, and bootstrap the Mutalyzer configuration afterwards (i.e. + create and fill database, add cron script, create cache directory, etc). + """ + # Currently, Fabric only supports task arguments as strings. + bootstrap = (bootstrap == 'yes') + + # Create a new source distribution as a tarball. + local('python setup.py sdist --formats=gztar') + + # Figure out the release name and tarball filename. 
+ dist = local('python setup.py --fullname', capture=True).strip() + tarball = '%s.tar.gz' % dist + + # Create a place where we can unzip the source tarball. + run('mkdir /tmp/mutalyzer') + + # Upload the source tarball to the temporary folder on the server. + put('dist/%s' % tarball, '/tmp/mutalyzer/%s' % tarball) + + # Go to that directory, unzip and install it. + with cd('/tmp/mutalyzer'): + run('tar xzf %s' % tarball) + + # Go to the tarball's contents and do the installation. + with cd('/tmp/mutalyzer/%s' % dist): + + if bootstrap: + # Install dependencies. + sudo('bash extras/pre-install.sh') + + # Install Mutalyzer. + sudo('python setup.py install') + + if bootstrap: + # Configure Mutalyzer. + sudo('bash extras/post-install.sh') + else: + # Restart services. + sudo('bash extras/post-upgrade.sh') + + # Run unittests. + #run('MUTALYZER_ENV=test nosetests -v') + + # Now that all is set up, delete the folder again. + # I don't like to 'sudo rm -Rf' but since there were files created by + # root in this directory, we have to. + sudo('rm -Rf /tmp/mutalyzer') diff --git a/install.sh b/install.sh deleted file mode 100644 index 9b2db630e3c2dd6ced44d8f16fb7a68b4586857f..0000000000000000000000000000000000000000 --- a/install.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh - -updateCron() { - cron_entry="$1 python `pwd`/src/$2.py" - - if ! `crontab -l | grep "$cron_entry" > /dev/null`; then - echo "Updating cron entry." - if `crontab -l | grep $2 > /dev/null`; then - echo "Removing old entry." - crontab -l | grep -v $2 | crontab - fi - echo "Installing new entry." - ( - crontab -l - echo $cron_entry - ) | crontab - fi -} - -if `echo $0 | grep '/' > /dev/null`; then - echo "Please run this script from the installation directory." 
- exit 1 -fi - -updateCron "25 6 \* \* \*" "UCSC_update" -updateCron "*/1 \* \* \* \*" "BatchChecker" - -cat << EOF > .htaccess -SetHandler mod_python -PythonHandler src/handler -PythonPath "sys.path + ['`pwd`/src']" -PythonDebug On - -RewriteEngine on -RewriteRule Variant_info.php Variant_info -RewriteRule .*bugtracker.* https://www.mutalyzer.nl/projects/mutalyzer2/ -EOF - -chmod go+rx . src src/Modules templates -chmod go+r .htaccess mutalyzer.conf src/*.py src/Modules/*.py templates/* -chmod go+rw var diff --git a/mail.txt b/mail.txt deleted file mode 100644 index 518cf70af9b4c429d3859fe6a05a0c2aa8caaf1f..0000000000000000000000000000000000000000 --- a/mail.txt +++ /dev/null @@ -1,12 +0,0 @@ -Dear submitter, - -The batch operation you have submitted, has been processed successfully. - -Your results can be found here: -%s - -Thanks for using Mutalyzer. - - -With kind regards, -Mutalyzer batch checker. diff --git a/mutalyzer.sql b/mutalyzer.sql deleted file mode 100644 index 4ec13599be14749961158e1a8ebdd3f8427ca5df..0000000000000000000000000000000000000000 --- a/mutalyzer.sql +++ /dev/null @@ -1,50 +0,0 @@ -CREATE DATABASE `mutalyzer` - -USE `mutalyzer`; - -CREATE TABLE `BatchJob` ( - `JobID` char(20) NOT NULL, - `Filter` char(20) NOT NULL, - `EMail` char(255) NOT NULL, - `FromHost` char(255) NOT NULL, - `JobType` char(20) DEFAULT NULL, - `Arg1` char(20) DEFAULT NULL, - PRIMARY KEY (`JobID`) -); - -CREATE TABLE `BatchQueue` ( - `QueueID` int(5) NOT NULL AUTO_INCREMENT, - `JobID` char(20) NOT NULL, - `Input` char(255) NOT NULL, - `Flags` char(20) DEFAULT NULL, - PRIMARY KEY (`QueueID`) -); -CREATE TABLE `GBInfo` ( - `AccNo` char(20) NOT NULL DEFAULT '', - `GI` char(13) DEFAULT NULL, - `hash` char(32) NOT NULL DEFAULT '', - `ChrAccVer` char(20) DEFAULT NULL, - `ChrStart` int(12) DEFAULT NULL, - `ChrStop` int(12) DEFAULT NULL, - `orientation` int(2) DEFAULT NULL, - `url` char(255) DEFAULT NULL, - PRIMARY KEY (`AccNo`), - UNIQUE KEY `hash` (`hash`), - UNIQUE KEY 
`alias` (`GI`) -); - -CREATE TABLE `Link` ( - `mrnaAcc` char(20) NOT NULL, - `protAcc` char(20) NOT NULL, - PRIMARY KEY (`mrnaAcc`), - UNIQUE KEY `protAcc` (`protAcc`) -); - -CREATE TABLE `mm1` ( - `hg18` char(50) DEFAULT NULL, - `hg19` char(50) DEFAULT NULL -); -CREATE TABLE `mm2` ( - `hg18` char(50) DEFAULT NULL, - `hg19` char(50) DEFAULT NULL -); diff --git a/src/Modules/Crossmap.py b/mutalyzer/Crossmap.py similarity index 97% rename from src/Modules/Crossmap.py rename to mutalyzer/Crossmap.py index 08c5747a43c41af32bc6662f814e7523f1c51b92..cd289e3ebd5e558886bc1204401f4e81cb759847 100644 --- a/src/Modules/Crossmap.py +++ b/mutalyzer/Crossmap.py @@ -435,7 +435,7 @@ class Crossmap() : return int(s) #main2int - def int2offset(self, t) : + def int2offset(self, t, fuzzy=False): """ Convert a tuple of integers to offset-notation. This adds a `+', and `u' or `d' to the offset when appropriate. The main value is @@ -443,17 +443,22 @@ class Crossmap() : @arg t: A tuple of integers: (main, offset) in __STOP notation @type t: tuple + @kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is + unknown). + @type fuzzy: bool @return: The offset in HGVS notation @rtype: string """ if t[1] > 0 : # The exon boundary is downstream. + if fuzzy: return '+?' if t[0] >= self.__trans_end : # It is downstream of the last exon. return "+d" + str(t[1]) return '+' + str(t[1]) #if if t[1] < 0 : # The exon boundary is uptream. + if fuzzy: return '-?' if t[0] <= self.__trans_start : # It is upstream of the first exon. return "-u" + str(-t[1]) return str(t[1]) @@ -490,32 +495,38 @@ class Crossmap() : return int(s[1:]) #offset2int - def tuple2string(self, t) : + def tuple2string(self, t, fuzzy=False) : """ Convert a tuple (main, offset) in __STOP notation to I{c.} notation. @arg t: A tuple (main, offset) in __STOP notation @type t: tuple + @kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is + unknown). 
+ @type fuzzy: bool @return: The position in HGVS notation @rtype: string """ - return str(self.int2main(t[0])) + str(self.int2offset(t)) + return str(self.int2main(t[0])) + str(self.int2offset(t, fuzzy)) #tuple2string - def g2c(self, a) : + def g2c(self, a, fuzzy=False) : """ Uses both g2x() and tuple2string() to translate a genomic position to __STOP notation to I{c.} notation. @arg a: The genomic position that must be translated @type a: integer + @kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is + unknown). + @type fuzzy: bool @return: The position in HGVS notation @rtype: string """ - return self.tuple2string(self.g2x(a)) + return self.tuple2string(self.g2x(a), fuzzy) #g2c def info(self) : @@ -567,7 +578,7 @@ class Crossmap() : #Crossmap # -# Unit test. +# Unit test. Todo: move to /tests # if __name__ == "__main__" : # Build a crossmapper for a hypothetical gene. diff --git a/src/Modules/Db.py b/mutalyzer/Db.py similarity index 71% rename from src/Modules/Db.py rename to mutalyzer/Db.py index 8e3a1108bb330a3b6e37caf79eb80a6b761bd77b..73c400467b58d51d68c4c159b6504f69c9919675 100644 --- a/src/Modules/Db.py +++ b/mutalyzer/Db.py @@ -10,24 +10,22 @@ statements. @requires: types @requires: time @requires: os -@requires: Modules.Misc """ #Public classes: # - Db ; Log in to a database and keep it open for queries. # - Mapping ; Mapping of transcripts and genes. -# - Remote ; Retrieving updates for the mapping databases. -# - Update ; Updating the mapping databases. # - Cache ; Cache administration. # - Batch ; Batch checker. -import MySQLdb # connect(), escape_string() -import types # TupleType -import time # strftime() -import os # os.remove() +import types +import warnings + +import MySQLdb + +from mutalyzer import util -from Modules import Misc # ID() # # Note that compound queries are split into single queries because of a bug @@ -94,7 +92,7 @@ class Db() : if args != (None,) : # Don't escape the empty string. 
for i in args : if i : - if type(i) == types.StringType : + if type(i) in [types.StringType, types.UnicodeType]: escaped_args.append(MySQLdb.escape_string(str(i))) else : escaped_args.append(i) @@ -134,7 +132,7 @@ class Mapping(Db) : - query(statement) ; General query function. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping; Accumulated mapping info. """ def __init__(self, build, config) : @@ -151,66 +149,12 @@ class Mapping(Db) : Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) #__init__ - def get_protAcc(self, mrnaAcc) : - """ - Query the database for a protein ID given an mRNA ID. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The protein ID - @rtype: string - """ - - statement = """ - SELECT protAcc - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0][0] - #get_protAcc - - def get_NM_info(self, mrnaAcc, version = None) : - """ - Retrieve various data for an NM number. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - @arg version: version number of the accession number (not used) - @type version: integer - - @return: - - exonStarts ; List of exon start sites. - - exonEnds ; List of exon end sites. - - cdsStart ; Position of the start codon. - - cdsEnd ; Position of the end codon. - - strand ; Orientation of the gene (+ = forward, - - = reverse) - @rtype: list - """ - - statement = """ - SELECT exonStarts, exonEnds, cdsStart, cdsEnd, strand - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0] - #get_NM_info - def get_NM_version(self, mrnaAcc) : """ Get the version number of an accession number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. 
@arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -221,8 +165,8 @@ class Mapping(Db) : statement = """ SELECT version - FROM map - WHERE acc = %s; + FROM Mapping + WHERE transcript = %s; """, mrnaAcc return [int(i[0]) for i in self.query(statement)] @@ -234,7 +178,7 @@ class Mapping(Db) : If the version number is None, use the "newest" version number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -243,27 +187,34 @@ class Mapping(Db) : @return: The version number @rtype: integer - """ + @todo: The 'order by chrom asc' is a quick hack to make sure we first + get a primary assembly mapping instead of some haplotype mapping + for genes in the HLA cluster. + A better fix is to return the entire list of mappings, and/or + remove all secondary mappings for the HLA cluster. + See also test_converter.test_hla_cluster and bug #58. + """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc, - MAX(version) - from map + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein + from Mapping """ if version is None: q += """ - where acc = %s; + where transcript = %s + order by version desc, chromosome asc; """ statement = (q, mrnaAcc) else: q += """ - where acc = %s and - version = %s; + where transcript = %s and + version = %s + order by chromosome asc; """ statement = q, (mrnaAcc, version) @@ -277,7 +228,7 @@ class Mapping(Db) : should be returned, set overlap to 0. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. 
@arg chrom: The chromosome (coded as "chr1", ..., "chrY") @type chrom: string @@ -289,35 +240,35 @@ class Mapping(Db) : - 0 ; Return only the transcripts that completely fall in the range [p1, p2] - 1 ; Return all hit transcripts - @type overlap: boolean + @type overlap: boolean @return: All accession numbers that are hit according to the overlap criterium @rtype: list """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc, + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein, version - from map + from Mapping """ if overlap: q += """ - WHERE chrom = %s AND - txStart <= "%s" AND - txEnd >= "%s"; + WHERE chromosome = %s AND + start <= "%s" AND + stop >= "%s"; """ statement = q, (chrom, p2, p1) else: q += """ - WHERE chrom = %s AND - txStart >= "%s" AND - txEnd <= "%s"; + WHERE chromosome = %s AND + start >= "%s" AND + stop <= "%s"; """ statement = q, (chrom, p1, p2) @@ -332,16 +283,16 @@ class Mapping(Db) : geneName ; Name of a gene. SQL tables from dbNames: - map ; Accumulated mapping info. + Mapping ; Accumulated mapping info. Returns: list ; A list of transcripts. """ statement = """ - SELECT acc, version - FROM map - WHERE geneName = %s; + SELECT transcript, version + FROM Mapping + WHERE gene = %s; """, geneName ret = self.query(statement) @@ -359,7 +310,7 @@ class Mapping(Db) : Get the name of a gene, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -369,9 +320,9 @@ class Mapping(Db) : """ statement = """ - SELECT geneName - FROM map - WHERE acc = %s; + SELECT gene + FROM Mapping + WHERE transcript = %s; """, mrnaAcc ret = self.query(statement) @@ -385,7 +336,7 @@ class Mapping(Db) : Check if the given name is a valid chromosome name. 
SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg name: The name to be tested @type name: string @@ -397,8 +348,8 @@ class Mapping(Db) : statement = """ SELECT COUNT(*) - FROM map - WHERE chrom = %s; + FROM Mapping + WHERE chromosome = %s; """, name if int(self.query(statement)[0][0]) > 0 : @@ -463,7 +414,7 @@ class Mapping(Db) : Get the chromosome name, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg acc: The NM accession number (version NOT included) @type acc: string @@ -473,335 +424,238 @@ class Mapping(Db) : """ statement = """ - SELECT chrom - FROM map - WHERE acc = %s; + SELECT chromosome + FROM Mapping + WHERE transcript = %s; """, acc - print acc + ret = self.query(statement) if ret : return ret[0][0] return None #get_chromName -#Mapper - -class Remote(Db) : - """ - Database functions for retrieving updates for the mapping databases. - Special methods: - - __init__(config) ; Initialise the class. - - Public methods: - - get_Update() ; Retrieve new mapping info from the UCSC. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ - - def __init__(self, build, config) : + def merge_update(self): """ - Initialise the Db parent class. Use the remote database for a - certain build. + Merge existing mapping information with new mapping information, which + should be in table 'MappingTemp'. - Private variables (altered): - - __config ; Configuration variables. 
- - @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ - - self.__config = config - Db.__init__(self, build, config.RemoteMySQLuser, config.RemoteMySQLhost) - #__init__ - - def get_Update(self) : - """ - Retrieve all mapping updates from the UCSC within a certain time - window (defined in the configuration file) and gather the results - into one mapping table. - - The results will be written to a temporary file (also defined in - the configuration file) to be imported in the local database with - the load_Update() function. + The strategy is as follows. Existing mappings (accumulated by + Mutalyzer in the past) that are not in the new mapping information are + added to the new mapping information. The resulting set is used as the + mapping information from now on. + This way, we get the latest updates for existing mappings and keep old + mappings not in the updated information. SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ + - Mapping ; Accumulated mapping info. + - MappingTemp ; New mapping info. + - MappingBackup ; Backup of accumulated mapping info. - statement = """ - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc - AND time >= DATE_SUB(CURDATE(), INTERVAL %s DAY); - """, self.__config.UpdateInterval - - handle = open(self.__config.TempFile, "w") - - # Convert the results to a tab delimited file. - for i in self.query(statement) : - for j in i : - handle.write(str(j) + chr(0x09)) # 0x09 is a TAB. 
- handle.write('\n') - #for - - handle.close() - #get_Update -#Remote - -class Update(Db) : - """ - Database functions for updating the mapping databases. - - Public methods: - - load_Update() ; Load new mapping info into the local database. - - count_Updates() ; Count the number of entries in the new - mapping info table. - - backup_cdsUpdates() ; Make a backup of updates that overwrite the - old mapping info. - - count_cdsUpdates() ; Count the number of updates that overwrite - the old mapping info. - - merge_cdsUpdates() ; Merge the backup of old mapping info with the - other old info. - - merge_Update() ; Merge the new mapping info from the UCSC with - what we already have. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Newly found data. - - map_new ; Merge of map_temp and map. - - map_cdsBackup_temp ; Entries that were updated without an increment - of the version number. - - map_cdsBackup ; Merge of map_cdsBackup_temp and itself. - """ - - def __init__(self, build, config) : + @note: We temporarily suppress warnings during some queries, since + they are expected and clutter the console output (e.g. warnings + for existing tables). + @todo: Return number of entries added/updated. """ - Initialise the Db parent class. Use the remote database for a - certain build. + statement = """ + CREATE TABLE IF NOT EXISTS MappingTemp LIKE Mapping; + """, None + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.query(statement) - Private variables (altered): - - __config ; Configuration variables. 
+ statement = """ + INSERT INTO MappingTemp + SELECT * FROM Mapping AS OldM + WHERE NOT EXISTS ( + SELECT * FROM MappingTemp AS NewM + WHERE OldM.transcript = NewM.transcript + AND OldM.version = NewM.version + ); + """, None + self.query(statement) - @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ + statement = """ + DROP TABLE IF EXISTS MappingBackup; + """, None + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.query(statement) - self.__config = config - Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) - #__init__ + statement = """ + RENAME TABLE Mapping TO MappingBackup, MappingTemp TO Mapping; + """, None + self.query(statement) + #merge_update - def load_Update(self) : + def ncbi_create_temporary_tables(self): """ - Load the updates from the temporary file (defined in the - configuration file) created by the get_Update() function and import - it in the local database. - - SQL tables from dbNames (altered): - - map_temp ; Created and loaded with data from TempFile. + Create temporary tables to import NCBI mapping into. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. 
+ self.ncbi_drop_temporary_tables() statement = """ - CREATE TABLE map_temp LIKE map; + CREATE TABLE Genes ( + id varchar(20) NOT NULL DEFAULT '', + name varchar(255) DEFAULT NULL, + PRIMARY KEY (id) + ); """, None self.query(statement) - statement = """ - LOAD DATA LOCAL INFILE %s - INTO TABLE map_temp; - """, self.__config.TempFile + statement = """ + CREATE TABLE Transcripts ( + name varchar(20) NOT NULL DEFAULT '', + gene_id varchar(20) DEFAULT NULL, + chromosome char(2) DEFAULT NULL, + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + PRIMARY KEY (name,start) + ); + """, None self.query(statement) - os.remove(self.__config.TempFile) - #load_Update + statement = """ + CREATE TABLE Exons ( + transcript varchar(20) NOT NULL DEFAULT '', + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + cds_start int(11) DEFAULT NULL, + cds_stop int(11) DEFAULT NULL, + protein varchar(20) DEFAULT NULL, + PRIMARY KEY (transcript,start) + ); + """, None + self.query(statement) + #ncbi_create_temporary_table - def count_Updates(self) : + def ncbi_drop_temporary_tables(self): """ - Count the number of updates. This function will only work if it - is preceeded by the load_Update() function. Otherwise the map_temp - table may not exist. This function can not be used after the - merge_Update() function has been executed, since it drops the - map_temp table. + Drop temporary tables used for importing NCBI mapping information. - @return: The number of entries in the table of updated mapping info - @rtype: integer + SQL tables from dbNames: + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. 
""" - statement = """ - SELECT COUNT(*) - FROM map_temp; + DROP TABLE IF EXISTS Genes, Transcripts, Exons; """, None + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.query(statement) + #ncbi_drop_temporary_tables - return int(self.query(statement)[0][0]) - #count_Updates - - def backup_cdsUpdates(self) : + def ncbi_import_gene(self, id, name): """ - Copy all mapping entries where there was an update, but no - increment in the version number, to a backup table. Note that - we use acc, version, txStart as the primary key because members - of a gene family are mapped multiple times. - - SQL tables from dbNames (altered): - - map_cdsBackup_temp ; Created and filled with entries that - were updated without an increment of the - version number. + Import a (gene id, gene name) pair in a temporary table. SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Freshly downloaded mapping info. + - Genes ; Gene names from NCBI. """ - statement = """ - CREATE TABLE map_cdsBackup_temp - SELECT map.* - FROM map, map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - AND ( - map.cdsStart != map_temp.cdsStart - OR map.cdsEnd != map_temp.cdsEnd - ); - """, None + INSERT IGNORE INTO Genes (id, name) VALUES (%s, %s); + """, (id, name) self.query(statement) - #backup_cdsUpdates + #ncbi_import_gene - def count_cdsUpdates(self) : + def ncbi_import_transcript(self, name, gene, chromosome, start, stop, + orientation): """ - Count the number of mapping entries that have changed without an - increment in the version number. This function can only be called - after backup_cdsUpdates() has been executed and before - merge_cdsUpdates has been executed. + Import a transcript mapping in a temporary table. SQL tables from dbNames: - - map_cdsBackup_temp ; Entries that wre updated without an - increment of the version number. 
- - @return: The number of mapping entries that have changed without an - increment in the version number - @rtype: integer + - Transcripts ; Transcript mappings from NCBI. """ - statement = """ - SELECT COUNT(*) - FROM map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Transcripts + (name, gene_id, chromosome, start, stop, orientation) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (name, gene, chromosome, start, stop, orientation) - return int(self.query(statement)[0][0]) - #count_cdsUpdates + self.query(statement) + #ncbi_import_transcript - def merge_cdsUpdates(self) : + def ncbi_import_exon(self, transcript, start, stop, cds_start, cds_stop, + protein): """ - Merge the mapping entries that have changed without an increment in - the version number with a table that contains backups of these - entries. + Import an exon mapping in a temporary table. - SQL tables from dbNames (altered): - - map_cdsBackup ; Extended with the entries in - map_cdsBackup_temp. - - map_cdsBackup_temp ; Dropped. + SQL tables from dbNames: + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. - statement = """ - INSERT INTO map_cdsBackup - SELECT * - FROM map_cdsBackup_temp; - """, None - self.query(statement) - statement = """ - DROP TABLE map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Exons + (transcript, start, stop, cds_start, cds_stop, protein) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (transcript, start, stop, cds_start, cds_stop, protein) self.query(statement) - #merge_cdsUpdates + #ncbi_import_exon - def merge_Update(self) : + def ncbi_aggregate_mapping(self): """ - Merge the new mapping data with the old ones. + Aggregate gene, transcript and exon mapping information from the NCBI + into one table. - SQL tables from dbNames (altered): - - map_new ; Created and filled with the merge of map_temp and map. - Dropped after use. - - map_temp ; Merged with map to form map_new. Dropped after use. 
- - map ; Overwritten with the merged info in map_new. + @note: Default MySQL value for group_concat_max_len is 1024, meaning + that the GROUP_CONCAT aggregate function returns values of at most + 1024 bytes long. This is not enough (currently we need around 3000 + bytes), so we explicitly set this to a higher value. + @note: We use MAX(E.protein) since MySQL does not have an ANY() + aggregator. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. - statement = """ - CREATE TABLE map_new - SELECT * - FROM map_temp - UNION - SELECT * - FROM map - WHERE NOT EXISTS ( - SELECT * - FROM map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - ); + SET group_concat_max_len = 32768; """, None self.query(statement) + statement = """ - DROP TABLE map; + DROP TABLE IF EXISTS MappingTemp; """, None - self.query(statement) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.query(statement) + statement = """ - CREATE TABLE map - SELECT * - FROM map_new; + CREATE TABLE MappingTemp LIKE Mapping; """, None self.query(statement) + statement = """ - DROP TABLE map_new; + INSERT INTO MappingTemp + SELECT + G.name as gene, + SUBSTRING(T.name FROM 1 FOR LOCATE('.', T.name) - 1) as transcript, + SUBSTRING(T.name FROM LOCATE('.', T.name) + 1) as version, + CONCAT('chr', T.chromosome) as chromosome, + T.orientation as orientation, + MIN(T.start) as start, + MAX(T.stop) as stop, + MAX(E.cds_start) as cds_start, + MAX(E.cds_stop) as cds_stop, + GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts, + GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops, + MAX(E.protein) as protein, + 'NCBI' as source + FROM Transcripts as T, Genes as G, Exons as E + WHERE T.gene_id = G.id AND T.name = E.transcript + GROUP BY T.name; """, None self.query(statement) - statement = """ - DROP TABLE map_temp; - """, None + #ncbi_aggregate_mapping +#Mapping - 
self.query(statement) - #merge_Update -#Update class Cache(Db) : """ @@ -885,6 +739,7 @@ class Cache(Db) : statement = """ INSERT INTO GBInfo + (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """, (accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, orientation, url) @@ -895,9 +750,9 @@ class Cache(Db) : def insertLRG(self, accNo, fileHash, url): """ Insert information about a LRG record in the internal database. - + See insertGB() for more information. - + @arg accNo: The name associated with this record @type accNo: string @arg fileHash: The hash of the content of the record @@ -908,6 +763,7 @@ class Cache(Db) : statement = """ INSERT INTO GBInfo + (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """, (accNo, None, fileHash, None, None, None, None, url) @@ -1028,6 +884,30 @@ class Cache(Db) : return None #getGBFromGI + def getGBSince(self, created_since): + """ + Get all accession number entries with creation date {created_since} + or later. + + SQL tables from internalDb: + - GBInfo ; Information about cached and uploaded GenBank files. + + @arg created_since: Only entries with later creation dates are returned. + @type created_since: datetime.datetime + + @return: The matching GBInfo entries, one row per accession number + @rtype: list + """ + statement = """ + SELECT AccNo, GI, hash, ChrAccVer, ChrStart, + ChrStop, orientation, url, created + FROM GBInfo + WHERE created >= %s; + """, created_since + + return self.query(statement) + #getGBSince + def getLoc(self, accNo) : """ Get the slicing information of an accession number, typically this @@ -1122,7 +1002,7 @@ class Cache(Db) : @arg accNo: The accession number @type accNo: string - + @return: GI number @rtype: string """ @@ -1143,13 +1023,13 @@ class Cache(Db) : """ Gets the protein accession number for the given mRNA accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. 
- + @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string - + @return: The protein accession number @rtype: string """ @@ -1169,13 +1049,13 @@ class Cache(Db) : def getmrnaAcc(self, protAcc) : """ Gets the mRNA accession number for a given protein accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg protAcc: The protein ID @type protAcc: string - + @return: The mRNA accession number @rtype: string """ @@ -1197,14 +1077,14 @@ class Cache(Db) : """ Inserts the given mRNA and protein accession numbers into the Link table. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg protAcc: The protein ID @type protAcc: string @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string + @type mrnaAcc: string """ statement = """ @@ -1299,7 +1179,7 @@ class Batch(Db) : #entriesLeftForJob - def addJob(self, outputFilter, email, fromHost, jobType, Arg1) : + def addJob(self, outputFilter, email, fromHost, jobType, Arg1): """ Add a job and give it a unique ID. @@ -1312,14 +1192,13 @@ class Batch(Db) : @type email: string @arg jobType: The type of batch job @type jobType: string + @arg Arg1: Possible argument. + @type Arg1: string @return: A job ID @rtype: integer """ - - M = Misc.Misc() - jobID = M.ID() - del M + jobID = util.generate_id() statement = """ INSERT INTO BatchJob VALUES (%s, %s, %s, %s, %s, %s) @@ -1519,10 +1398,3 @@ class Batch(Db) : return inputl, flags #getFromQueue #Batch - -# -# Unit test. -# -if __name__ == "__main__" : - pass -#if diff --git a/src/Modules/File.py b/mutalyzer/File.py similarity index 89% rename from src/Modules/File.py rename to mutalyzer/File.py index 3f5d32ccc7de77a3264507749502b47d66bb6396..a03a1b3af460e35b51057b900348edb8f70a0279 100644 --- a/src/Modules/File.py +++ b/mutalyzer/File.py @@ -11,7 +11,6 @@ Module for parsing CSV files and spreadsheets. 
@requires: xml.dom.minidom @requires: os @requires: types -@requires: Modules.Misc """ # Public classes: # - File ; Parse CSV files and spreadsheets. @@ -23,10 +22,11 @@ import xlrd # open_workbook() import zipfile # ZipFile() import xml.dom.minidom # parseString() import os # remove() +import tempfile import types # UnicodeType from cStringIO import StringIO -from Modules import Misc +from mutalyzer import util class File() : """ @@ -60,7 +60,7 @@ class File() : Private variables (altered): - __config ; Initialised with configuration variables. - __output ; Set to the Output object. - + @arg config: Configuration variables @type config: class instance @arg output: Output object @@ -85,23 +85,18 @@ class File() : @return: unknown; the output of func(). @rtype: ? """ - - # Generate an unique filename in the tempDir directory. - MiscInstance = Misc.Misc() - fileName = self.__config.tempDir + '/' + str(MiscInstance.ID()) - del MiscInstance + write_handle, filename = tempfile.mkstemp(text=True) # Dump the content of the stream pointed to by handle into the file. handle.seek(0) - writeHandle = open(fileName, "w") - writeHandle.write(handle.read()) - writeHandle.close() + os.write(write_handle, handle.read()) + os.close(write_handle) # Open the file with func(). - ret = func(fileName) + ret = func(filename) # Apperantly apache will remove this file even when opened by the # function *func - os.remove(fileName) + os.remove(filename) return ret #__tempFileWrapper @@ -183,7 +178,7 @@ class File() : try : # Todo: delimiters in config file dialect = csv.Sniffer().sniff(buf, delimiters="\t ;|,") - except csv.Error, e : + except csv.Error: #self.__output.addMessage(__file__, 4, "EBPARSE", e) #return None pass @@ -197,7 +192,7 @@ class File() : reader = csv.reader(new_handle, dialect) ret = [] - for i in reader : + for i in reader: ret.append(i) new_handle.close() @@ -277,7 +272,7 @@ class File() : Private variables: - __config ; The header configuration variable. 
- + @todo: Add more new style old style logic @todo: if not inputl: try to make something out of it @@ -285,13 +280,15 @@ class File() : @type job: list @return: A sanitised list of lists (without a header or empty lines) - @rtype: list + and the number of columns. + @rtype: tuple(list, int) """ + columns = 1 + #store original line numbers line 1 = job[0] jobl = [(l+1, row) for l, row in enumerate(job)] #TODO: Add more new style old style logic - if jobl[0][1] == self.__config.header : #Old style NameCheckBatch job ret = [] notthree = [] @@ -343,43 +340,50 @@ class File() : #if else: #No Header, possibly a new BatchType - if len(jobl) == 0: return - #collect all lines with data in fields other than the first - errlist = [line for line, row in jobl if any(row[1:])] + if len(jobl) == 0: + return (None, columns) + # Determine number of columns from first line. + columns = len(jobl[0][1]) + # Collect all lines with a different number of columns + errlist = [line for line, row in jobl + if any(row) and len(row) != columns] if any(errlist): self.__output.addMessage(__file__, 3, "EBPARSE", - "New Type Batch jobs (see help) should contain one " - "entry per line, please check %i line(s): %s" % + "New Type Batch jobs (see help) should contain the same " + "number of columns on every line, please check %i " + "line(s): %s" % (len(errlist), makeList(errlist))) ret = [] for line, job in jobl: if not any(job): #Empty line - ret.append("~!") + ret.extend(['~!' for _ in range(columns)]) continue if line in errlist: - inputl = "~!InputFields: " #Dirty Escape BatchEntries + #Dirty Escape BatchEntries + ret.append("~!InputFields: " + '|'.join(job)) + ret.extend(['~!' 
for _ in range(columns - 1)]) else: - inputl = "" - ret.append(inputl+"|".join(job)) + ret.extend(job) #else - if not ret: return None #prevent divide by zero + if not ret: + #prevent divide by zero + return (None, columns) err = float(len(errlist))/len(ret) if err == 0: - return ret + return (ret, columns) elif err < self.__config.threshold: #allow a 5 (default) percent threshold for errors in batchfiles self.__output.addMessage(__file__, 3, "EBPARSE", "There were errors in your batch entry file, they are " - "omitted and your batch is started.") - self.__output.addMessage(__file__, 3, "EBPARSE", - "Please check the batch input file help at the top of " - "this page for additional information.") - return ret + "omitted and your batch is started. Please check the " + "batch input file help at the top of this page for " + "additional information.") + return (ret, columns) else: - return None + return (None, columns) #__checkBatchFormat def getMimeType(self, handle) : @@ -436,7 +440,7 @@ class File() : return self.__parseOdsFile(handle) return None - #parseFile + #parseFileRaw def parseBatchFile(self, handle) : """ @@ -446,15 +450,15 @@ class File() : @arg handle: A handle to a stream @type handle: stream - @return: A sanitised list of lists (without a header or empty lines), - or None if an error occured - @rtype: list + @return: A sanitised list of lists (without a header or empty lines) + (or None if an error occured) and the number of columns. 
+ @rtype: tuple(list, int) """ job = self.parseFileRaw(handle) - if job : + if job: return self.__checkBatchFormat(job) - return None + return (None, 1) #parseBatchFile #File diff --git a/src/Modules/GenRecord.py b/mutalyzer/GenRecord.py similarity index 72% rename from src/Modules/GenRecord.py rename to mutalyzer/GenRecord.py index 4dc7ff325c41182549da39bf5f56ea70242f590d..3c96487220f582f3f076ed45151dda011194faf4 100644 --- a/src/Modules/GenRecord.py +++ b/mutalyzer/GenRecord.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - """ Module to convert a GenBank record to a nested dictionary consisting of a list of genes, which itself consists of a list of loci. This structure @@ -18,9 +16,11 @@ search for them each time. # - GenRecord ; Convert a GenBank record to a nested dictionary. -import Crossmap import Bio -import Db + +from mutalyzer import util +from mutalyzer import Crossmap +from mutalyzer import Db class PList(object) : @@ -81,12 +81,13 @@ class Locus(object) : - exon ; A position list object. - txTable ; The translation table. - CM ; A Crossmap object. - + @arg name: identifier of the locus @type name: string """ self.name = name + self.current = False self.mRNA = None self.CDS = None self.location = [] @@ -109,21 +110,37 @@ class Locus(object) : self.proteinProduct = None #__init__ - def addToDescription(self, rawVariant) : + def cancelDescription(self): + """ + Set the description on this locus to 'unknown'. + + This can be used if at some point we give up creating a sensible + description on this locus. It also makes sure future additions to + the description are ignored and it keeps the 'unknown' value. + + @note: This depends on the check for the unknown value in the + addToDescription method. This is not a beautiful solution. + """ + self.description = '?' + #cancelDescription + + def addToDescription(self, rawVariant): """ Expands the DNA description with a new raw variant. 
- + @arg rawVariant: description of a single mutation @type rawVariant: string """ - - if self.description : - self.description = "%s;%s" % (self.description, rawVariant) - else : + if self.description: + # Don't change anything if we already have an unknown value. + if self.description != '?': + self.description = "%s;%s" % (self.description, rawVariant) + else: self.description = rawVariant #addToDescription #Locus + class Gene(object) : """ A Gene object, to store a list of Locus objects and the orientation of @@ -134,7 +151,7 @@ class Gene(object) : Public variables: - orientation; The orientation of the gene: 1 = forward, -1 = reverse. - - TRANSCRIPTSlist; A list of Locus objects. + - transcriptslist; A list of Locus objects. """ def __init__(self, name) : @@ -149,7 +166,7 @@ class Gene(object) : - longName ; Private variables (altered): - __locusTag ; - + @arg name: gene name @type name: string """ @@ -165,7 +182,7 @@ class Gene(object) : def newLocusTag(self) : """ Generates a new Locus tag. - + @return: Locus tag @rtype: integer (3 digits, if < 100 preceeded with 0's) """ @@ -178,10 +195,10 @@ class Gene(object) : def findLocus(self, name) : """ Find a transcript, given its name. - + @arg name: transcript variant number @type name: string - + @return: transcript @rtype: object """ @@ -195,7 +212,7 @@ class Gene(object) : def listLoci(self) : """ Provides a list of transcript variant numbers - + @return: list of transcript variant numbers @rtype: list """ @@ -209,10 +226,10 @@ class Gene(object) : def findLink(self, protAcc) : """ Look in the list of transcripts for a given protein accession number. - + @arg protAcc: protein accession number @type protAcc: string - + @return: transcript @rtype: object """ @@ -279,10 +296,10 @@ class Record(object) : def findGene(self, name) : """ Returns a Gene object, given its name. 
- + @arg name: Gene name @type name: string - + @return: Gene object @rtype: object """ @@ -296,10 +313,10 @@ class Record(object) : def listGenes(self) : """ List the names of all genes found in this record. - + @return: Genes list @rtype: list - + """ ret = [] @@ -311,7 +328,7 @@ class Record(object) : def addToDescription(self, rawVariant) : """ Expands the DNA description with a new raw variant. - + @arg rawVariant: description of a single mutation @type rawVariant: string """ @@ -325,11 +342,11 @@ class Record(object) : def toChromPos(self, i) : """ Converts a g. position (relative to the start of the record) to a - chromosomal g. position - + chromosomal g. position + @arg i: g. position (relative to the start of the record) @type i: integer - + @return: chromosomal g. position @rtype: integer """ @@ -365,10 +382,10 @@ class GenRecord() : def __init__(self, output, config) : """ Initialise the class. - + Public variable: - record ; A record object - + @arg output: an output object @type output: object @arg config: a config object @@ -383,13 +400,13 @@ class GenRecord() : def __checkExonList(self, exonList, CDSpos) : """ @todo document me - + @arg exonList: list of splice sites @type exonList: list (object) @arg CDSpos: location of the CDS @type CDSpos: object - - @return: + + @return: @rtype: boolean """ @@ -413,12 +430,12 @@ class GenRecord() : return True return False #__checkExonList - + def __constructCDS(self, mRNA, CDSpos) : """ - Construct a list of coordinates that contains CDS start and stop and + Construct a list of coordinates that contains CDS start and stop and the internal splice sites. 
- + @arg mRNA: mRNA positions/coordinates list @type mRNA: list (integer) @arg CDSpos: coding DNA positions/coordinates @@ -444,22 +461,25 @@ class GenRecord() : return ret #__constructCDS - def __maybeInvert(self, gene, string) : + def __maybeInvert(self, gene, string, string_reverse=None) : """ Return the reverse-complement of a DNA sequence if the gene is in the reverse orientation. - - @arg gene: Gene + + @arg gene: Gene @type gene: object @arg string: DNA sequence @type string: string - + @kwarg string_reverse: DNA sequence to use (if not None) for the + reverse complement. + @return: reverse-complement (if applicable), otherwise return the original. @rtype: string """ - - if gene.orientation == -1 : + if gene.orientation == -1: + if string_reverse: + string = string_reverse return Bio.Seq.reverse_complement(string) return string #__maybeInvert @@ -468,7 +488,7 @@ class GenRecord() : """ Check if the record in self.record is compatible with mutalyzer. Update the mRNA PList with the exon and CDS data. - + @todo: This function should really check the record for minimal requirements """ @@ -584,10 +604,25 @@ class GenRecord() : #for #checkRecord - def name(self, start_g, stop_g, varType, arg1, arg2, roll) : + def current_transcript(self): + """ + Return the current transcript. + + @return: Current transcript if there is one, None otherwise. + @rtype: GenRecord.Locus + """ + for i in self.record.geneList: + for j in i.transcriptList: + if j.current: + return j + return None + #current_transcript + + def name(self, start_g, stop_g, varType, arg1, arg2, roll, arg1_reverse=None, + start_fuzzy=False, stop_fuzzy=False): """ Generate variant descriptions for all genes, transcripts, etc. - + @arg start_g: start position @type start_g: integer @arg stop_g: stop position @@ -600,9 +635,13 @@ class GenRecord() : @type arg2: string @arg roll: ??? 
@type roll: tuple (integer, integer) - + @kwarg arg1_reverse: argument 1 to be used on reverse strand + @type arg1_reverse: string + @kwarg start_fuzzy: Indicates if start position of variant is fuzzy. + @type start_fuzzy: bool + @kwarg stop_fuzzy: Indicates if stop position of variant is fuzzy. + @type stop_fuzzy: bool """ - forwardStart = start_g forwardStop = stop_g reverseStart = stop_g @@ -616,23 +655,64 @@ class GenRecord() : if varType != "subst" : if forwardStart != forwardStop : - self.record.addToDescription("%s_%s%s%s" % (forwardStart, - forwardStop, varType, arg1)) - self.record.addToChromDescription("%s_%s%s%s" % ( - self.record.toChromPos(forwardStart), - self.record.toChromPos(forwardStop), varType, arg1)) + # Todo: Fuzzy offsets to genomic positions (see bug #38). + # + # The genomic positioning is problematic. We would like to + # have it in brackets (as fuzzy positions), like the above + # g.(34299_23232)del example. + # + # Now consider a variant c.a-?_b+18del where only the offset + # before the exon is unknown but the offset after the exon is + # exact. Now a genomic description like g.(34299)_23232del + # comes to mind, however, this notation is not allowed by the + # HGVS grammar. + # + # I think all we can do is to treat both positions as fuzzy in + # the genomic description, even if only one of them really is. + # + # Peter thinks the HGVS grammar should at some point be + # updated to allow the brackets around individual locations. 
+ if start_fuzzy or stop_fuzzy: + self.record.addToDescription("(%s_%s)%s%s" % ( + forwardStart, forwardStop, varType, arg1)) + self.record.addToChromDescription("(%s_%s)%s%s" % ( + self.record.toChromPos(forwardStart), + self.record.toChromPos(forwardStop), varType, arg1)) + else: + self.record.addToDescription("%s_%s%s%s" % ( + forwardStart, forwardStop, varType, arg1)) + self.record.addToChromDescription("%s_%s%s%s" % ( + self.record.toChromPos(forwardStart), + self.record.toChromPos(forwardStop), varType, arg1)) #if else : - self.record.addToDescription("%s%s%s" % (forwardStart, varType, - arg1)) - self.record.addToChromDescription("%s%s%s" % ( - self.record.toChromPos(forwardStart), varType, arg1)) + if start_fuzzy or stop_fuzzy: + # Todo: Current HGVS does not allow for () around single + # positions, only around ranges (see above and #38). + self.record.addToDescription("(%s)%s%s" % ( + forwardStart, varType, arg1)) + self.record.addToChromDescription("(%s)%s%s" % ( + self.record.toChromPos(forwardStart), varType, arg1)) + else: + self.record.addToDescription("%s%s%s" % ( + forwardStart, varType, arg1)) + self.record.addToChromDescription("%s%s%s" % ( + self.record.toChromPos(forwardStart), varType, arg1)) #else #if else : - self.record.addToDescription("%s%c>%c" % (forwardStart, arg1, arg2)) - self.record.addToChromDescription("%s%c>%c" % ( - self.record.toChromPos(forwardStart), arg1, arg2)) + if start_fuzzy or stop_fuzzy: + # Todo: Current HGVS does not allow for () around single + # positions, only around ranges (see above and #38). 
+ self.record.addToDescription("(%s)%c>%c" % ( + forwardStart, arg1, arg2)) + self.record.addToChromDescription("(%s)%c>%c" % ( + self.record.toChromPos(forwardStart), arg1, arg2)) + else: + self.record.addToDescription("%s%c>%c" % ( + forwardStart, arg1, arg2)) + self.record.addToChromDescription("%s%c>%c" % ( + self.record.toChromPos(forwardStart), arg1, arg2)) for i in self.record.geneList : for j in i.transcriptList : @@ -644,35 +724,66 @@ class GenRecord() : orientedStop = reverseStop #if - # Check whether the variant hits CDS start. + # Turn off translation to protein if we hit splice sites. + # For the current transcript, this is handled with more + # care in variantchecker.py. + if not j.current and \ + util.over_splice_site(orientedStart, orientedStop, + j.CM.RNA): + j.translate = False + + # And check whether the variant hits CDS start. if j.molType == 'c' and forwardStop >= j.CM.x2g(1, 0) \ and forwardStart <= j.CM.x2g(3, 0) : self.__output.addMessage(__file__, 2, "WSTART", "Mutation in start codon of gene %s transcript " \ "%s." % (i.name, j.name)) + if not j.current: + j.translate = False # FIXME Check whether the variant hits a splice site. if varType != "subst" : if orientedStart != orientedStop : - j.addToDescription("%s_%s%s%s" % ( - j.CM.g2c(orientedStart), j.CM.g2c(orientedStop), - varType, self.__maybeInvert(i, arg1))) - self.checkIntron(i, j, orientedStart) - self.checkIntron(i, j, orientedStop) + if (start_fuzzy or stop_fuzzy) and not j.current: + # Don't generate descriptions on transcripts + # other than the current in the case of fuzzy + # positions.
+ j.cancelDescription() + else: + j.addToDescription("%s_%s%s%s" % ( + j.CM.g2c(orientedStart, start_fuzzy), + j.CM.g2c(orientedStop, stop_fuzzy), + varType, self.__maybeInvert(i, arg1, arg1_reverse))) + self.checkIntron(i, j, orientedStart) + self.checkIntron(i, j, orientedStop) #if else : - j.addToDescription("%s%s%s" % ( - j.CM.g2c(orientedStart), varType, - self.__maybeInvert(i, arg1))) - self.checkIntron(i, j, orientedStart) + if start_fuzzy and not j.current: + # Don't generate descriptions on transcripts + # other than the current in the case of fuzzy + # positions. + j.cancelDescription() + else: + j.addToDescription("%s%s%s" % ( + j.CM.g2c(orientedStart, start_fuzzy), + varType, + self.__maybeInvert(i, arg1, arg1_reverse))) + self.checkIntron(i, j, orientedStart) #else #if else : - j.addToDescription("%s%c>%c" % (j.CM.g2c(orientedStart), - self.__maybeInvert(i, arg1), - self.__maybeInvert(i, arg2))) - self.checkIntron(i, j, orientedStart) + if start_fuzzy and not j.current: + # Don't generate descriptions on transcripts + # other than the current in the case of fuzzy + # positions. + j.cancelDescription() + else: + j.addToDescription("%s%c>%c" % ( + j.CM.g2c(orientedStart, start_fuzzy), + self.__maybeInvert(i, arg1, arg1_reverse), + self.__maybeInvert(i, arg2))) + self.checkIntron(i, j, orientedStart) #else #if #for @@ -682,14 +793,14 @@ class GenRecord() : def checkIntron(self, gene, transcript, position) : """ Checks if a position is on or near a splice site - + @arg gene: Gene @type gene: object @arg transcript: transcript @type transcript: object @arg position: g. position @type position: integer - + @return: @todo: Also check a range properly. 
""" @@ -711,8 +822,3 @@ class GenRecord() : #if #checkIntron #GenRecord - -if __name__ == "__main__" : - R = GenRecord() - del R -#if diff --git a/src/Modules/Retriever.py b/mutalyzer/Retriever.py similarity index 87% rename from src/Modules/Retriever.py rename to mutalyzer/Retriever.py index 1cffa13d422982884d5c31c4b41c314613e3fcf9..d77f7fa63b00c933ff71d36a520e77822b32fced 100644 --- a/src/Modules/Retriever.py +++ b/mutalyzer/Retriever.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - """ Module for retrieving files from either the cache or the NCBI. @@ -7,23 +5,9 @@ A hash of every retrieved file is stored in the internal database. If a requested file is not found, but its hash is, we use additional information to re-download the file. -@requires: os -@requires: bz2 -@requires: hashlib -@requires: urllib2 -@requires: StringIO -@requires: ftplib -@requires: Bio.SeqIO -@requires: Bio.Entrez -@requires: Bio.Seq.UnknownSeq -@requires: Modules.Misc -@requires: Modules.LRGparser -@requires: Modules.GBparser -@requires: xml.dom.DOMException -@requires: xml.dom.minidom +Public classes: +- Retriever ; Retrieve a record from either the cache or the NCBI. """ -# Public classes: -# - Retriever ; Retrieve a record from either the cache or the NCBI. 
import os # path.isfile(), link() path.isdir(), path.mkdir(), @@ -32,16 +16,16 @@ import bz2 # BZ2Compressor(), BZ2File() import hashlib # md5(), update(), hexdigest() import urllib2 # urlopen() import StringIO # StringIO() -import ftplib # FTP(), all_errors from Bio import SeqIO # read() from Bio import Entrez # efetch(), read(), esearch(), esummary() from Bio.Seq import UnknownSeq +from xml.dom import DOMException, minidom +from xml.parsers import expat + +from mutalyzer import util +from mutalyzer.parsers import lrg +from mutalyzer.parsers import genbank -from Modules import Misc -from Modules import LRGparser -from Modules import GBparser -from xml.dom import DOMException -import xml.dom.minidom class Retriever(object) : """ @@ -55,7 +39,7 @@ class Retriever(object) : Special methods: - __init__(config, output, database) ; Use variables from the configuration file to initialise the class private variables. - + Private methods: @@ -90,7 +74,7 @@ class Retriever(object) : Inherited variables from Db.Output.Config: - email ; The email address which we give to the NCBI. - cache ; The directory where the records are stored. - + @arg config: @type config: @arg output: @@ -114,7 +98,7 @@ class Retriever(object) : @arg folder: Name of a directory @type folder: string - + @return: The size of the directory @rtype: integer """ @@ -147,14 +131,15 @@ class Retriever(object) : cachelist = [] for (path, dirs, files) in os.walk(self._config.cache) : for filename in files : + filepath = os.path.join(path, filename) cachelist.append( - (os.stat(os.path.join(path, filename)).st_atime, filename)) + (os.stat(filepath).st_atime, filepath)) cachelist.sort() # Now start removing pairs of files until the size of the folder is # small enough (or until the list is exhausted). 
for i in range(0, len(cachelist)) : - os.remove(os.path.join(path, cachelist[i][1])) + os.remove(cachelist[i][1]) if self._foldersize(self._config.cache) < self._config.cachesize: break; #for @@ -230,9 +215,7 @@ class Retriever(object) : @rtype: string """ - M = Misc.Misc() - UD = M.ID() - del M + UD = util.generate_id() return "UD_" + str(UD) #_newUD @@ -240,7 +223,7 @@ class Retriever(object) : #TODO documentation """ @todo: documentation - + @arg raw_data: @type raw_data: @arg name: @@ -268,62 +251,85 @@ class Retriever(object) : #_updateDBmd5 - def snpConvert(self, rsId, O) : + def snpConvert(self, rs_id) : """ - Search an rsId in dbSNP and return all annotated HGVS notations of + Search for an rsId in dbSNP and return all annotated HGVS notations of it. - @arg rsId: The id of the SNP. + @arg rsId: The rsId of the SNP (example: 'rs9919552'). @type rsId: string - @arg O: The Output object. - @type O: Modules.Output.Output - @return: A list of HGVS notations - @rtype: list + @return: A list of HGVS notations. + @rtype: list(string) """ - # A simple input check. - ID = rsId[2:] - if rsId[:2] != "rs" or not ID.isdigit() : - self._output.addMessage(__file__, 4, "ESNPID", "This is not a" \ - " valid dbSNP id.") + id = rs_id[2:] + if rs_id[:2] != 'rs' or not id.isdigit(): + self._output.addMessage(__file__, 4, 'ESNPID', + 'This is not a valid dbSNP id.') + return [] # Query dbSNP for the SNP. - response = Entrez.efetch(db = "SNP", id = ID, rettype = "flt", - retmode = "xml") - - # Parse the output. - doc = xml.dom.minidom.parseString(response.read()) - - set = doc.getElementsByTagName('ExchangeSet') - - if len(set) < 1: - # Not even the expected root element is present. - O.addMessage(__file__, 4, 'EENTREZ', - 'Unkown dbSNP error. Got no result from dbSNP.') - return + try: + response = Entrez.efetch(db='SNP', id=id, rettype='flt', + retmode='xml') + except IOError as e: + # Could not connect to dbSNP.
+ self._output.addMessage(__file__, 4, 'EENTREZ', + 'Error connecting to dbSNP.') + self._output.addMessage(__file__, -1, 'INFO', + 'IOError: %s' % str(e)) + return [] + + response_text = response.read() - rs = set[0].getElementsByTagName('Rs') + try: + # Parse the output. + doc = minidom.parseString(response_text) + exchange_set = doc.getElementsByTagName('ExchangeSet') + rs = exchange_set[0].getElementsByTagName('Rs') + except expat.ExpatError as e: + # Could not parse XML. + self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \ + 'error. Error parsing result XML.') + self._output.addMessage(__file__, -1, 'INFO', + 'ExpatError: %s' % str(e)) + self._output.addMessage(__file__, -1, 'INFO', + 'Result from dbSNP: %s' % response_text) + return [] + except IndexError: + # The expected root element is not present. + self._output.addMessage(__file__, 4, 'EENTREZ', 'Unkown dbSNP ' \ + 'error. Result XML was not as expected.') + self._output.addMessage(__file__, -1, 'INFO', + 'Result from dbSNP: %s' % response_text) + return [] if len(rs) < 1: # No Rs result element. text = [] - for node in set[0].childNodes: + for node in exchange_set[0].childNodes: if node.nodeType == node.TEXT_NODE: text.append(node.data) message = ''.join(text) if message.find('cannot get document summary') != -1: # Entrez does not have this rs ID. - O.addMessage(__file__, 4, 'EENTREZ', - 'ID rs%s could be found in dbSNP.' % ID) + self._output.addMessage(__file__, 4, 'EENTREZ', + 'ID rs%s could be found in dbSNP.' % id) else: # Something else was wrong (print {message} to see more). - O.addMessage(__file__, 4, 'EENTREZ', - 'Unkown dbSNP error. Got no result from dbSNP.') - return - + self._output.addMessage(__file__, 4, 'EENTREZ', + 'Unkown dbSNP error. 
Got no result ' \ + 'from dbSNP.') + self._output.addMessage(__file__, -1, 'INFO', + 'Message from dbSNP: %s' % message) + return [] + + snps = [] for i in rs[0].getElementsByTagName('hgvs'): - self._output.addOutput('snp', i.lastChild.data.encode('utf8')) + snps.append(i.lastChild.data.encode('utf8')) + + return snps #snpConvert #Retriever @@ -373,7 +379,7 @@ class GenBankRetriever(Retriever): if raw_data == "\nNothing has been found\n" : self._output.addMessage(__file__, 4, "ENORECORD", "The record could not be retrieved.") - return None, None + return None #if fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a @@ -385,7 +391,7 @@ class GenBankRetriever(Retriever): self._output.addMessage(__file__, 4, "ENOPARSE", "The file could not be parsed.") fakehandle.close() - return None, None + return None #except if type(record.seq) == UnknownSeq : @@ -393,7 +399,7 @@ class GenBankRetriever(Retriever): self._output.addMessage(__file__, 4, "ENOSEQ", "This record contains no sequence. Chromosomal or contig " \ "records should be uploaded with the GenBank uploader.") - return None, None + return None #if outfile = filename @@ -436,7 +442,10 @@ class GenBankRetriever(Retriever): return None #if else : # Something is present in the file. - name, GI = self.write(raw_data, name, 1) + result = self.write(raw_data, name, 1) + if not result: + return None + name, GI = result if name : # Processing went okay. return self._updateDBmd5(raw_data, name, GI) else : # Parse error in the GenBank file. @@ -470,7 +479,7 @@ class GenBankRetriever(Retriever): - 1 ; Forward - 2 ; Reverse complement @type orientation: integer - + @return: An UD number @rtype: string """ @@ -606,7 +615,7 @@ class GenBankRetriever(Retriever): @arg url: Location of a GenBank record @type url: string - + @return: UD or None @rtype: string """ @@ -653,22 +662,23 @@ class GenBankRetriever(Retriever): @arg raw_data: A GenBank record @type raw_data: string - - @return: + + @return: @rtype: string????? 
""" - md5sum = self._calcHash(raw_data) UD = self._database.getGBFromHash(md5sum) if not UD : UD = self._newUD() - self._database.insertGB(UD, None, md5sum, None, 0, 0, 0, None) + if self.write(raw_data, UD, 0): + self._database.insertGB(UD, None, md5sum, None, 0, 0, 0, None) + return UD #if - else : - if os.path.isfile(self._nametofile(UD)) : + else: + if os.path.isfile(self._nametofile(UD)): return UD - - return self.write(raw_data, UD, 0) and UD + else: + return self.write(raw_data, UD, 0) and UD #uploadrecord def loadrecord(self, identifier) : @@ -723,8 +733,8 @@ class GenBankRetriever(Retriever): return None # Now we have the file, so we can parse it. - GenBankParser = GBparser.GBparser() - return GenBankParser.createGBRecord(filename) + GenBankParser = genbank.GBparser() + return GenBankParser.create_record(filename) #loadrecord #GenBankRetriever @@ -741,14 +751,14 @@ class LRGRetriever(Retriever): #TODO documentation """ Initialize the class. - + @todo: documentation - @arg config: - @type config: - @arg output: - @type output: - @arg database: - @type database: + @arg config: + @type config: + @arg output: + @type output: + @arg database: + @type database: """ # Recall init of parent @@ -784,7 +794,7 @@ class LRGRetriever(Retriever): file_handle = bz2.BZ2File(filename, "r") #create GenRecord.Record from LRG file - record = LRGparser.createLrgRecord(file_handle.read()) + record = lrg.create_record(file_handle.read()) file_handle.close() return record @@ -812,7 +822,7 @@ class LRGRetriever(Retriever): try: return self.downloadrecord(url, name) - except urllib2.URLError, e: #Catch error: file not found + except urllib2.URLError: #Catch error: file not found pass try: # Try to get the file from the pending section @@ -820,7 +830,7 @@ class LRGRetriever(Retriever): self._output.addMessage(__file__, 2, "WPEND", "Warning: LRG file %s is a pending entry." 
% name) return filename - except urllib2.URLError, e: + except urllib2.URLError: self._output.addMessage(__file__, 4, "ERETR", "Could not retrieve %s." % name) return None #Explicit return in case of an Error @@ -906,7 +916,7 @@ class LRGRetriever(Retriever): # Dirty way to test if a file is valid, # Parse the file to see if it's a real LRG file. try: - LRGparser.createLrgRecord(raw_data) + lrg.create_record(raw_data) except DOMException: self._output.addMessage(__file__, 4, "ERECPARSE", "Could not parse file.") diff --git a/src/Modules/Scheduler.py b/mutalyzer/Scheduler.py similarity index 80% rename from src/Modules/Scheduler.py rename to mutalyzer/Scheduler.py index 8f97f01f5ee467e25833209784169d041b0a0135..cc10261d93cce18730550b3ae823ff1e177ffadf 100644 --- a/src/Modules/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -1,4 +1,3 @@ -#!/usr/bin/python """ Module used to add and manage the Batch Jobs. @@ -7,8 +6,6 @@ Module used to add and manage the Batch Jobs. @requires: smtplib @requires: email.mime.text.MIMEText @requires: Modules.Config -@requires: Modules.Output -@requires: Modules.Parser @requires: Modules.Mapper @requires: Mutalyzer """ @@ -18,63 +15,21 @@ Module used to add and manage the Batch Jobs. 
# - Batch Syntax Checker # - Batch Position Converter -import subprocess # subprocess.Popen import os # os.path.exists import smtplib # smtplib.STMP from email.mime.text import MIMEText # MIMEText -from Modules import Config # Config.Config -from Modules import Output # Output.Output -from Modules import Parser # Parser.Nomenclatureparser -from Modules import Mapper # Mapper.Converter -from Modules import Retriever # Retriever.Retriever +import mutalyzer +from mutalyzer import variantchecker +from mutalyzer.grammar import Grammar +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.mapping import Converter +from mutalyzer import Retriever # Retriever.Retriever -import Mutalyzer # Mutalyzer.process __all__ = ["Scheduler"] -def debug(f) : - """ - Decorator for functions called from within the daemon. Can be used - to debug errors that are hidden because the daemon's stdout and - errout filehandlers are closed. - - Usage: Place the decorator line above the function to investigate - - >>> @debug - >>> def process(self) : - >>> pass # function code - """ - #NOTE: All debug functions & methods should be moved to a DEBUG module - - def _tempf(*args) : - """ - The decorated function is replaced by this function. Which sets up - the filehandle to write to and print out additional debug info. - - The original function is called from within a try, except clause - which catches [AND DOES NOT RERAISE] an exception occuring in the - debugged function. - - This can result in odd behaviour, therefor the decorators should - be removed from any production version. 
- """ - - of = open("/tmp/daemon.out", "a+") - try : - of.write("\nFunction %s\n\targs: %s\n\t" % (`f`, `args`)) - ret = f(*args) # Actual function call - of.write("Returns: %s" % `ret`) - return ret - #try - except Exception, e : - import traceback - of.write("\nEXCEPTION:\n") - traceback.print_exc(file=of) - #except - return _tempf -#debug - class Scheduler() : """ @@ -97,7 +52,7 @@ class Scheduler() : """ Initialize the Scheduler, which requires a config object and a database connection. - + @todo: documentation @arg config: Config object @@ -108,16 +63,36 @@ class Scheduler() : self.__config = config self.__database = database + self.__run = True #__init__ + def stop(self): + """ + If the {process} method is running, the current job item will be + processed and {process} will return. + """ + self.__run = False + #stop + + def stopped(self): + """ + Test if the scheduler instance is stopped (i.e. the {stop} method is + called). + + @return: True if {stop} was called, False otherwise. + @rtype: bool + """ + return not self.__run + #stopped + def __sendMail(self, mailTo, url) : """ Send an e-mail containing an url to a batch job submitter. Private variables: - - __config ; The variables mailMessage, mailSubject and mailFrom + - __config ; The variables mailSubject and mailFrom are used. - + @todo: Handle Connection errors in a try, except clause @arg mailTo: The batch job submitter @@ -125,13 +100,32 @@ class Scheduler() : @arg url: The url containing the results @type url: string """ + if mutalyzer.is_test(): + return + + # Note: The above check with mutalyzer.is_test is bogus, since during + # a normal unit test, the batch checker process is not started in the + # environment of the unit tests. + # As sort of a hack, we therefore check for the patented address + # 'test@test.test', used in the unit tests. 
+ if mailTo == 'test@test.test': + return #TODO: Handle Connection errors in a try, except clause #Expected errors: socket.error - handle = open(self.__config.mailMessage) - message = MIMEText(handle.read() % url) - handle.close() + message = MIMEText("""Dear submitter, + +The batch operation you have submitted, has been processed successfully. + +Your results can be found here: +%s + +Thanks for using Mutalyzer. + + +With kind regards, +Mutalyzer batch checker.""" % url) message["Subject"] = self.__config.mailSubject message["From"] = self.__config.mailFrom @@ -158,7 +152,7 @@ class Scheduler() : @rtype: boolean """ - if not flags : + if not flags : return False if 'S' in flags : #This entry is going to be skipped #Add a usefull message to the Output object @@ -197,8 +191,8 @@ class Scheduler() : false positives. e.g. NM_002001.1(FCER1A_v001):c.1A>C should not be replaced. The double bracket notation is the MySQL escape char for a regular expression. - - @arg jobID: + + @arg jobID: @type jobID: @arg old: @type old: @@ -218,8 +212,8 @@ class Scheduler() : Alias for the database.skipBatchDb method. Skip all batch entries that match a certain selector. - - @arg jobID: + + @arg jobID: @type jobID: @arg flag: @type flag: @@ -245,7 +239,7 @@ class Scheduler() : @type O: object @arg jobID: ID of job, so that the altering is only done within one job - @type jobID: + @type jobID: """ flags = O.getOutput("BatchFlags") @@ -291,6 +285,9 @@ class Scheduler() : fashion. If all jobs are done the process checks if new jobs are added during the last processing round. + If during this process the {stop} method is called, the current + job item is completed and we return. + This method uses two database tables, BatchJob and BatchQueue. The jobList is an array of tuples with three elements @@ -305,7 +302,7 @@ class Scheduler() : the flags for the job. 
#Flags - A job can be flagged in two ways: + A job can be flagged in three ways: - A ; Altered - this means that the input is altered before execution. This could be the case if an entry uses an accession number without a version. @@ -317,14 +314,16 @@ class Scheduler() : case if the user made a mistake that could not be auto fixed and henceforth all occurences of the mistake will be skipped. + - C ; Continue - this means the input does not end the + current row, so no new row in the output should + be started. - A Flag consists of either an A or S followed by a digit, which + A Flag consists of either an A, S or C followed by a digit, which refers to the reason of alteration / skip. """ - jobList = self.__database.getJobs() - while jobList : + while jobList and self.__run: for i, jobType, arg1 in jobList : inputl, flags = self.__database.getFromQueue(i) if not (inputl is None) : @@ -343,6 +342,8 @@ class Scheduler() : print "Job %s finished, email %s file %s" % (i, eMail, i) self.__sendMail(eMail, "%sResults_%s.txt" % (fromHost, i)) #else + if not self.__run: + break #for jobList = self.__database.getJobs() #while @@ -364,8 +365,8 @@ class Scheduler() : @type flags: """ - C = Config.Config() - O = Output.Output(__file__, C.Output) + C = Config() + O = Output(__file__, C.Output) O.addMessage(__file__, -1, "INFO", "Received NameChecker batchvariant " + cmd) @@ -375,13 +376,13 @@ class Scheduler() : if not skip : #Run mutalyzer and get values from Output Object 'O' try : - Mutalyzer.process(cmd, C, O) - except Exception, e : + variantchecker.check_variant(cmd, C, O) + except Exception: #Catch all exceptions related to the processing of cmd O.addMessage(__file__, 4, "EBATCHU", "Unexpected error occurred, dev-team notified") import traceback - O.addMessage(__file__, 4, "DEBUG", `traceback.format_exc()`) + O.addMessage(__file__, 4, "DEBUG", repr(traceback.format_exc())) #except finally : #check if we need to update the database @@ -396,8 +397,6 @@ class Scheduler() : if 
batchOutput : outputline += batchOutput[0] - outputline += "\n" - #Output filename = "%s/Results_%s.txt" % (self.__config.resultsDir, i) if not os.path.exists(filename) : @@ -411,7 +410,12 @@ class Scheduler() : else : handle = open(filename, 'a') - handle.write(outputline) + if flags and 'C' in flags: + separator = '\t' + else: + separator = '\n' + + handle.write("%s%s" % (outputline, separator)) handle.close() O.addMessage(__file__, -1, "INFO", "Finished NameChecker batchvariant " + cmd) @@ -433,24 +437,24 @@ class Scheduler() : @type flags: """ - C = Config.Config() - O = Output.Output(__file__, C.Output) - P = Parser.Nomenclatureparser(O) + C = Config() + output = Output(__file__, C.Output) + grammar = Grammar(output) - O.addMessage(__file__, -1, "INFO", - "Received SyntaxChecker batchvariant " + cmd) + output.addMessage(__file__, -1, "INFO", + "Received SyntaxChecker batchvariant " + cmd) - skip = self.__processFlags(O, flags) + skip = self.__processFlags(output, flags) #Process if not skip : - parsetree = P.parse(cmd) + parsetree = grammar.parse(cmd) else : parsetree = None if parsetree : result = "OK" else : - result = "|".join(O.getBatchMessages(3)) + result = "|".join(output.getBatchMessages(3)) #Output filename = "%s/Results_%s.txt" % (self.__config.resultsDir, i) @@ -465,10 +469,15 @@ class Scheduler() : else : handle = open(filename, 'a') - handle.write("%s\t%s\n" % (cmd, result)) + if flags and 'C' in flags: + separator = '\t' + else: + separator = '\n' + + handle.write("%s\t%s%s" % (cmd, result, separator)) handle.close() - O.addMessage(__file__, -1, "INFO", - "Finished SyntaxChecker batchvariant " + cmd) + output.addMessage(__file__, -1, "INFO", + "Finished SyntaxChecker batchvariant " + cmd) #_processSyntaxCheck def _processConversion(self, cmd, i, build, flags) : @@ -491,8 +500,8 @@ class Scheduler() : @type flags: """ - C = Config.Config() - O = Output.Output(__file__, C.Output) + C = Config() + O = Output(__file__, C.Output) variant = cmd 
variants = None gName = "" @@ -505,7 +514,7 @@ class Scheduler() : if not skip : try : #process - converter = Mapper.Converter(build, C, O) + converter = Converter(build, C, O) #Also accept chr accNo variant = converter.correctChrVariant(variant) @@ -513,8 +522,8 @@ class Scheduler() : #TODO: Parse the variant and check for c or g. This is ugly if not(":c." in variant or ":g." in variant) : #Bad name - P = Parser.Nomenclatureparser(O) - parsetree = P.parse(variant) + grammar = Grammar(O) + grammar.parse(variant) #if if ":c." in variant : @@ -540,7 +549,7 @@ class Scheduler() : # variants from a nested lists and store them. cNames = [cName for cName2 in variants.values() \ for cName in cName2] - except Exception, e : + except Exception: #Catch all exceptions related to the processing of cmd O.addMessage(__file__, 4, "EBATCHU", "Unexpected error occurred, dev-team notified") @@ -562,10 +571,15 @@ class Scheduler() : else : handle = open(filename, 'a') - handle.write("%s\t%s\t%s\t%s\n" % (cmd, error, gName, "\t".join(cNames))) + if flags and 'C' in flags: + separator = '\t' + else: + separator = '\n' + + handle.write("%s\t%s\t%s\t%s%s" % (cmd, error, gName, "\t".join(cNames), separator)) handle.close() O.addMessage(__file__, -1, "INFO", - "Finished PositionConverter batchvariant " + cmd) + "Finished PositionConverter batchvariant " + cmd) #_processConversion @@ -585,8 +599,8 @@ class Scheduler() : @type flags: """ - C = Config.Config() - O = Output.Output(__file__, C.Output) + C = Config() + O = Output(__file__, C.Output) O.addMessage(__file__, -1, "INFO", "Received SNP converter batch rs" + cmd) @@ -594,17 +608,16 @@ class Scheduler() : # Todo: Do something with the flags? skip = self.__processFlags(O, flags) + descriptions = [] if not skip : R = Retriever.Retriever(C.Retriever, O, None) - R.snpConvert(cmd, O) + descriptions = R.snpConvert(cmd) # Todo: Is output ok?
outputline = "%s\t" % cmd - outputline += "%s\t" % "|".join(O.getOutput('snp')) + outputline += "%s\t" % "|".join(descriptions) outputline += "%s\t" % "|".join(O.getBatchMessages(3)) - outputline += "\n" - #Output filename = "%s/Results_%s.txt" % (self.__config.resultsDir, i) if not os.path.exists(filename) : @@ -618,14 +631,20 @@ class Scheduler() : else : handle = open(filename, 'a') - handle.write(outputline) + if flags and 'C' in flags: + separator = '\t' + else: + separator = '\n' + + handle.write("%s%s" % (outputline, separator)) handle.close() O.addMessage(__file__, -1, "INFO", "Finished SNP converter batch rs%s" % cmd) #_processSNP - def addJob(self, outputFilter, eMail, queue, fromHost, jobType, Arg1) : + def addJob(self, outputFilter, eMail, queue, columns, fromHost, jobType, + Arg1): """ Add a job to the Database and start the BatchChecker. @@ -635,25 +654,27 @@ class Scheduler() : @type eMail: string @arg queue: A list of jobs @type queue: list + @arg columns: The number of columns. + @type columns: int @arg fromHost: From where is the request made - @type fromHost: + @type fromHost: @arg jobType: The type of Batch Job that should be run @type jobType: @arg Arg1: Batch Arguments, for now only build info @type Arg1: - + @return: jobID @rtype: - + @todo: outputFilter is not used """ #TODO: outputFilter is not used # Add jobs to the database - jobID = self.__database.addJob(outputFilter, eMail, - fromHost, jobType, Arg1) + jobID = self.__database.addJob(outputFilter, eMail, fromHost, jobType, + Arg1) - for inputl in queue : + for i, inputl in enumerate(queue): # NOTE: # This is a very dirty way to skip entries before they are fed # to the batch processes. This is needed for e.g. an empty line @@ -663,26 +684,25 @@ class Scheduler() : # output in terms of input line and outputline. 
if inputl.startswith("~!"): #Dirty Escape inputl = inputl[2:] - if inputl : + if inputl: flag = "S0" # Flag for wrong format - else : + else: flag = "S9" # Flag for empty line inputl = " " #Database doesn't like an empty inputfield - #else - #if - else : + else: flag = None + if (i + 1) % columns: + # Add flag for continuing the current row + flag = '%s%s' % (flag if flag else '', 'C0') self.__database.addToQueue(jobID, inputl, flag) # Spawn child - p = subprocess.Popen(["MutalyzerBatch", - "src/BatchChecker.py"], executable="python") + # Todo: Executable should be in bin/ directory. + #p = subprocess.Popen(["MutalyzerBatch", + # "bin/batch_daemon"], executable="python") #Wait for the BatchChecker to fork of the Daemon - p.communicate() + #p.communicate() return jobID #addJob #Scheduler - -if __name__ == "__main__" : - pass diff --git a/mutalyzer/__init__.py b/mutalyzer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..07ce56981c9808d1bf5c419858cac68a6f97ca96 --- /dev/null +++ b/mutalyzer/__init__.py @@ -0,0 +1,62 @@ +""" +HGVS variant nomenclature checker. +""" + + +import os + + +# On the event of a new release, we update the __version_info__ and __date__ +# package globals and set RELEASE to True. +# Before a release, a development version is denoted by a __version_info__ +# ending with a 'dev' item. Also, RELEASE is set to False (indicating that +# the __date__ value is to be ignored). +# +# We follow a versioning scheme compatible with setuptools [1] where the +# __version_info__ variable always contains the version of the upcoming +# release (and not that of the previous release), post-fixed with a 'dev' +# item. Only in a release commit, this 'dev' item is removed (and added +# again in the next commit).
+# +# [1] http://peak.telecommunity.com/DevCenter/setuptools#specifying-your-project-s-version + +RELEASE = False + +__version_info__ = ('2', '0', 'beta-11', 'dev') +__date__ = '21 Jul 2011' + + +__version__ = '.'.join(__version_info__) +__author__ = 'Leiden University Medical Center' +__contact__ = 'humgen@lumc.nl' +__homepage__ = 'http://mutalyzer.nl' + + +NOMENCLATURE_VERSION_INFO = ('2', '0') +NOMENCLATURE_VERSION = '.'.join(NOMENCLATURE_VERSION_INFO) + +SOAP_NAMESPACE = 'http://mutalyzer.nl/2.0/services' + + +def package_root(): + """ + Get the absolute path to the mutalyzer package. This is useful for + things like locating HTML templates (which are in a subdirectory of the + package). + + @return: Absolute path to the mutalyzer package. + @rtype: string + """ + return os.path.realpath(os.path.split(__file__)[0]) + + +def is_test(): + """ + Check if we are in a test environment. This is determined by the + MUTALYZER_ENV environment variable, which should then be set to 'test'. + + @return: True if we are in a test environment, False otherwise. + @rtype: bool + """ + return 'MUTALYZER_ENV' in os.environ \ + and os.environ['MUTALYZER_ENV'] == 'test' diff --git a/mutalyzer/config.py b/mutalyzer/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cb98f726eaf8cc761243400b8d3bb571fdeda4c9 --- /dev/null +++ b/mutalyzer/config.py @@ -0,0 +1,153 @@ +""" +Module for reading the config file and splitting up the variables into +subclasses. Each of these subclasses is used to configure a specific +module. +""" + + +import os +from configobj import ConfigObj + + +SYSTEM_CONFIGURATION = '/etc/mutalyzer/config' +USER_CONFIGURATION = os.path.join( + os.environ.get('XDG_CONFIG_HOME', None) or \ + os.path.join(os.path.expanduser('~'), '.config'), + 'mutalyzer', 'config') + + +class ConfigurationError(Exception): + pass + + +class Config(): + """ + Read the configuration file and store the data in subclasses.
+ """ + class Retriever(): pass + class Db(): pass + class Output(): pass + class Mutator(): pass + class Scheduler(): pass + class Batch(): pass + class File(): pass + class GenRecord(): pass + + def __init__(self, filename=None): + """ + Initialise the class with variables read from the configuration + file. In principle, this is the only place in the code where a + hard coded constant is used (the name and path to the configuration + file). + + Configuration values are read from two locations, in this order: + 1) /etc/mutalyzer/config + 2) $XDG_CONFIG_HOME/mutalyzer/config + + If both files exist, values defined in the second overwrite values + defined in the first. + + An exception to this system is when the optional {filename} argument + is set. In that case, the locations listed above are ignored and the + configuration is read from {filename}. + + By the DRY-principle, we don't enumerate the configuration variables + for each class in documentation. Instead, what variables are used by + each class is easy to see from the code below. + + @kwarg filename: Optional filename to read configuration from. If + present, this overrides automatic detection of configuration file + location. + @type filename: string + + @raise ConfigurationError: If configuration could not be read. + Reasons are: + - Supplied argument {filename} could not be opened. + - Configuration file could not be parsed. + - Not all variables are present in configuration file. + """ + config = None + + if filename: + config = self._load_config(filename) + else: + if os.path.isfile(SYSTEM_CONFIGURATION): + config = self._load_config(SYSTEM_CONFIGURATION) + if os.path.isfile(USER_CONFIGURATION): + user_config = self._load_config(USER_CONFIGURATION) + if config: + config.merge(user_config) + else: + config = user_config + + if not config: + raise ConfigurationError('Could not locate configuration.') + + try: + + # Set the variables needed by the Retriever module. 
+ self.Retriever.email = config["email"] + self.Retriever.cache = config["cache"] + self.Retriever.cachesize = int(config["cachesize"]) * 1048576 + self.Retriever.maxDldSize = int(config["maxDldSize"]) * 1048576 + self.Retriever.minDldSize = int(config["minDldSize"]) + self.Retriever.lrgURL = config["lrgurl"] + + # Set the variables needed by the Db module. + self.Db.internalDb = config["internalDb"] + self.Db.dbNames = config["dbNames"] + self.Db.LocalMySQLuser = config["LocalMySQLuser"] + self.Db.LocalMySQLhost = config["LocalMySQLhost"] + + # Set the variables needed by the Output module. + self.Output.log = config["log"] + self.Output.datestring = config["datestring"] + self.Output.loglevel = int(config["loglevel"]) + self.Output.outputlevel = int(config["outputlevel"]) + self.Output.debug = config.as_bool('debug') + + # Set the variables needed by the Mutator module. + self.Mutator.flanksize = int(config["flanksize"]) + self.Mutator.maxvissize = int(config["maxvissize"]) + self.Mutator.flankclipsize = int(config["flankclipsize"]) + + # Set the variables needed by the Scheduler module. + self.Scheduler.mailFrom = config["mailFrom"] + self.Scheduler.mailSubject = config["mailSubject"] + self.Scheduler.resultsDir = config["resultsDir"] + self.Scheduler.nameCheckOutHeader = config["nameCheckOutHeader"] + self.Scheduler.syntaxCheckOutHeader = config["syntaxCheckOutHeader"] + self.Scheduler.positionConverterOutHeader = config["positionConverterOutHeader"] + self.Scheduler.snpConverterOutHeader = config["snpConverterOutHeader"] + + # Set the variables needed by the Batch module. + self.Batch.PIDfile = config["PIDfile"] + self.Batch.batchInputMaxSize = int(config["batchInputMaxSize"]) * 1048576 + + # Set the variables needed by the File module. + self.File.bufSize = int(config["bufSize"]) + self.File.header = config["header"] + self.File.threshold = float(config["threshold"]) + + # Set the variables needed by the GenRecord module.
+ self.GenRecord.spliceAlarm = int(config["spliceAlarm"]) + self.GenRecord.spliceWarn = int(config["spliceWarn"]) + + except KeyError as e: + raise ConfigurationError('Missing configuration value: %s' % e) + #__init__ + + def _load_config(self, filename): + """ + Create a ConfigObj from the configuration in {filename}. + """ + try: + return ConfigObj(filename) + except IOError: + raise ConfigurationError('Could not open configuration file: %s' \ + % filename) + except SyntaxError: + raise ConfigurationError('Could not parse configuration file: %s' \ + % filename) + #_load_config +#Config diff --git a/src/Modules/Parser.py b/mutalyzer/grammar.py similarity index 86% rename from src/Modules/Parser.py rename to mutalyzer/grammar.py index 66f912f8d2a43ba67361f0c90109e8c75a17c0be..2facd917bc8592ceab6225cb3f60dc6e4c5ba08f 100644 --- a/src/Modules/Parser.py +++ b/mutalyzer/grammar.py @@ -1,23 +1,21 @@ -#!/usr/bin/python - """ Module for parsing a variant described using the HGVS nomenclature. -A context-free parser is defined here, the nomenclature rules are specified -in Backus-Naur Form (BNF), which is used (with some minor modifications) as source of this -module. +A context-free grammar is defined here, the nomenclature rules are specified +in Backus-Naur Form (BNF), which is used (with some minor modifications) as +source of this module. -@requires: pyparsing +@todo: Update docstrings. +@todo: Automatically generate a LaTeX BNF description from this. """ -# Public classes: -# - Nomenclatureparser ; Parse an input string. from pyparsing import * -class Nomenclatureparser() : + +class Grammar(): """ - Parse an input string. + Defines the HGVS nomenclature grammar. Private variables: - __output ; The output object. @@ -32,7 +30,6 @@ class Nomenclatureparser() : Public methods: - parse(input) ; Parse the input string and return a parse tree. 
+ """ - # New: # Nest -> `{' SimpleAlleleVarSet `}' SimpleAlleleVarSet = Forward() @@ -216,14 +213,20 @@ class Nomenclatureparser() : TransLoc = Suppress('t') + ChromCoords + Suppress('(') + FarLoc + \ Suppress(')') + # We use originalTextFor to retain the original (unparsed) raw variant + # descriptions. It can be retrieved as element[0]. + # See: + # http://packages.python.org/pyparsing/pyparsing.pyparsing-module.html#originalTextFor + # RawVar -> Subst | Del | Dup | VarSSR | Ins | Indel | Inv | Conv # Changed to: # CRawVar -> Subst | Del | Dup | VarSSR | Ins | Indel | Inv | Conv # RawVar -> (CRawVar | (`(' CRawVar `)')) `?'? CRawVar = Group(Subst ^ Del ^ Dup ^ VarSSR ^ \ Ins ^ Indel ^ Inv ^ Conv)("RawVar") - RawVar = (CRawVar ^ (Suppress('(') + CRawVar + Suppress(')'))) + \ - Suppress(Optional('?')) + RawVar = originalTextFor((CRawVar ^ (Suppress('(') + CRawVar + \ + Suppress(')'))) + \ + Suppress(Optional('?')), False) # SingleVar -> Ref RawVar | TransLoc # ExtendedRawVar -> RawVar | `=' | `?' @@ -308,70 +311,50 @@ class Nomenclatureparser() : Var = SingleVar ^ MultiVar ^ MultiTranscriptVar ^ \ UnkEffectVar ^ NoRNAVar ^ SplicingVar - def __init__(self, output) : + + def __init__(self, output): """ - Initialise the class and enable packrat parsing. + Initialise the class and enable packrat parsing. Packrat speeds up + parsing considerably. Private variables (altered): - - __output ; Set to the output object. + - _output ; Set to the output object. - @arg output: The output object - @type output: object + @arg output: The output object. + @type output: mutalyzer.Output.Output """ - - self.__output = output - ParserElement.enablePackrat() # Speed up parsing considerably. + self._output = output + ParserElement.enablePackrat() #__init__ - def parse(self, variant) : + + def parse(self, variant): """ Parse the input string and return a parse tree if the parsing was successful. Otherwise print the parse error and the position in - the input where the error occurred.
+ the input where the error occurred (and return None). Private variables: - - __output ; The output object. + - _output ; The output object. Public variables: - Var ; The top-level rule of our parser. - @arg variant: The input string that needs to be parsed + @arg variant: The input string that needs to be parsed. @type variant: string - @return: The parse tree containing the parse results - @rtype: object + @return: The parse tree containing the parse results, or None in + case of a parsing error. + @rtype: pyparsing.ParseResults """ - - try : - return self.Var.parseString(variant, parseAll = True) - except ParseException, err : - self.__output.addMessage(__file__, 4, "EPARSE", str(err)) - - # Log the input. - #self.__output.addMessage(__file__, 4, "EPARSE", variant) - - # And log the position where the parsing error occurred. + try: + return self.Var.parseString(variant, parseAll=True) + except ParseException as err: + # Log parse error and the position where it occurred. + self._output.addMessage(__file__, 4, 'EPARSE', str(err)) pos = int(str(err).split(':')[-1][:-1]) - 1 - #self.__output.addMessage(__file__, 4, "EPARSE", pos * ' ' + '^') - - #self.__output.addOutput("parseError", str(err)) - self.__output.addOutput("parseError", variant) - self.__output.addOutput("parseError", pos * ' ' + '^') + self._output.addOutput('parseError', variant) + self._output.addOutput('parseError', pos * ' ' + '^') return None - #except #parse -#Nomenclatureparser - -# -# Unit test. 
-# -if __name__ == "__main__" : - P = Nomenclatureparser() - parsetree = P.parse("NM_002001.2:c.[12del]") - parsetree = P.parse("NM_002001.2:c.[(12del)]") - parsetree = P.parse("NM_002001.2:c.[(12del)?]") - parsetree = P.parse("NM_002001.2:c.[(12del);(12del)]") - parsetree = P.parse("NM_002001.2:c.[(12del;12del)]") - parsetree = P.parse("NM_002001.2:c.[((12del)?;12del)?]") - del P -#if +#Grammar diff --git a/src/Modules/Mapper.py b/mutalyzer/mapping.py similarity index 51% rename from src/Modules/Mapper.py rename to mutalyzer/mapping.py index bc753067520132585bf917c5ba63eb545f85623c..0cb27e7e41ce749205850e094546b0176889f5f6 100644 --- a/src/Modules/Mapper.py +++ b/mutalyzer/mapping.py @@ -1,108 +1,58 @@ -#!/usr/bin/python - """ -Search for an NM number in the MySQL database, if the version number -matches, get the start and end positions in a variant. Translate these -positions to I{g.} notation if the variant is in I{c.} notation or vice versa. +Work with the mappings of transcripts to chromosomes. - - If no end position is present, the start position is assumed to be the end - position. - - If the version number is not found in the database, an error message is - generated and a suggestion for an other version is given. - - If the reference sequence is not found at all, an error is returned. - - If no variant is present, the transcription start and end and CDS end in - I{c.} notation is returned. - - If the variant is not accepted by the nomenclature parser, a parse error - will be printed. - -@requires: sys -@requires: Modules.Config -@requires: Modules.Db -@requires: Modules.Crossmap -@requires: Modules.Parser -@requires: Modules.Output -@requires: Modules.Serializers.SoapMessage -@requires: Modules.Serializers.Mapping -@requires: Modules.Serializers.Transcript -@requires: Bio.Seq.reverse_complement -@requires: collections.defaultdict +Instances of the {Converter} class convert between transcript and chromosomal +locations, using the 'Mapping' table. 
+ +The {Updater} class is an abstract base class, subclassed by {NCBIUpdater}. +Instances of {NCBIUpdater} can load NCBI mapping information from a file and +update the database with this information. """ -import sys # argv -from Modules import Config # Config() -from Modules import Db # Db(), get_NM_version(), get_NM_info() -from Modules import Crossmap # Crossmap(), g2x(), x2g(), main2int(), - # offset2int(), info() -from Modules import Parser # Nomenclatureparser(), parse() -from Modules import Output # Output(), LogMsg() -from Modules.Serializers import SoapMessage, Mapping, Transcript from Bio.Seq import reverse_complement from collections import defaultdict -def _sl2il(l) : - """ - Convert a list of strings to a list of integers. +from mutalyzer.grammar import Grammar +from mutalyzer import Db +from mutalyzer import Crossmap +from mutalyzer.models import SoapMessage, Mapping, Transcript - @arg l: A list of strings - @type l: list - @returns: A list of integers - @rtype: list +class Converter(object) : """ + Convert between transcript and chromosomal locations. - return [int(s) for s in l] -#__sl2il - -def _getcoords(C, Loc, Type) : - """ - Return main, offset and g positions given either a position in - I{c.} or in I{g.} notation. - - @arg C: A crossmapper - @type C: object - @arg Loc: A location in either I{g.} or I{c.} notation - @type Loc: object - @arg Type: The reference type - @type Type: string - @returns: triple: - 0. Main coordinate in I{c.} notation - 1. Offset coordinate in I{c.} notation - 2. Position in I{g.} notation - @rtype: triple (integer, integer, integer) - """ + Search for an NM number in the MySQL database, if the version number + matches, get the start and end positions in a variant. Translate these + positions to I{g.} notation if the variant is in I{c.} notation or vice + versa. 
- if Type == 'c' : - main = C.main2int(Loc.MainSgn + Loc.Main) - offset = C.offset2int(Loc.OffSgn + Loc.Offset) - g = C.x2g(main, offset) - main, offset = C.g2x(g) - #if - else : - g = int(Loc.Main) - main, offset = C.g2x(g) - #else - return (main, offset, g) -#__getcoords + - If no end position is present, the start position is assumed to be the + end position. + - If the version number is not found in the database, an error message is + generated and a suggestion for an other version is given. + - If the reference sequence is not found at all, an error is returned. + - If no variant is present, the transcription start and end and CDS end in + I{c.} notation is returned. + - If the variant is not accepted by the nomenclature parser, a parse error + will be printed. -class Converter(object) : + @todo: Refactor anything using {mutalyzer.models} into the {webservice} + module. """ - Converter object docstring - """ - def __init__(self, build, C, O) : """ Initialise the class. - + @arg build: the genome build version of the organism (e.g. hg19 for human genome build version 19) @type build: string @arg C: crossmapper object @type C: object @arg O: output object - @type O: object + @type O: object """ - self.build = None self.__output = O self.__config = C @@ -114,24 +64,6 @@ class Converter(object) : self.dbFields = {} #__init__ - def _changeBuild(self, build) : - """ - @todo document me (figure out what is does) - Change the build if it different from the one previously set?????. - - @arg build: the genome build version of the organism (e.g. hg19 for - human genome build version 19) - @type build: string - """ - - if self.build != build : - self.crossmap = None - self.dbFields = {} - self.build = build - self.__database = Db.Mapping(build, C.Db) - #if - #_changeBuild - def _reset(self) : self.crossmap = None self.dbFields = {} @@ -140,16 +72,15 @@ class Converter(object) : def _parseInput(self, variant) : """ Parse a variant. 
- + @arg variant: variant description @type variant: string - + @return: parsetree object @rtype: object """ - - P = Parser.Nomenclatureparser(self.__output) - parseTree = P.parse(variant) + grammar = Grammar(self.__output) + parseTree = grammar.parse(variant) if not parseTree : self.__output.addMessage(__file__, 4, "EPARSE", "Could not parse the given variant") @@ -171,35 +102,29 @@ class Converter(object) : #_parseInput def _populateFields(self, Fields) : - #TODO: ADD Error Messages, unlikely that CDS info is missing """ Create a Mutalyzer compatible exon list. - + @todo: ADD Error Messages, unlikely that CDS info is missing. - + @arg Fields: dictionary with exon start and end positions taken from the MySQL database @type Fields: dictionary - + @return: Exon list @rtype: list """ + Fields["exon_starts"] = map(int, Fields["exon_starts"].split(',')) + Fields["exon_stops"] = map(int, Fields["exon_stops"].split(',')) + assert(len(Fields["exon_starts"]) == len(Fields["exon_stops"])) - Fields["exonStarts"] =\ - _sl2il(Fields["exonStarts"].split(',')[:-1]) - Fields["exonEnds"] =\ - _sl2il(Fields["exonEnds"].split(',')[:-1]) - assert(len(Fields["exonStarts"]) == len(Fields["exonEnds"])) - - Fields["cdsStart"] = int(Fields["cdsStart"]) - Fields["cdsEnd"] = int(Fields["cdsEnd"]) - - for i in range(len(Fields["exonStarts"])) : - Fields["exonStarts"][i] += 1 + if Fields['cds_start'] and Fields['cds_stop']: + Fields["cds_start"] = int(Fields["cds_start"]) + Fields["cds_stop"] = int(Fields["cds_stop"]) # Create Mutalyzer compatible exon list Fields["exons"] = [] - for exon in zip(Fields["exonStarts"], Fields["exonEnds"]) : + for exon in zip(Fields["exon_starts"], Fields["exon_stops"]) : Fields["exons"].extend(exon) self.dbFields = Fields @@ -212,18 +137,18 @@ class Converter(object) : (zip returns a list of tuples, where the i-th tuple contains the i-th element from each of the argument sequences or iterables. 
dict(arg) creates a new data dictionary, with items taken from arg.) - + @arg values: list of values take from the MySQL database @type values: list - + @return: dictionary with values taken from the MySQL database @rtype: dictionary """ Fields = dict(zip( - ("acc", "txStart", "txEnd", "cdsStart", "cdsEnd", - "exonStarts", "exonEnds", "geneName", - "chrom", "strand", "protAcc", "version"), + ("transcript", "start", "stop", "cds_start", "cds_stop", + "exon_starts", "exon_stops", "gene", + "chromosome", "orientation", "protein", "version"), values)) return self._populateFields(Fields) #_FieldsFromValues @@ -231,7 +156,7 @@ class Converter(object) : def _FieldsFromDb(self, acc, version) : """ Get data from database and populate dbFields dict. - + @arg acc: NM_ accession number (without version) @type acc: string @arg version: version number @@ -282,9 +207,9 @@ class Converter(object) : #_FieldsFromDb def makeCrossmap(self) : - """ + """ Build the crossmapper. - + @todo: ADD Error Messages @return: Cross ; A Crossmap object @@ -295,15 +220,14 @@ class Converter(object) : if not self.dbFields: return None CDS = [] - if self.dbFields["cdsStart"] != self.dbFields["cdsEnd"] : - #The counting from 0 conversion. - CDS = [self.dbFields["cdsStart"] + 1, self.dbFields["cdsEnd"]] + if self.dbFields["cds_start"] and self.dbFields["cds_stop"]: + CDS = [self.dbFields["cds_start"], self.dbFields["cds_stop"]] mRNA = self.dbFields["exons"] # Convert the strand information to orientation. orientation = 1 - if self.dbFields["strand"] == '-' : + if self.dbFields["orientation"] == '-' : orientation = -1 # Build the crossmapper. @@ -311,6 +235,37 @@ class Converter(object) : return self.crossmap #makeCrossmap + @staticmethod + def _getcoords(C, Loc, Type) : + """ + Return main, offset and g positions given either a position in + I{c.} or in I{g.} notation. 
+ + @arg C: A crossmapper + @type C: object + @arg Loc: A location in either I{g.} or I{c.} notation + @type Loc: object + @arg Type: The reference type + @type Type: string + @returns: triple: + 0. Main coordinate in I{c.} notation + 1. Offset coordinate in I{c.} notation + 2. Position in I{g.} notation + @rtype: triple (integer, integer, integer) + """ + if Type == 'c' : + main = C.main2int(Loc.MainSgn + Loc.Main) + offset = C.offset2int(Loc.OffSgn + Loc.Offset) + g = C.x2g(main, offset) + main, offset = C.g2x(g) + #if + else : + g = int(Loc.Main) + main, offset = C.g2x(g) + #else + return (main, offset, g) + #_getcoords + def _coreMapping(self) : """ Build the Mapping ClassSerializer. @@ -320,7 +275,7 @@ class Converter(object) : """ Cross = self.makeCrossmap() - if not Cross : + if not Cross : return None mutation = self.parseTree.RawVar @@ -330,14 +285,14 @@ class Converter(object) : # Get the coordinates of the start position startmain, startoffset, start_g = \ - _getcoords(Cross, mutation.StartLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.StartLoc.PtLoc, + self.parseTree.RefType) # If there is an end position, calculate the coordinates. if mutation.EndLoc : endmain, endoffset, end_g = \ - _getcoords(Cross, mutation.EndLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.EndLoc.PtLoc, + self.parseTree.RefType) else : end_g, endmain, endoffset = start_g, startmain, startoffset @@ -358,10 +313,10 @@ class Converter(object) : """ Returns transcription start, transcription end and CDS stop, if available. 
- + @arg accNo: transcript (NM_) accession number (with or without version) @type accNo: string - + @return: transcription start, transcription end and CDS stop @rtype: triple """ @@ -372,7 +327,7 @@ class Converter(object) : acc, ver = accNo.split('.') self._FieldsFromDb(acc, ver) CM = self.makeCrossmap() - if CM : + if CM : return CM.info() #giveInfo @@ -402,12 +357,12 @@ class Converter(object) : def mainMapping(self, accNo, mutation) : """ One of the entry points (called by the HTML publisher). - + @arg accNo: transcript (NM_) accession number (with version?) @type accNo: string @arg mutation: the 'mutation' (e.g. c.123C>T) @type mutation: string - + @return: ClassSerializer object @rtype: object """ @@ -419,15 +374,21 @@ class Converter(object) : self._FieldsFromDb(acc, version) mapping = self._coreMapping() - soaperrors = self.__output.getSoapMessages() + + errors = [] + for message in self.__output.getMessages(): + soap_message = SoapMessage() + soap_message.errorcode = message.code + soap_message.message = message.description + errors.append(soap_message) if mapping is None : # Something went wrong mapping = Mapping() - mapping.errorcode = len(soaperrors) + mapping.errorcode = len(errors) else : mapping.errorcode = 0 - mapping.messages = soaperrors + mapping.messages = errors return mapping #main_Mapping @@ -449,20 +410,20 @@ class Converter(object) : self._FieldsFromDb(acc, version) #if M = self._coreMapping() - if M is None : + if M is None : return None # construct the variant description - chromAcc = self.__database.chromAcc(self.dbFields["chrom"]) + chromAcc = self.__database.chromAcc(self.dbFields["chromosome"]) f_change = self._constructChange(False) r_change = self._constructChange(True) - if self.dbFields["strand"] == "+" : + if self.dbFields["orientation"] == "+" : change = f_change else : change = r_change if M.start_g != M.end_g : - if self.dbFields["strand"] == '+' : + if self.dbFields["orientation"] == '+' : var_in_g = "g.%s_%s%s" % 
(M.start_g, M.end_g, change) else : var_in_g = "g.%s_%s%s" % (M.end_g, M.start_g, change) @@ -477,8 +438,8 @@ class Converter(object) : """ @arg variant: @type variant: string - - @return: variant ; + + @return: variant ; @rtype: string """ @@ -491,8 +452,8 @@ class Converter(object) : #Remove whitespace variant = variant.replace(" ","") - if variant.startswith("chr") : - preco, postco = variant.split(":") + if variant.startswith('chr') and ':' in variant: + preco, postco = variant.split(':', 1) chrom = self.__database.chromAcc(preco) if chrom is None : self.__output.addMessage(__file__, 4, "ENOTINDB", @@ -512,8 +473,8 @@ class Converter(object) : @type variant: string @arg rt: the return type @type rt: string - - @return: HGVS_notatations ; + + @return: HGVS_notatations ; @rtype: dictionary or list """ @@ -551,9 +512,9 @@ class Converter(object) : #balen continue # construct the variant description - accNo = "%s.%s" % (self.dbFields["acc"],self.dbFields["version"]) - geneName = self.dbFields["geneName"] or "" - strand = self.dbFields["strand"] == '+' + accNo = "%s.%s" % (self.dbFields["transcript"],self.dbFields["version"]) + geneName = self.dbFields["gene"] or "" + strand = self.dbFields["orientation"] == '+' startp = self.crossmap.tuple2string((M.startmain, M.startoffset)) endp = self.crossmap.tuple2string((M.endmain, M.endoffset)) @@ -590,7 +551,7 @@ class Converter(object) : @arg revc: @type revc: - + @return: @rtype: string """ @@ -617,3 +578,254 @@ class Converter(object) : return change #_constructChange #Converter + + +class Updater(object): + """ + Abstract base class for updating the mapping information in the database. + + Subclasses should implement the {load} method, loading new mapping + information into the 'MappingTemp' table. The {merge} method merges this + table into the real 'Mapping' table. + """ + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. 
+ @type build: string + @arg config: A configuration object. + @type config: mutalyzer.config.Config + """ + self.build = build + self.config = config + self.db = Db.Mapping(build, config.Db) + #__init__ + + def load(self, *args, **kwargs): + """ + The implementation of this method in subclasses should load mapping + information in the 'MappingTemp' table. + """ + raise NotImplementedError('Implement this method in subclasses') + #load + + def merge(self): + """ + Merge the 'Mapping' and 'MappingTemp' tables. The result is stored in + the 'Mapping' table, of which a backup is created as 'MappingBackup'. + + @todo: Report how much was updated/added. + """ + self.db.merge_update() + #merge +#Updater + + +class NCBIUpdater(Updater): + """ + Update the mapping information in the database with mapping information + from the NCBI. + + Example usage: + + >>> updater = NCBIUpdater('hg19', mutalyzer.config.Config()) + >>> updater.load('/tmp/seq_gene.md', 'GRCh37.p2-Primary Assembly') + >>> updater.merge() + + """ + COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', + 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', + 'feature_name', 'feature_id', 'feature_type', 'group_label', + 'transcript', 'evidence_code'] + + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. + @type build: string + @arg config: A configuration object. + @type config: mutalyzer.config.Config + """ + self.exon_backlog = {} + super(NCBIUpdater, self).__init__(build, config) + #__init__ + + def load(self, mapping_file, assembly): + """ + Load NCBI mapping information from {mapping_file} into the database. + + The NCBI mapping file consists of entries, one per line, in order of + their location in the genome (more specifically by start location). + Every entry has a 'group_name' column, denoting the assembly it is + from. We only use entries where this value is {assembly}. 
+ + There are four types of entries (for our purposes): + - Gene: Name, identifier, and location of a gene. + - Transcript: Name, gene id, and location of a transcript. + - UTR: Location and transcript of a non-coding exon (or part of it). + - CDS: Location and transcript of a coding exon (or part of it). + + A bit troublesome for us is that exons are split in UTR exons and CDS + exons, with exons overlapping the UTR/CDS border defined as two + separate entries (one of type UTR and one of type CDS). + + Another minor annoyance is that some transcripts (~ 15) are split over + two contigs (NT_*). In that case, they are defined by two entries in + the file, where we should merge them by taking the start position of + the first and the stop position of the second. + + Our strategy is to loop over all entries and store them in three + temporary tables (for genes, transcripts, exons). The entries of type + UTR and CDS are merged to correct exon entries by keeping a backlog + of these entries that can still be modified before storing them in the + database. + + The values from the three temporary tables are aggregated into the + 'MappingTemp' table. + + @arg mapping_file: Path to NCBI mapping information. + @type mapping_file: string + @arg assembly: Use only entries from this assembly (this is the + 'group_name' column in the NCBI mapping file). + @type assembly: string + """ + self._create_temporary_tables() + self._import_mapping(mapping_file, assembly) + self._aggregate_mapping() + self._drop_temporary_tables() + #load + + def _import_mapping(self, mapping_file, assembly): + """ + Import mapping information from {mapping_file} into three temporary + tables. + + @note: We issue a separate INSERT statement to the database for every + entry. An alternative is to write everything to tab-separated + files and load those into the database with LOAD DATA LOCAL INFILE + statements. 
This alternative seems to be about twice as fast, but + for now we stick with the simpler solution. + """ + self.exon_backlog = {} + + with open(mapping_file, 'r') as mapping: + for line in mapping: + if line.startswith('#'): + continue + entry = dict(zip(self.COLUMNS, line.rstrip().split('\t'))) + + # Only use entries from the given assembly. + if entry['group_label'] != assembly: + continue + + # Only use entries on the normal chromosomes. + try: + int(entry['chromosome']) + except ValueError: + if entry['chromosome'] not in 'XY': + continue + + if entry['feature_type'] == 'GENE': + self._import_gene(entry) + elif entry['feature_type'] == 'RNA': + self._import_transcript(entry) + elif entry['feature_type'] in ('UTR', 'CDS'): + self._import_exon(entry) + + self._import_exon_backlog() + #_import_mapping + + def _import_gene(self, entry): + """ + Insert a gene in the database. + """ + self.db.ncbi_import_gene(entry['feature_id'], entry['feature_name']) + #_import_gene + + def _import_transcript(self, entry): + """ + Insert a transcript in the database. + """ + self.db.ncbi_import_transcript( + entry['feature_name'], entry['feature_id'], entry['chromosome'], + int(entry['start']), int(entry['stop']), entry['orientation']) + #_import_transcript + + def _import_exon(self, entry): + """ + Instead of directly inserting each exon in the database, we keep them + in a backlog of at most one exon per transcript. Exons are taken from + the backlog and inserted in the database only when we passed their + genomic stop location by more than one position. + + This way, exons in the backlog can be merged when they are on a + UTR/CDS boundary. 
+ """ + cds = entry['feature_type'] == 'CDS' + entry['start'] = int(entry['start']) + entry['stop'] = int(entry['stop']) + entry['protein'] = entry['feature_name'] if cds else None + entry['cds'] = cds + + self._import_exon_backlog(entry['start'] - 1) + + try: + previous = self.exon_backlog[entry['transcript']] + if previous['cds'] != entry['cds'] \ + and previous['stop'] == entry['start'] - 1: + if entry['cds']: + entry['cds_start'] = entry['start'] + else: + entry['cds_stop'] = previous['stop'] + if 'cds_start' in previous: + entry['cds_start'] = previous['cds_start'] + entry['protein'] = previous['protein'] + entry['start'] = previous['start'] + except KeyError: + pass + + self.exon_backlog[entry['transcript']] = entry + #_import_exon + + def _import_exon_backlog(self, up_to_position=None): + """ + Import exons from the backlog into the database. If the optional + argument {up_to_position} is set, only import exons with a stop + position before this value. + + We explicitly remove imported exons from the backlog, because it + might be suboptimal to keep more than 30,000 exons in there. + """ + for transcript, exon in self.exon_backlog.items(): + if not up_to_position or exon['stop'] < up_to_position: + del self.exon_backlog[transcript] + del exon['cds'] + self.db.ncbi_import_exon( + exon['transcript'], exon['start'], exon['stop'], + exon['cds_start'] if 'cds_start' in exon else None, + exon['cds_stop'] if 'cds_stop' in exon else None, + exon['protein'] or None) + #_import_exon_backlog + + def _aggregate_mapping(self): + """ + Aggregate the genes, transcripts and exons from their temporary + tables into the 'MappingTemp' table. + """ + self.db.ncbi_aggregate_mapping() + #_aggregate_mapping + + def _create_temporary_tables(self): + """ + Create temporary tables needed for loading the NCBI mapping data.
+ """ + self.db.ncbi_create_temporary_tables() + #_create_temporary_tables + + def _drop_temporary_tables(self): + """ + Drop temporary tables needed for loading the NCBI mapping data. + """ + self.db.ncbi_drop_temporary_tables() + #_drop_temporary_tables +#NCBIUpdater diff --git a/src/Modules/Serializers.py b/mutalyzer/models.py similarity index 65% rename from src/Modules/Serializers.py rename to mutalyzer/models.py index 2db8d1509af3fd901b26d3e4a2e6b6b9481acabb..fba12fbdfc751370e6b0955678973b7b3c43d089 100644 --- a/src/Modules/Serializers.py +++ b/mutalyzer/models.py @@ -1,28 +1,26 @@ -#!/usr/bin/python - """ -Collection of Serilizable Objects used by the webservice +Collection of serializable objects used by the SOAP webservice. They extend +from the soaplib ClassModel. + +Default attributes for the soaplib ClassModel: +- nillable = True +- min_occurs = 0 +- max_occurs = 1 -@requires: soaplib.serializers.primitive.String -@requires: soaplib.serializers.primitive.Integer -@requires: soaplib.serializers.primitive.Array -@requires: soaplib.serializers.clazz.ClassSerializer +Additional attribute values for the soaplib String model: +- min_len = 0 +- max_len = 'unbounded' +- pattern = None -@todo: documentation +@todo: Use Mandatory.* models in the ClassModel extensions? +@todo: See if it improves client code if we use Array(_, nillable=False).
""" -from soaplib.core.model.primitive import String, Integer, Boolean -from soaplib.core.model.clazz import ClassModel, Array -# Default attributes for soaplib models: -# nillable = True -# min_occurs = 0 -# max_occurs = 1 -# -# Additional attributes values for String model: -# min_len = 0 -# max_len = "unbounded" -# pattern = None +from soaplib.core.model.primitive import String, Integer, Boolean, DateTime +from soaplib.core.model.clazz import ClassModel, Array + +from mutalyzer import SOAP_NAMESPACE class Mandatory(object): @@ -33,28 +31,26 @@ class Mandatory(object): String = String(min_occurs=1, nillable=False) Integer = Integer(min_occurs=1, nillable=False) Boolean = Boolean(min_occurs=1, nillable=False) - - -# Todo: Use Mandatory.* models in the classmodels below? -# Todo: See if it improves client code if we use Array(_, nillable=False) + DateTime = DateTime(min_occurs=1, nillable=False) +#Mandatory class SoapMessage(ClassModel): """ Type of messages used in SOAP method return values. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE errorcode = Mandatory.String message = Mandatory.String #SoapMessage -class Mapping(ClassModel) : +class Mapping(ClassModel): """ Return type of SOAP method mappingInfo. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE startmain = Integer startoffset = Integer @@ -68,11 +64,11 @@ class Mapping(ClassModel) : #Mapping -class Transcript(ClassModel) : +class Transcript(ClassModel): """ Return type of SOAP method transcriptInfo. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE trans_start = Integer trans_stop = Integer @@ -84,18 +80,18 @@ class RawVariant(ClassModel): """ Used in MutalyzerOutput data type. 
""" - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE description = Mandatory.String visualisation = Mandatory.String #RawVariant -class MutalyzerOutput(ClassModel) : +class MutalyzerOutput(ClassModel): """ Return type of SOAP method runMutalyzer. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE original = String mutated = String @@ -125,11 +121,11 @@ class MutalyzerOutput(ClassModel) : #MutalyzerOutput -class TranscriptNameInfo(ClassModel) : +class TranscriptNameInfo(ClassModel): """ Return type of SOAP method getGeneAndTranscript. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE transcriptName = Mandatory.String productName = Mandatory.String @@ -140,7 +136,7 @@ class ExonInfo(ClassModel): """ Used in TranscriptInfo data type. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE cStart = Mandatory.String gStart = Mandatory.Integer @@ -153,7 +149,7 @@ class ProteinTranscript(ClassModel): """ Used in TranscriptInfo data type. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE name = Mandatory.String id = Mandatory.String @@ -169,7 +165,7 @@ class TranscriptInfo(ClassModel): both trans and CDS. Ivar asked for 'end'. Internally, we have trans 'end' and CDS 'stop'. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE name = Mandatory.String id = Mandatory.String @@ -195,12 +191,47 @@ class TranscriptInfo(ClassModel): #TranscriptInfo -class CheckSyntaxOutput(ClassModel) : +class CheckSyntaxOutput(ClassModel): """ Return type of SOAP method checkSyntax. """ - __namespace__ = 'http://mutalyzer.nl/2.0/services' + __namespace__ = SOAP_NAMESPACE valid = Mandatory.Boolean messages = Array(SoapMessage) #CheckSyntaxOutput + + +class InfoOutput(ClassModel): + """ + Return type of SOAP method info. 
+ """ + __namespace__ = SOAP_NAMESPACE + + version = String + versionParts = Array(String) + releaseDate = String + nomenclatureVersion = String + nomenclatureVersionParts = Array(String) + serverName = String + contactEmail = String +#InfoOutput + + +class CacheEntry(ClassModel): + """ + Used in getCache SOAP method. + """ + __namespace__ = SOAP_NAMESPACE + + name = Mandatory.String + gi = String + hash = Mandatory.String + chromosomeName = String + chromosomeStart = Integer + chromosomeStop = Integer + chromosomeOrientation = Integer + url = String + created = Mandatory.DateTime + cached = String +#CacheEntry diff --git a/src/Modules/Mutator.py b/mutalyzer/mutator.py similarity index 94% rename from src/Modules/Mutator.py rename to mutalyzer/mutator.py index a913397ae094269556202566304553c9121018a7..9065205016dfb634fd5c24f3d115f0a5285ba491 100644 --- a/src/Modules/Mutator.py +++ b/mutalyzer/mutator.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - """ Module for mutating a string. @@ -11,22 +9,15 @@ visualisation of each raw variant within a combined variant is made and effects on restriction sites are also analysed. The original as well as the mutated string are stored here. - -@requires: itertools.izip_longest -@requires: Bio.Restriction -@requires: Bio.Seq.Seq -@requires: Bio.Alphabet.IUPAC.IUPACAmbiguousDNA -@requires: Bio.Seq.reverse_complement """ -# Public classes: -# - Mutator ; Mutate a string and register all shift points. -from itertools import izip_longest +from mutalyzer import util from Bio import Restriction from Bio.Seq import Seq from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA -from Bio.Seq import reverse_complement # reverse_complement() +from Bio.Seq import reverse_complement + class Mutator() : """ @@ -43,6 +34,8 @@ class Mutator() : where the modifications in length are stored. Each first element of the tuples in this list is unique, each second element is non-zero. + - __removed_sites ; Set of splice sites to ignore in mutated + string. 
- __restrictionBatch ; Public variables: @@ -106,6 +99,7 @@ class Mutator() : self.__config = config self.__output = output self.__shift = [] + self.__removed_sites = set() self.__restrictionBatch = Restriction.RestrictionBatch([], ['N']) self.orig = orig @@ -171,7 +165,7 @@ class Mutator() : ret = [] for i in d.keys() : - for j in d[i] : + for _ in d[i] : ret.append(str(i)) return ret @@ -363,6 +357,21 @@ class Mutator() : return ret #shiftpos + def add_removed_sites(self, sites): + """ + Add sites to the set of splice sites to ignore in the mutated string. + + @arg sites: A list of splice sites to ignore. + @type sites: list of int + + @todo: Resulting list of ignored sites should always be even. + @todo: Don't remove CDS start/stop, as happens e.g. with + AL449423.14(CDKN2A_v002):c.5_400del. + """ + for site in sites: + self.__removed_sites.add(site) + #add_removed_sites + def newSplice(self, sites) : """ Generate a list of new splice sites. @@ -374,7 +383,7 @@ class Mutator() : @rtype: list of int - Example 1 (DNA): NG_012772.1 + Example 1 (DNA): NG_012772.1(BRCA2_v001) ...---------[=========]----------... ^ ^ @@ -430,9 +439,9 @@ class Mutator() : new_sites = [] - prev_donor = sites[0] - 1 - sites_iter = iter(sites) - for acceptor, donor in izip_longest(sites_iter, sites_iter): + prev_donor = None + filtered_sites = filter(lambda s: s not in self.__removed_sites, sites) + for acceptor, donor in util.grouper(filtered_sites): # We don't want to do the -1+1 dance if # 1) there is a deletion directly before the exon, or @@ -446,7 +455,8 @@ class Mutator() : # Condition 3) makes sure we don't include insertions directly # in front of CDS start in the CDS. It also affects translation # start, but this should be no problem.
- if prev_donor == acceptor - 1 or self.shift_minus_at(acceptor): + if not prev_donor or prev_donor == acceptor - 1 or \ + self.shift_minus_at(acceptor): new_sites.append(self.shiftpos(acceptor)) else: new_sites.append(self.shiftpos(acceptor - 1) + 1) @@ -581,10 +591,3 @@ class Mutator() : self.__output.addOutput("visualisation", visualisation) #dupM #Mutator - -# -# Unit test. -# -if __name__ == "__main__" : - pass -#if diff --git a/src/Modules/Output.py b/mutalyzer/output.py similarity index 72% rename from src/Modules/Output.py rename to mutalyzer/output.py index a8808d38f2ce3dd7bcb94e58e4372cafe8e492b8..53bedeed5aa127318bfdfdb3f41535e2b011effd 100644 --- a/src/Modules/Output.py +++ b/mutalyzer/output.py @@ -1,30 +1,33 @@ -#!/usr/bin/python - """ Module for storing output and messages. -Output is stored as a named list that can be expanded. -Messages can be retrieved at a later time to provide flexibility. Message -levels are defined to increase or decrease the amount of logging and ouput. + +Output is stored as a named list that can be expanded. Messages can be +retrieved at a later time to provide flexibility. Message levels are +defined to increase or decrease the amount of logging and output. + The position of the log file, as well as the levels are defined in the configuration file. Message levels: - - E{-}1 : Log ; Specifically log a message. - - 0 : Debug ; Debug information. - - 1 : Info ; Info. - - 2 : Warning ; Regular warnings. - - 3 : Error ; Serious errors that can be compensated for. - - 4 : Fatal ; Errors that are not recoverable. - - 5 : Off ; Can be used as a log/output level to turn off output. + - -1 : Log ; Specifically log a message. + - 0 : Debug ; Debug information. + - 1 : Info ; Info. + - 2 : Warning ; Regular warnings. + - 3 : Error ; Serious errors that can be compensated for. + - 4 : Fatal ; Errors that are not recoverable. + - 5 : Off ; Can be used as a log/output level to turn off output.
+ +Public classes: + - Message ; Container class for message variables. + - Output ; Output interface for errors, warnings and logging. """ -# Public classes: -# - Message ; Container class for message variables. -# - Output ; Output interface for errors, warnings and logging. -import time # strftime() +import time + +from mutalyzer import util +from mutalyzer.models import SoapMessage + class Output() : """ @@ -45,11 +48,6 @@ class Output() : module. - __del__() ; Close the logfile and clean up. - Private methods: - - __niceName(filename) ; Strip the path and the extention from a - filename. - - __levelToName(level) ; Convert a log level to a readable string. - Public methods: - addMessage(filename, level, code, description) ; Add a message to the message list. @@ -86,7 +84,7 @@ class Output() : self.__config = config self.__outputData = {} self.__messages = [] - self.__instance = self.__niceName(instance) + self.__instance = util.nice_filename(instance) self.__loghandle = open(self.__config.log, "a+") self.__errors = 0 self.__warnings = 0 @@ -111,44 +109,6 @@ class Output() : del i #__del__ - def __niceName(self, filename) : - """ - Strip the path and the extention from a filename. - - @arg filename: A complete path plus extention - @type filename: string - - @return: The bare filename without a path and extention - @rtype: string - """ - - return filename.split('/')[-1].split('.')[0] - #__niceName - - def __levelToName(self, level) : - """ - Convert a log level to a readable string. - - @arg level: A log level (an integer between -1 and 5) - @type level: integer - - @return: A readable description of the log level - @rtype: string - """ - - if level == 0 : - return "Debug: " - if level == 1 : - return "Info: " - if level == 2 : - return "Warning: " - if level == 3 : - return "Error: " - if level == 4 : - return "Fatal: " - return "" - #__levelToName - def addMessage(self, filename, level, code, description) : """ Add a message to the message list. 
@@ -172,23 +132,23 @@ class Output() : @arg code: Error code of the message @arg description: Description of the message """ - - niceName = self.__niceName(filename) + nice_name = util.nice_filename(filename) + message = Message(nice_name, level, code, description) # Append a new message object to the messages list. - self.__messages.append(Message(niceName, level, code, description)) + self.__messages.append(message) - if level == 2 : + if level == 2: self.__warnings += 1 - if level > 2 : + if level > 2: self.__errors += 1 # Log the message if the message is important enough, or if it is only # meant to be logged (level -1). if level >= self.__config.loglevel or level == -1 : self.__loghandle.write(time.strftime( - self.__config.datestring + ' ') + "%s (%s) %s: %s%s\n" % ( - self.__instance, niceName, code, self.__levelToName(level), + self.__config.datestring + ' ') + "%s (%s) %s: %s: %s\n" % ( + self.__instance, nice_name, code, message.named_level(), description)) self.__loghandle.flush() #if @@ -205,45 +165,25 @@ class Output() : @return: A list of messages @rtype: list """ - - ret = [] - for i in self.__messages : - if i.level >= self.__config.outputlevel : - #print "%s(%s): %s" % (self.__levelToName(i.level), i.origin, - # i.description) - ret.append("%s(%s): %s" % (self.__levelToName(i.level), - i.origin, i.description)) - return ret + return filter(lambda m: m.level >= self.__config.outputlevel, + self.__messages) #getMessages - def getSoapMessages(self): + def getMessagesWithErrorCode(self, errorcode): """ - Returns a list of SoapMessages for over the wire + Retrieve all messages that have a specific error code. Private variables: - - __messages ; The messages list. - - __config ; The variable outputlevel is used. + - __messages ; The messages list. 
- @requires: Modules.Serializers.SoapMessage + @arg errorcode: The error code to filter on + @type errorcode: string - @return: list of SoapMessages + @return: A filtered list @rtype: list """ - - #TODO: MOVE to top if works - from Modules.Serializers import SoapMessage - - ret = [] - for i in self.__messages: - if i.level >= self.__config.outputlevel: - mess = SoapMessage() - mess.errorcode = i.code - mess.message = i.description - ret.append(mess) - #if - #for - return ret - #getSoapMessages + return filter(lambda m: m.code == errorcode, self.__messages) + #getMessagesWithErrorCode def getBatchMessages(self, level): """ @@ -259,12 +199,13 @@ class Output() : @return: list of Messages @rtype: list """ - ret = [] lastorigin = "" for i in self.__messages: if i.level >= level: - if lastorigin == "Parser": #Only one parse error + # Todo: We changed this from 'Parser' to 'grammar', does this + # still work? + if lastorigin == 'grammar': #Only one parse error continue lastorigin = i.origin ret.append("(%s): %s" % (i.origin, i.description)) @@ -273,7 +214,6 @@ class Output() : return ret #getBatchMessages - def addOutput(self, name, data) : """ If the output dictionary already has a node with the specified @@ -288,7 +228,6 @@ class Output() : @arg data: The data to be stored at this node @type data: object """ - if self.__outputData.has_key(name) : self.__outputData[name].append(data) else : @@ -304,24 +243,25 @@ class Output() : @arg name: Name of a node in the output dictionary @type name: string - + @return: output dictionary @rtype: dictionary """ - if self.__outputData.has_key(name) : return self.__outputData[name] return [] #getOutput - def getIndexedOutput(self, name, index) : + def getIndexedOutput(self, name, index, default=None): """ Return an element of a list, the list is called 'name' in de __outputData dictionary. If either the list or the element does not - exist, return None. + exist, return {default}. @arg name: Name of the list. 
@arg index: Index of the element to be retuned. + @arg default: Default to return if either the list or the element + does not exist. Private variables: - __outputData ; The output dictionary. @@ -329,34 +269,11 @@ class Output() : @return: The requested element or None @rtype: any type """ - if self.__outputData.has_key(name) : if 0 <= index < len(self.__outputData[name]) : return self.__outputData[name][index] - return None - #getFirst - - def getMessagesWithErrorCode(self, errorcode): - """ - Retrieve all messages that have a specific error code. - - Private variables: - - __messages ; The messages list. - - @arg errorcode: The error code to filter on - @type errorcode: string - - @return: A filtered list - @rtype: list - """ - - ret = [] - for i in self.__messages: - if i.code == errorcode: - ret.append(i) - return ret - #getMessagesWithErrorCode - + return default + #getIndexedOutput def Summary(self) : """ @@ -373,7 +290,6 @@ class Output() : - Summary @rtype: integer, integer, string """ - e_s = 's' w_s = 's' if self.__errors == 1 : @@ -419,17 +335,39 @@ class Message() : @arg description: A description of the message @type description: string """ - self.origin = origin self.level = level self.code = code self.description = description #__init__ -#Message -# -# Unit test. -# -if __name__ == "__main__" : - pass -#if + def __repr__(self): + return 'Message("%s", %i, "%s", "%s")' % \ + (self.origin, self.level, self.code, self.description) + #__repr__ + + def __str__(self): + return '%s (%s): %s' % \ + (self.named_level(), self.origin, self.description) + #__str__ + + def named_level(self): + """ + Get message log level as readable string. + + @return: A readable description of the log level. 
+ @rtype: string + """ + if self.level == 0: + return "Debug" + if self.level == 1: + return "Info" + if self.level == 2: + return "Warning" + if self.level == 3: + return "Error" + if self.level == 4: + return "Fatal" + return '' + #named_level +#Message diff --git a/mutalyzer/parsers/__init__.py b/mutalyzer/parsers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e1bd90dd08aa288d05a8c342e2bbae9218a730c --- /dev/null +++ b/mutalyzer/parsers/__init__.py @@ -0,0 +1,3 @@ +""" +Parsers for GenRecord objects. +""" diff --git a/src/Modules/GBparser.py b/mutalyzer/parsers/genbank.py similarity index 93% rename from src/Modules/GBparser.py rename to mutalyzer/parsers/genbank.py index 433df8a8dce66921d0fe4ee053aa0174de691373..0e5bf2eede5a2b87a24812388a03d3125c18ef4b 100644 --- a/src/Modules/GBparser.py +++ b/mutalyzer/parsers/genbank.py @@ -1,27 +1,18 @@ -#!/usr/bin/python - """ -Module contains one public function createGBRecord which returns a +Module contains one public function create_record which returns a mutalyzer GenRecord. Record populated with data from a GenBank file. - -@requires: bz2 -@requires: Db -@requires: Bio.SeqIO -@requires: Bio.Entrez -@requires: GenRecord.PList -@requires: GenRecord.Locus -@requires: GenRecord.Gene -@requires: GenRecord.Record -@requires: GenRecord.GenRecord """ -import bz2 # BZ2Compressor(), BZ2File() -import Db -from Bio import SeqIO, Entrez # read() -from GenRecord import PList, Locus, Gene, Record, GenRecord +import bz2 +from Bio import SeqIO, Entrez + +from mutalyzer.config import Config +from mutalyzer import Db +from mutalyzer.GenRecord import PList, Locus, Gene, Record, GenRecord -class tempGene() : + +class tempGene(): """ Container class for a given gene name. @@ -33,14 +24,14 @@ class tempGene() : - cdsList ; CDS list (including internal splice sites). """ - def __init__(self, name) : + def __init__(self, name): """ Initialise the class for a given gene name. 
- + Public variables: - rnaList ; List of splice sites. - cdsList ; CDS list (including internal splice sites). - + @arg name: Gene name @type name: string """ @@ -51,31 +42,29 @@ class tempGene() : #__init__ #tempGene -class GBparser() : + +class GBparser(): """ @todo: documentation """ - - def __init__(self) : + def __init__(self): """ Initialise the class - + Public variables: - config ; Config object. - + Private variables: - __database ; Db.Cache object - + @requires: Config """ - - import Config - config = Config.Config() + config = Config() Entrez.email = config.Retriever.email self.__database = Db.Cache(config.Db) #__init__ - def __location2pos(self, location) : + def __location2pos(self, location): """ Convert a location object to a tuple of integers. @@ -99,7 +88,7 @@ class GBparser() : return ret #__location2pos - def __locationList2posList(self, locationList) : + def __locationList2posList(self, locationList): """ Convert a list of locations to a list of integers. @@ -132,7 +121,7 @@ class GBparser() : return ret #__locationList2posList - def __transcriptToProtein(self, transcriptAcc) : + def __transcriptToProtein(self, transcriptAcc): """ Try to find the protein linked to a transcript id. @@ -142,7 +131,7 @@ class GBparser() : @arg transcriptAcc: Accession number of the transcript for which we want to find the protein @type transcriptAcc: string - + @return: Accession number of a protein or None if nothing can be found @rtype: string """ @@ -177,7 +166,7 @@ class GBparser() : return proteinAcc #__transcriptToProtein - def __findMismatch(self, productList, direction) : + def __findMismatch(self, productList, direction): """ Find the index of the first or last word that distinguishes one sentence from an other. 
@@ -189,7 +178,7 @@ class GBparser() : @type productList: list of strings @arg direction: The direction in which to search @type direction: integer (1 or -1) - + @return: The index of the word where sentences start to differ @rtype: integer """ @@ -209,7 +198,7 @@ class GBparser() : return 0 #__findMismatch - def __tagByDict(self, locus, key) : + def __tagByDict(self, locus, key): """ Transfer a variable in the qualifiers dictionary to the locus object. If the variable does not exist, set it to the empty string. @@ -226,7 +215,7 @@ class GBparser() : setattr(locus, key, "") #__tagByDict - def __tagLocus(self, locusList) : + def __tagLocus(self, locusList): """ Enrich a list of locus objects (mRNA or CDS) with information used for linking (locus_tag, proteinLink and productTag). Also @@ -282,7 +271,7 @@ class GBparser() : #__tagLocus - def __checkTags(self, locusList, tagName) : + def __checkTags(self, locusList, tagName): """ Check whether all tags in a locus list are unique. Prune all the non unique tags. @@ -311,7 +300,7 @@ class GBparser() : #for #__checkTags - def __matchByRange(self, mrna, cds) : + def __matchByRange(self, mrna, cds): """ Match the mRNA list to the CDS list. @@ -354,7 +343,7 @@ class GBparser() : return 1 # Everything matches, but there is little information. #__matchByRange - def link(self, rnaList, cdsList) : + def link(self, rnaList, cdsList): """ Link mRNA loci to CDS loci (all belonging to one gene). @@ -363,10 +352,10 @@ class GBparser() : method is by looking at the locus_tag, if this fails, we try to match the proteinLink tags, if this also fails, we try the productTag. - + If no link could be found, but there is only one possibility left, the loci are linked too. - + The method that was used to link the loci, is put in the linkmethod variable of the transcript locus. The link variable of the transcript locus is a pointer to the CDS locus. 
Furthermore, the @@ -403,7 +392,7 @@ class GBparser() : i.link = j i.linkMethod = "locus" j.linked = True - print "Linked:", j.locus_tag + #print "Linked:", j.locus_tag break #if # Try the proteinLink tag. @@ -446,17 +435,16 @@ class GBparser() : #for #link - def createGBRecord(self, filename): + def create_record(self, filename): """ - Create a GenRecord.Record from a GenBank file + Create a GenRecord.Record from a GenBank file - @arg filename: The full path to the compressed GenBank file - @type filename: string + @arg filename: The full path to the compressed GenBank file + @type filename: string - @return: A GenRecord.Record instance - @rtype: object (record) + @return: A GenRecord.Record instance + @rtype: object (record) """ - # first create an intermediate genbank record with BioPython file_handle = bz2.BZ2File(filename, "r") biorecord = SeqIO.read(file_handle, "genbank") @@ -516,7 +504,7 @@ class GBparser() : #if #if - if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", + if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", "tmRNA"] : geneDict[geneName].rnaList.append(i) if i.type == "CDS" : @@ -627,5 +615,5 @@ class GBparser() : record.geneList.remove(i) return record - #parseRecord + #create_record #GBparser diff --git a/src/Modules/LRGparser.py b/mutalyzer/parsers/lrg.py similarity index 93% rename from src/Modules/LRGparser.py rename to mutalyzer/parsers/lrg.py index 43fc5954a556e28dd47bb5e8a42907021fd2ae0a..4c2def5e40e07276eea74ddb80bdc29b60d1b835 100644 --- a/src/Modules/LRGparser.py +++ b/mutalyzer/parsers/lrg.py @@ -1,7 +1,5 @@ -#!/usr/bin/python - """ -Module contains one public function createLrgRecord which returns a +Module contains one public function create_record which returns a mutalyzer GenRecord.Record populated with data from a LRG file. A LRG file is an XML formatted file and consists of a fixed and @@ -17,29 +15,24 @@ This module is based on the result of the minidom xml parser. 
NOTE: A strong alternative to the minidom parser would be ElementTree which is added in python2.5. Its main strengths are speed and readability [pythonesque]. (http://docs.python.org/library/xml.etree.elementtree.html) - -@requires: xml.dom.minidom -@requires: xml.parsers.expat.ExpatError -@requires: Bio.Seq.Seq -@requires: Bio.Alphabet.IUPAC """ + +import xml.dom.minidom from Bio.Seq import Seq from Bio.Alphabet import IUPAC -from Modules import GenRecord -import xml.dom.minidom -from xml.parsers.expat import ExpatError # Raised on invalid XML files -__all__ = ["createLrgRecord"] # Only import createLrgRecord from this module +from mutalyzer import GenRecord + -def __debugParsedData(title, data): +def _debug_parsed_data(title, data): """ Output additional data to stdout. Used for debugging the intermediate format used while parsing a LRG file. - + @requires: pprint - - @arg title: + + @arg title: @type title: string @arg data: minidom object @type data: object @@ -48,9 +41,10 @@ def __debugParsedData(title, data): print "#"*79+"\nDEBUG: Start of "+title+"\n"+"#"*79 pprint.pprint(data) print "#"*79+"\nDEBUG: End of "+title+"\n"+"#"*79 -#__debugParsedData +#_debug_parsed_data + -def _getContent(data, refname): +def _get_content(data, refname): """ Return string-content of an XML textnode. @@ -67,7 +61,8 @@ def _getContent(data, refname): return temp[0].lastChild.data.encode("utf8") else: return "" -#_getContent +#_get_content + def _attr2dict(attr): """ @@ -88,7 +83,8 @@ def _attr2dict(attr): return ret #_attr2dict -def createLrgRecord(data): + +def create_record(data): """ Create a GenRecord.Record of a LRG <xml> formatted string. 
@@ -112,7 +108,7 @@ def createLrgRecord(data): # NOTE: To get insight in the structure of the intermediate # nested dictionary format please comment out the following line - #__debugParsedData("Updatable Section",updParsed) + #_debug_parsed_data('Updatable Section', updParsed) # Get the genomic mapping from the Updatable Section -> LRG # NOTE: The mapping is not yet used in the mutalyzer program @@ -125,9 +121,9 @@ def createLrgRecord(data): # from the updatable section. # get sequence from Fixed Section - #assert(_getContent(fixed, "mol_type") == "dna") + #assert(_get_content(fixed, "mol_type") == "dna") record.molType = 'g' - record.seq = Seq(_getContent(fixed, "sequence"), IUPAC.unambiguous_dna) + record.seq = Seq(_get_content(fixed, "sequence"), IUPAC.unambiguous_dna) # Get the genename of the fixed gene in the LRG # and put that gene on top of the geneList. @@ -194,7 +190,8 @@ def createLrgRecord(data): transcription.CDS = CDSPList #for return record -#createLrgRecord +#create_record + def genesFromUpdatable(updParsed): """ @@ -227,6 +224,7 @@ def genesFromUpdatable(updParsed): return genes #genesFromUpdatable + def transcriptsFromParsed(parsedData): """ Populate GenRecord.Locus instances with updatable LRG node data @@ -255,13 +253,11 @@ def transcriptsFromParsed(parsedData): return transcripts #transcriptsFromParsed + def _emptyTranscripts(data): - #TODO: This function can be moved to the GenRecord.checkRecord method """ Populate a GenRecord.Locus instance with minimal data to make the gene compatible with mutalyzer. Data abstracted from the gene. - - @todo: This function can be moved to the GenRecord.checkRecord method. 
@arg data: Data from the gene which is used to populate the create a minimal GenRecord.Locus instance @@ -270,6 +266,8 @@ def _emptyTranscripts(data): @return: List with a single bogus GenRecord.Locus instance, in which location and mRNA are copied from the gene @rtype: list + + @todo: This function can be moved to the GenRecord.checkRecord method. """ transcript = GenRecord.Locus('') transcript.molType = 'n' @@ -281,6 +279,7 @@ def _emptyTranscripts(data): return [transcript,] #_emptyTranscripts + def _transcriptPopulator(trName, trData): """ Populate GenRecord.Locus instance with updatable LRG node data. @@ -289,7 +288,7 @@ def _transcriptPopulator(trName, trData): @type trName: string @arg trData: Data associated with the transcript @type trData: dictionary - + @return: transcript ; GenRecord.Locus instance, populated with the content of the parsed Data @rtype: object @@ -318,6 +317,7 @@ def _transcriptPopulator(trName, trData): return transcript #_transcriptPopulator + def getMapping(rawMapData): """ Collect all necessary info to map the current LRG sequence to the @@ -344,6 +344,7 @@ def getMapping(rawMapData): return ret #getMapping + def parseUpdatable(data): """ Mediator function which transforms the minidom object to a nested dict @@ -362,7 +363,7 @@ def parseUpdatable(data): ret = {"LRG":{}, "NCBI":{}, "Ensembl":{}} annotation_nodes = data.getElementsByTagName("annotation_set") for anno in annotation_nodes: - name = _getContent(anno, "name") + name = _get_content(anno, "name") if name == "LRG": ret["LRG"] = getLrgAnnotation(anno) elif name == "NCBI RefSeqGene": @@ -376,6 +377,7 @@ def parseUpdatable(data): return ret #parseUpdatable + def getLrgAnnotation(data): """ Retrieves three parts of the LRG annotation: @@ -406,10 +408,11 @@ def getLrgAnnotation(data): ret["mapping"] = (mapattr,spanattr,diffs) #for # Get the LRG Gene Name, this is the main gene in this LRG - ret["genename"] = _getContent(data, "lrg_gene_name") + ret["genename"] = _get_content(data, 
"lrg_gene_name") return ret #getLrgAnnotation + def getFeaturesAnnotation(data): """ Retrieves feature annotations from NCBI & Ensembl nodes. @@ -422,9 +425,9 @@ def getFeaturesAnnotation(data): NOTE: an xml node has attributes and elements, this function squashes this ambiguity and collects only the attributes and elements of interest - + @todo: check documentation - + @arg data: updatable section -> Annotations -> NCBI | Ensembl @type data: dictionary @@ -451,18 +454,18 @@ def getFeaturesAnnotation(data): feature = data.getElementsByTagName("features")[0] for gene in feature.getElementsByTagName("gene"): geneAttr = _attr2dict(gene.attributes) - geneLongName = _getContent(gene, "long_name") + geneLongName = _get_content(gene, "long_name") transcripts = {"noFixedId": []} for transcript in gene.getElementsByTagName("transcript"): transAttr = _attr2dict(transcript.attributes) - transLongName = _getContent(transcript, "long_name") + transLongName = _get_content(transcript, "long_name") # Check if the transcript has a protein product proteinProduct =\ transcript.getElementsByTagName("protein_product") if proteinProduct: protein = proteinProduct[0] proteinAttr = _attr2dict(protein.attributes) - proteinLongName = _getContent(protein, "long_name") + proteinLongName = _get_content(protein, "long_name") else: proteinAttr = {} proteinLongName = "" @@ -490,6 +493,3 @@ def getFeaturesAnnotation(data): #for gene return ret #getFeaturesAnnotation - -if __name__ == "__main__": - print "Use the unit tests to test this Module" diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py new file mode 100644 index 0000000000000000000000000000000000000000..2a96446bf2b268e284c1a9b0db6dca4349c20d27 --- /dev/null +++ b/mutalyzer/sync.py @@ -0,0 +1,201 @@ +""" +Module for synchronizing the database with other Mutalyzer instances. 
+""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import os +import re +from datetime import datetime, timedelta +import urllib2 +from suds.client import Client + +from mutalyzer import Retriever + + +DEFAULT_CREATED_SINCE_DAYS = 7 + + +class CacheSync(object): + """ + Synchronize the database cache with other Mutalyzer instances. + """ + def __init__(self, config, output, database): + """ + Instantiate the object. + + @arg config: A configuration object. + @type config: mutalyzer.config.Config.Retriever + @arg output: An output object. + @type output: mutalyzer.output.Output + @arg database: A database object. + @type database: mutalyzer.Db.Cache + """ + self._config = config + self._output = output + self._database = database + + def local_cache(self, created_since=None): + """ + Get all entries in the local cache with creation date {created_since} + or later. + + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datetime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) + """ + if not created_since: + created_since = datetime.today() - \ + timedelta(days=DEFAULT_CREATED_SINCE_DAYS) + + entries = self._database.getGBSince(created_since) + cache = [] + + # Translate each entry to a dictionary and check if it is cached on + # our filesystem. + for entry in entries: + # Note that this way we only include Genbank files, not LRG files.
+ cached = None + if os.path.isfile(os.path.join(self._config.cache, + '%s.gb.bz2' % entry[0])): + cached = '%s.gb' % entry[0] + cache.append({'name': entry[0], + 'gi': entry[1], + 'hash': entry[2], + 'chromosomeName': entry[3], + 'chromosomeStart': entry[4], + 'chromosomeStop': entry[5], + 'chromosomeOrientation': entry[6], + 'url': entry[7], + 'created': entry[8], + 'cached': cached}) + + return cache + + def remote_cache(self, remote_wsdl, created_since=None): + """ + Get all entries in the remote cache with creation date {created_since} + or later. + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datetime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) + """ + self._output.addMessage(__file__, -1, 'INFO', 'Getting remote cache' + ' from %s' % remote_wsdl) + + if not created_since: + created_since = datetime.today() - \ + timedelta(days=DEFAULT_CREATED_SINCE_DAYS) + client = Client(remote_wsdl, cache=None) + cache = client.service.getCache(created_since) + + def cache_entry_from_soap(entry): + """ + Create a nice dictionary out of the CacheEntry object. + """ + entry_dict = {'name': str(entry.name), + 'hash': str(entry.hash), + 'created': entry.created} + for attribute in ('gi', 'chromosomeName', 'url', 'cached'): + entry_dict[attribute] = str(entry[attribute]) \ + if attribute in entry else None + for attribute in ('chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation'): + entry_dict[attribute] = int(entry[attribute]) \ + if attribute in entry else None + return entry_dict + + return map(cache_entry_from_soap, cache.CacheEntry) + + def store_remote_file(self, name, url): + """ + Download a remote file located at {url} and store it as {name}. + + @arg name: Name to store the file under. + @type name: string + @arg url: Url to the remote file.
+ @type url: string + """ + if not re.match('^[\da-zA-Z\._-]+$', name): + return + + # Download remote data + handle = urllib2.urlopen(url) + data = handle.read() + handle.close() + + # Store remote data + retriever = Retriever.GenBankRetriever(self._config, + self._output, + self._database) + retriever.write(data, name, 0) + + def sync_with_remote(self, remote_wsdl, url_template, + days=DEFAULT_CREATED_SINCE_DAYS): + """ + Synchronize the local cache with the remote cache. + + >>> wsdl = 'http://mutalyzer.nl/mutalyzer/services/?wsdl' + >>> template = 'http://mutalyzer.nl/mutalyzer/Reference/{file}' + >>> self.sync_with_remote(wsdl, template) + (14, 3) + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @arg url_template: Formatting string containing a {file} occurrence, + see example usage above. + @type url_template: string + @kwarg days: Only remote entries added this number of days ago or + later are considered. + @type days: int + + @return: The number of entries added to the local cache and the number + of cache files downloaded from the remote site.
+ @rtype: tuple(int, int) + """ + self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync') + + created_since = datetime.today() - timedelta(days=days) + remote_cache = self.remote_cache(remote_wsdl, created_since) + + inserted = downloaded = 0 + + for entry in remote_cache: + if self._database.getHash(entry['name']): + continue + if self._database.getGBFromHash(entry['hash']): + continue + if entry['gi'] and self._database.getGBFromGI(entry['gi']): + continue + self._database.insertGB(entry['name'], + entry['gi'], + entry['hash'], + entry['chromosomeName'], + entry['chromosomeStart'], + entry['chromosomeStop'], + entry['chromosomeOrientation'], + entry['url']) + inserted += 1 + if not entry['chromosomeName'] and not entry['url'] \ + and entry['cached']: + url = url_template.format(file=entry['cached']) + self.store_remote_file(entry['name'], url) + downloaded += 1 + + self._output.addMessage(__file__, -1, 'INFO', + 'Inserted %d entries in the cache,' + ' downloaded %d files.' \ + % (inserted, downloaded)) + self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync') + + return inserted, downloaded diff --git a/templates/about.html b/mutalyzer/templates/about.html similarity index 86% rename from templates/about.html rename to mutalyzer/templates/about.html index 185d818f907c0dd6307fe4def5421ee8bd404551..e9bb78bd56a19874be4b38dcf15f60903ea06a43 100644 --- a/templates/about.html +++ b/mutalyzer/templates/about.html @@ -15,8 +15,8 @@ by Gerben R. Stouten.</li> <li>The position converter interfaces (webservice and WWW) are written by Gerard C. P. Schaafsma.</li> - <li>Development of several smaller features and maintenance - programming is done by Martijn Vermaat.</li> + <li>Current development and maintenance is done by Martijn + Vermaat.</li> </ul> Furthermore we would like to thank the following people for their valuable work on previous versions that acted as a guideline for the @@ -27,10 +27,10 @@ <li>Corinne Bareil.</li> <li>Gerben R. 
Stouten.</li> </ul> - Specifications are given by Peter E. M. Taschner and Johan T. den + Specifications are given by Peter E. M. Taschner and Johan T. den Dunnen.<br> <br> - Since the publication is under development, please use the old + Since the publication is under development, please use the old reference for now when referring to these pages: <a href="http://www.ncbi.nlm.nih.gov/entrez/utils/fref.fcgi?PrId=3058&itool=AbstractPlus-def&uid=18000842&db=pubmed&url=http://dx.doi.org/10.1002/humu.20654"> "Wildeman M et al. (2008). Improving sequence variant @@ -40,7 +40,7 @@ <br> Project development is sponsored by <a href="http://www.gen2phen.org" target="_blank"> - <img src="base/images/gen_2_phen_logo_print.png" + <img src="base/images/gen_2_phen_logo_print.png" width="110" height="50" align="middle" @@ -50,10 +50,13 @@ <a href="http://www.eurogentest.org" target="_blank"> <img src="base/images/Eurogentest.png" width="110" - height="50" + height="50" align="middle" border="0" alt="Eurogentest"></a> + <br><br> + Some icons are copyright © + <a href="http://p.yusukekamiyamane.com/">Yusuke Kamiyamane</a>. 
<br> </div> </body> diff --git a/templates/base/css/style.css b/mutalyzer/templates/base/css/style.css similarity index 92% rename from templates/base/css/style.css rename to mutalyzer/templates/base/css/style.css index ddc3c66248b2cee0debe77f8e9808ecfa29cf979..1368c1bb4517a67d1909403427c269f958c35916 100644 --- a/templates/base/css/style.css +++ b/mutalyzer/templates/base/css/style.css @@ -301,6 +301,31 @@ i { font-family : Arial, Helvetica, sans-serif; } +.messages { + width: 620px; +} + +.debug, .information, .warning, .error { + padding-left: 25px; + background: left top no-repeat; +} + +.debug { + background-image: url('../images/debug.png'); +} + +.information { + background-image: url('../images/info.png'); +} + +.warning { + background-image: url('../images/warning.png'); +} + +.error { + background-image: url('../images/error.png'); +} + .thnormal { font-family: Arial, Helvetica, sans-serif; font-weight: bold; diff --git a/templates/base/images/1x1b.gif b/mutalyzer/templates/base/images/1x1b.gif similarity index 100% rename from templates/base/images/1x1b.gif rename to mutalyzer/templates/base/images/1x1b.gif diff --git a/templates/base/images/1x1w.gif b/mutalyzer/templates/base/images/1x1w.gif similarity index 100% rename from templates/base/images/1x1w.gif rename to mutalyzer/templates/base/images/1x1w.gif diff --git a/templates/base/images/Eurogentest.png b/mutalyzer/templates/base/images/Eurogentest.png similarity index 100% rename from templates/base/images/Eurogentest.png rename to mutalyzer/templates/base/images/Eurogentest.png diff --git a/templates/base/images/LUMC_24x24.png b/mutalyzer/templates/base/images/LUMC_24x24.png similarity index 100% rename from templates/base/images/LUMC_24x24.png rename to mutalyzer/templates/base/images/LUMC_24x24.png diff --git a/templates/base/images/background.gif b/mutalyzer/templates/base/images/background.gif similarity index 100% rename from templates/base/images/background.gif rename to 
mutalyzer/templates/base/images/background.gif diff --git a/templates/base/images/background.gif.old b/mutalyzer/templates/base/images/background.gif.old similarity index 100% rename from templates/base/images/background.gif.old rename to mutalyzer/templates/base/images/background.gif.old diff --git a/templates/base/images/banner.jpg b/mutalyzer/templates/base/images/banner.jpg similarity index 100% rename from templates/base/images/banner.jpg rename to mutalyzer/templates/base/images/banner.jpg diff --git a/templates/base/images/bullit.gif b/mutalyzer/templates/base/images/bullit.gif similarity index 100% rename from templates/base/images/bullit.gif rename to mutalyzer/templates/base/images/bullit.gif diff --git a/templates/base/images/bullitdonker.gif b/mutalyzer/templates/base/images/bullitdonker.gif similarity index 100% rename from templates/base/images/bullitdonker.gif rename to mutalyzer/templates/base/images/bullitdonker.gif diff --git a/templates/base/images/bullitlicht1.gif b/mutalyzer/templates/base/images/bullitlicht1.gif similarity index 100% rename from templates/base/images/bullitlicht1.gif rename to mutalyzer/templates/base/images/bullitlicht1.gif diff --git a/templates/base/images/bullitlicht2.gif b/mutalyzer/templates/base/images/bullitlicht2.gif similarity index 100% rename from templates/base/images/bullitlicht2.gif rename to mutalyzer/templates/base/images/bullitlicht2.gif diff --git a/templates/base/images/bullitmiddel.gif b/mutalyzer/templates/base/images/bullitmiddel.gif similarity index 100% rename from templates/base/images/bullitmiddel.gif rename to mutalyzer/templates/base/images/bullitmiddel.gif diff --git a/templates/base/images/bullitmiddel.gif.old b/mutalyzer/templates/base/images/bullitmiddel.gif.old similarity index 100% rename from templates/base/images/bullitmiddel.gif.old rename to mutalyzer/templates/base/images/bullitmiddel.gif.old diff --git a/templates/base/images/cubic4-17.gif b/mutalyzer/templates/base/images/cubic4-17.gif 
similarity index 100% rename from templates/base/images/cubic4-17.gif rename to mutalyzer/templates/base/images/cubic4-17.gif diff --git a/templates/base/images/cubic4-17c.gif b/mutalyzer/templates/base/images/cubic4-17c.gif similarity index 100% rename from templates/base/images/cubic4-17c.gif rename to mutalyzer/templates/base/images/cubic4-17c.gif diff --git a/mutalyzer/templates/base/images/debug.png b/mutalyzer/templates/base/images/debug.png new file mode 100644 index 0000000000000000000000000000000000000000..555887a28d64bc812c4dfa98a6ff1da1927b7792 Binary files /dev/null and b/mutalyzer/templates/base/images/debug.png differ diff --git a/templates/base/images/detailkaart.jpg b/mutalyzer/templates/base/images/detailkaart.jpg similarity index 100% rename from templates/base/images/detailkaart.jpg rename to mutalyzer/templates/base/images/detailkaart.jpg diff --git a/mutalyzer/templates/base/images/error.png b/mutalyzer/templates/base/images/error.png new file mode 100644 index 0000000000000000000000000000000000000000..517725822ba2859be6811af489efc672fe4117df Binary files /dev/null and b/mutalyzer/templates/base/images/error.png differ diff --git a/templates/base/images/favicon.ico b/mutalyzer/templates/base/images/favicon.ico similarity index 100% rename from templates/base/images/favicon.ico rename to mutalyzer/templates/base/images/favicon.ico diff --git a/templates/base/images/gen_2_phen_logo_print.png b/mutalyzer/templates/base/images/gen_2_phen_logo_print.png similarity index 100% rename from templates/base/images/gen_2_phen_logo_print.png rename to mutalyzer/templates/base/images/gen_2_phen_logo_print.png diff --git a/mutalyzer/templates/base/images/info.png b/mutalyzer/templates/base/images/info.png new file mode 100644 index 0000000000000000000000000000000000000000..bd4f552a8bfccb0abe072437a8762598a562a7a7 Binary files /dev/null and b/mutalyzer/templates/base/images/info.png differ diff --git a/templates/base/images/logoULE.gif 
b/mutalyzer/templates/base/images/logoULE.gif similarity index 100% rename from templates/base/images/logoULE.gif rename to mutalyzer/templates/base/images/logoULE.gif diff --git a/templates/base/images/mutalyzer_logo.png b/mutalyzer/templates/base/images/mutalyzer_logo.png similarity index 100% rename from templates/base/images/mutalyzer_logo.png rename to mutalyzer/templates/base/images/mutalyzer_logo.png diff --git a/templates/base/images/mutalyzer_logo_bw.png b/mutalyzer/templates/base/images/mutalyzer_logo_bw.png similarity index 100% rename from templates/base/images/mutalyzer_logo_bw.png rename to mutalyzer/templates/base/images/mutalyzer_logo_bw.png diff --git a/templates/base/images/project-1.jpg b/mutalyzer/templates/base/images/project-1.jpg similarity index 100% rename from templates/base/images/project-1.jpg rename to mutalyzer/templates/base/images/project-1.jpg diff --git a/mutalyzer/templates/base/images/warning.png b/mutalyzer/templates/base/images/warning.png new file mode 100644 index 0000000000000000000000000000000000000000..9b0460ed47c11361b948e4f7ebf39e2a799f3d8f Binary files /dev/null and b/mutalyzer/templates/base/images/warning.png differ diff --git a/templates/base/js/generator.js b/mutalyzer/templates/base/js/generator.js similarity index 100% rename from templates/base/js/generator.js rename to mutalyzer/templates/base/js/generator.js diff --git a/templates/base/js/index.js b/mutalyzer/templates/base/js/index.js similarity index 100% rename from templates/base/js/index.js rename to mutalyzer/templates/base/js/index.js diff --git a/templates/base/js/interface.js b/mutalyzer/templates/base/js/interface.js similarity index 99% rename from templates/base/js/interface.js rename to mutalyzer/templates/base/js/interface.js index d2adda3c9789a532853ef787a576c8409813f284..be42d8d761caf5f734d48df0d32559bf57c45c1f 100644 --- a/templates/base/js/interface.js +++ b/mutalyzer/templates/base/js/interface.js @@ -98,7 +98,6 @@ function updatePercentage() 
{ http.send(null); } - function linkify(text){ var replacePattern = /([A-Za-z_0-9]+(\.\d+)?(\([A-Za-z0-9]+(_[vi]\d+)?\))?\:[cgpn]\.([-\*]?\d+((\+|-)[ud]?\d+)?)(_([-\*]?\d+((\+|-)[ud]?\d+)?))?(([delinsup]+[ACTGactg]*)|([ACTGactg].[ACTGactg])))/gim; //reference = /([A-Za-z_0-9]+(\.\d+)?(\([A-Za-z0-9]+(_[vi]\d+)?\))?\:[cgpn]\. diff --git a/templates/base/js/m.js b/mutalyzer/templates/base/js/m.js similarity index 100% rename from templates/base/js/m.js rename to mutalyzer/templates/base/js/m.js diff --git a/templates/base/js/menu.js b/mutalyzer/templates/base/js/menu.js similarity index 95% rename from templates/base/js/menu.js rename to mutalyzer/templates/base/js/menu.js index e12206c36e0a4f4878fe87bcca1fed36f571d15d..563bd5c45f90fc2016295089d4908436cc08f0e2 100644 --- a/templates/base/js/menu.js +++ b/mutalyzer/templates/base/js/menu.js @@ -128,7 +128,7 @@ function initActive() { var winLoc; winLoc = window.location.href; - winLoc = winLoc.replace(/http:\/\/[^\/]*\//, ""); + winLoc = winLoc.replace(/https?:\/\/[^\/]*\//, ""); if (winLoc.match("~")) winLoc = winLoc.replace(/[^\/]*\//, ""); @@ -141,6 +141,9 @@ function initActive() { winLoc = winLoc.replace("#", ""); a_Act = winLoc.split("/"); + // This is a Quick Hack (tm) + if (a_Act[0] == '2.0' || a_Act[0] == 'mutalyzer') + a_Act.shift(); a_navAct = a_Act; alterActive(a_Act, 1); diff --git a/templates/batch.html b/mutalyzer/templates/batch.html similarity index 84% rename from templates/batch.html rename to mutalyzer/templates/batch.html index 2143f8c55ca206949dd1e8904a9085c9d284f387..95f55ff06f264906fa48f8609622d1499fa46cfa 100644 --- a/templates/batch.html +++ b/mutalyzer/templates/batch.html @@ -19,7 +19,18 @@ and the maximum size is <span tal:content = "maxSize"></span> megabytes. 
</p> <h5>We accept two types of input files, you can download examples below</h5> - <h5>Old Style: + <h5>New Style <a href="downloads/batchtestnew.txt">Download Example File</a></h5> + <div style="padding-left:20px; width:400px"> + <p>This file format has no header-row. Each row consists of one or + more tab delimited fields, where every field contains a single + variant description. Note that all rows must have the same number + of fields.</p> + <table> + <tr><td>AB026906.1:c.274G>T</td></tr> + <tr><td>AL449423.14(CDKN2A_v002):c.5_400del</td></tr> + </table> + </div> + <h5>Old Style: <a href="downloads/batchtestold.txt">Download Example File</a></h5> <div style="padding-left:20px; width:400px"> <p >This file format has a header-row, which consists of @@ -34,15 +45,6 @@ </tr> </table> </div> - <h5>New Style <a href="downloads/batchtestnew.txt">Download Example File</a></h5> - <div style="padding-left:20px; width:400px"> - <p>This file format has no header-row and no columns. - Instead each row contains a single variant for the Batch.</p> - <table> - <tr><td>AB026906.1:c.274G>T</td></tr> - <tr><td>AL449423.14(CDKN2A_v002):c.5_400del<td></tr> - </table> - </div> <h5>Output Format</h5> <div style="padding-left:20px; width:400px"> <p>The output of a Mutalyzer Batch run is a CSV file, which has a @@ -61,7 +63,7 @@ <td> <select id="batchType" name="batchType" onchange="return changeBatch(this)"> - <option + <option tal:repeat = "i batchTypes" tal:content = "structure string:${i}" tal:attributes = "value i"> @@ -83,15 +85,15 @@ </tr> <tr> <td><b>Email</b></td> - <td><input type = "text" - name = "batchEmail" + <td><input type = "text" + name = "batchEmail" tal:attributes = "value lastpost" style = "width:200px"></td> </tr> <tr> <td><b>File</b></td> - <td><input type = "file" - name = "batchFile" + <td><input type = "file" + name = "batchFile" style = "width:200px"></td> </tr> <tr> @@ -109,20 +111,21 @@ } </script> - <script language="javascript" + <script 
language="javascript" tal:content="structure string:document.getElementById('batchType').selectedIndex = ${selected}; document.getElementById('batchRow').style.display = '${hideTypes}'; window.onload = initpage;"> </script> <div tal:condition = "errors" id="errors"> <b>Errors:</b><br /> - <div tal:repeat = "i errors" - tal:replace = "structure string:${i}<br>"> - </div><br /> + <div class="messages"> + <p tal:repeat = "m errors" tal:content = "m/description" + tal:attributes = "class m/class; title string:${m/level} (origin: ${m/origin})"></p> + </div> </div> <div tal:condition = "messages"> <b>Messages</b><br> - <div tal:repeat = "i messages" + <div tal:repeat = "i messages" tal:replace = "structure string:${i}<br>"> </div> <div tal:condition = "jobID"> diff --git a/mutalyzer/templates/check.html b/mutalyzer/templates/check.html new file mode 100644 index 0000000000000000000000000000000000000000..ca1fd558341c2202182036351688ad0a207221c9 --- /dev/null +++ b/mutalyzer/templates/check.html @@ -0,0 +1,273 @@ +<html> + <head> + <link rel="stylesheet" + type="text/css" + href="base/css/style.css"> + <title></title> + </head> + <body> + <div metal:define-macro="content"> + <center> + <h3>Name checker</h3> + </center> + <div style="border: 1px solid grey; background-color: aliceblue; padding: 20px;"> + <div id = "output" tal:condition = "interactive"> + <div> + Please insert the mutation name using the + <span class = "helper" + title = "Human Genome Variation Society standard variant nomenclature"> + <a href = "http://www.hgvs.org/mutnomen">HGVS</a> format</span>:<br> + <Accession Number>.<version + number>(<Gene symbol>):<sequence + type>.<mutation> + </div><br> + Example: AB026906.1:c.274G>T<br> + <br> + <form action = "" method = "post"> + <input + type = "text" + name = "mutationName" + tal:attributes = "value lastpost" + style = "width:100%" + ><br> + <input type="submit" value="Submit"> + <input type="button" value="Clear field" + onClick = "clearForm(this.form, 
'mutationName');"> + </form> + </div> + <div tal:condition = "visualisation"> + <b>Overview of the raw variants:</b><br> + <div tal:repeat = "i visualisation"> + <br> + <div tal:repeat = "j i"> + <div tal:condition = "repeat/j/start" + tal:content = "structure string:Raw variant + ${repeat/i/number}: ${j}"></div> + <tt tal:condition = "not: repeat/j/start" tal:content = "j"> + </tt> + </div> + </div> + </div> <!-- not:visualisation --> + </div> <!-- form area --> + <br> + <div tal:condition = "lastpost"> + <h3>Name checker results:</h3> + <div class="messages"> + <p tal:repeat = "m messages" tal:content = "m/description" + tal:attributes = "class m/class; title string:${m/level} (origin: ${m/origin})"></p> + <p tal:content = "summary"></p> + </div> + <br> + <div tal:condition = "parseError"> + <h4>Details of the parse error:</h4> + <pre tal:content = + "structure string:${parseError/0}<br>${parseError/1}"> + </pre> + The "^" indicates the position where the error occurred. + <br> + </div> <!-- parseError --> + <div tal:condition = "genomicDescription/0"> + <div tal:condition = "genomicDNA" tal:omit-tag=""> + <b>Genomic description:</b><br> + </div> <!-- genomicDNA --> + <div tal:condition = "not:genomicDNA" tal:omit-tag=""> + <b>Description relative to transcription start:</b><br> + (Not for use in LSDBs in case of protein-coding transcripts).<br> + </div> <!-- not:genomicDNA --> + <br> + <tt> + <a tal:content = "genomicDescription/0" + tal:attributes = "href + string:checkForward?mutationName=${genomicDescription/1}"> + </a><br> + </tt> + <br> + <br> + </div> <!-- genomicDescription/0 --> + <div tal:condition = "chromDescription"> + Alternative chromosomal position:<br> + <br> + <tt><div tal:replace = "chromDescription"></div><br></tt> + <br> + <br> + </div> <!-- chromDescription --> + <div tal:condition = "descriptions"> + <b>Affected transcripts:</b><br> + <br> + <tt tal:repeat = "i descriptions"> + <a tal:condition = "i/1" tal:content = "i/0" + tal:attributes = 
+ "href string:checkForward?mutationName=${i/1}"></a><tal + tal:condition = "not:i/1" tal:replace = "i/0"></tal><br> + </tt> + <br> + <br> + </div> <!-- descriptions --> + <div tal:condition = "protDescriptions"> + <b>Affected proteins:</b><br> + <br> + <tt> + <div tal:repeat = "i protDescriptions" + tal:replace = "structure string:${i}<br>"> + </div> + </tt> + <br> + <br> + </div> <!-- protDescriptions --> + <div tal:condition = "transcriptInfo"> + <b>Detailed information about the selected transcript:</b><br> + <br> + <div style = "background-color : aliceblue; padding : 20px; border: 1px solid grey"> + <div tal:condition = "oldProtein"> + <b>Reference protein:</b><br> + <pre><div tal:repeat = "i oldProtein" + tal:replace = "structure string:${i}<br>"> + </div></pre> + <br> + <b>Protein predicted from variant coding sequence:</b><br> + <div tal:condition = "not:newProtein"> + <br> + No change: Predicted protein (not shown) equals reference + protein. <br> + <br> + </div> <!-- not:newProtein --> + <div tal:condition = "newProtein"> + <pre><div tal:repeat = "i newProtein" + tal:replace = "structure string:${i}<br>"> + </div></pre> + </div> <!-- newProtein --> + <br> + <div tal:condition = "altStart"> + <b tal:content = "structure string:Alternative protein + using start codon ${altStart}:"></b><br> + <div tal:condition = "altProtein"> + <pre><div tal:repeat = "i altProtein" + tal:replace = "structure string:${i}<br>"> + </div></pre> + </div> <!-- altProtein --> + <div tal:condition = "not:altProtein"> + <br> + No change: Predicted protein (not shown) equals reference + protein. 
<br> + <br> + </div> <!-- not:altProtein --> + <br> + </div> <!-- altStart --> + </div> <!-- oldProtein --> + <b>Exon information:</b><br> + <table class = "raTable"> + <tr> + <td>Number</td> + <td>Start (g.)</td> + <td>Stop (g.)</td> + <td> + <span tal:condition="transcriptCoding" tal:omit-tag="">Start (c.)</span> + <span tal:condition="not:transcriptCoding" tal:omit-tag="">Start (n.)</span> + </td> + <td> + <span tal:condition="transcriptCoding" tal:omit-tag="">Stop (c.)</span> + <span tal:condition="not:transcriptCoding" tal:omit-tag="">Stop (n.)</span> + </td> + </tr> + <tr tal:repeat = "i exonInfo"> + <td tal:content = "repeat/i/number"></td> + <td tal:repeat = "j i" tal:content = "j"></td> + </tr> + </table> + <div tal:condition="transcriptCoding" tal:omit-tag=""> + <br> + <b><span class = "helper" title = "Coding Sequence">CDS</span> + information:</b><br> + <table class = "raTable"> + <tr> + <td></td> + <td>g.</td> + <td>c.</td> + </tr> + <tr> + <td>Start</td> + <td tal:content = "cdsStart_g"></td> + <td tal:content = "cdsStart_c"></td> + </tr> + <tr> + <td>Stop</td> + <td tal:content = "cdsStop_g"></td> + <td tal:content = "cdsStop_c"></td> + </tr> + <tr> + </tr> + </table> + </div> + </div> <!-- background color --> + <br> + <br> + </div> + <div tal:condition = "restrictionSites"> + <b>Effects on Restriction sites:</b><br> + <br> + <table class = "laTable"> + <tr> + <td>Raw variant</td> + <td>Created</td> + <td>Deleted</td> + </tr> + <tr tal:repeat = "i restrictionSites"> + <td tal:content = "repeat/i/number"></td> + <td> + <span tal:repeat = "j i/0"> + <span tal:condition = "not:repeat/j/end" + tal:content = "structure string:${j},"> + </span> + <span tal:condition = "repeat/j/end" + tal:content = "structure string:${j}"> + </span> + </span> + </td> + <td> + <span tal:repeat = "j i/1"> + <span tal:condition = "not:repeat/j/end" + tal:content = "structure string:${j},"> + </span> + <span tal:condition = "repeat/j/end" + tal:content = "structure 
string:${j}"> + </span> + </span> + </td> + </tr> + </table> + <br> + <br> + </div> <!-- restrictionSites --> + <div tal:condition = "legends"> + <b>Legend:</b><br> + <br> + <table class = "laTable"> + <tr> + <td>Name</td> + <td>ID</td> + <td>Locus tag</td> + <td>Product</td> + <td>Link method</td> + </tr> + <tr tal:repeat = "i legends"> + <td tal:repeat = "j i" tal:content = "j"></td> + </tr> + </table> + <br> + <br> + </div> <!-- legends --> + <div tal:condition = "reference"> + <div tal:condition = "interactive"> + <b>Links:</b><br> + <br> + Download this reference sequence file: + <a tal:content = "reference" + tal:attributes = "href string:Reference/${reference}"></a> + <br> + <br> + </div> <!-- interactive --> + </div> <!-- reference --> + </div> <!-- lastpost --> + </div> + </body> +</html> diff --git a/templates/client-mono.cs b/mutalyzer/templates/client-mono.cs similarity index 100% rename from templates/client-mono.cs rename to mutalyzer/templates/client-mono.cs diff --git a/templates/client-php.php b/mutalyzer/templates/client-php.php similarity index 100% rename from templates/client-php.php rename to mutalyzer/templates/client-php.php diff --git a/templates/client-savon.rb b/mutalyzer/templates/client-savon.rb similarity index 100% rename from templates/client-savon.rb rename to mutalyzer/templates/client-savon.rb diff --git a/templates/client-soappy.py b/mutalyzer/templates/client-soappy.py similarity index 100% rename from templates/client-soappy.py rename to mutalyzer/templates/client-soappy.py diff --git a/templates/client-suds.py b/mutalyzer/templates/client-suds.py similarity index 100% rename from templates/client-suds.py rename to mutalyzer/templates/client-suds.py diff --git a/templates/converter.html b/mutalyzer/templates/converter.html similarity index 64% rename from templates/converter.html rename to mutalyzer/templates/converter.html index 269c0f9a32bf87d2813a7f59228d434f8b175a16..46fdea23bc6b9e69f3017412a2964cb82b99fb9f 100644 --- 
a/templates/converter.html +++ b/mutalyzer/templates/converter.html @@ -45,35 +45,26 @@ </form> </table> <!-- inputform --> - <div tal:condition = "gName"> - <br> - <br> - <h3>Output:</h3> - <br> - <b>Chromosomal Variant:</b><br> - <pre><div tal:replace = "structure string:${gName}<br>"></div></pre> - <b tal:condition = "not:cNames">No transcripts found in mutation region</b><br><br> - </div> - - <div tal:condition = "cNames"> - <b>Found transcripts in mutation region:</b><br> - <pre><div tal:repeat = "i cNames" - tal:replace = "structure string:${i}<br>"> - </div></pre> - </div> - - <div tal:condition = "debug"> - <b>Debug output:</b><br> - <pre><div tal:repeat = "i debug" - tal:replace = "structure string:${i}<br>"> - </div></pre> - </div> - - <div tal:condition = "errors"> - <b>Error output:</b><br> - <pre><div tal:repeat = "i errors" - tal:replace = "structure string:${i}<br>"> - </div></pre> + <div tal:condition = "posted"> + <h3>Results:</h3> + <div class="messages"> + <p tal:repeat = "m messages" tal:content = "m/description" + tal:attributes = "class m/class; title string:${m/level} (origin: ${m/origin})"></p> + <p tal:content = "summary"></p> + </div> + <div tal:condition = "gName"> + <br> + <br> + <b>Chromosomal Variant:</b><br> + <pre><div tal:replace = "structure string:${gName}<br>"></div></pre> + <b tal:condition = "not:cNames">No transcripts found in mutation region</b><br><br> + </div> + <div tal:condition = "cNames"> + <b>Found transcripts in mutation region:</b><br> + <pre><div tal:repeat = "i cNames" + tal:replace = "structure string:${i}<br>"> + </div></pre> + </div> </div> </div> <!-- div metal:define-macro="content" --> diff --git a/templates/disclaimer.html b/mutalyzer/templates/disclaimer.html similarity index 98% rename from templates/disclaimer.html rename to mutalyzer/templates/disclaimer.html index fa71b1b9f6baf983f0648e60222ef1d972a0fe9a..1b1f5304f9bfc1ac4911c581ff49127d48635814 100644 --- a/templates/disclaimer.html +++ 
b/mutalyzer/templates/disclaimer.html @@ -17,7 +17,7 @@ University Medical Center (LUMC) or any of their employees or agents for the effects of any product, process or method that may be produced or adopted by any part, notwithstanding that the formulation of such - product, process or method may be based upon information here provided. + product, process or method may be based upon information here provided. <br> <br> <br> diff --git a/templates/downloads/batchtestnew.txt b/mutalyzer/templates/downloads/batchtestnew.txt similarity index 100% rename from templates/downloads/batchtestnew.txt rename to mutalyzer/templates/downloads/batchtestnew.txt diff --git a/templates/downloads/batchtestold.txt b/mutalyzer/templates/downloads/batchtestold.txt similarity index 100% rename from templates/downloads/batchtestold.txt rename to mutalyzer/templates/downloads/batchtestold.txt diff --git a/templates/downloads/textmining_sample.txt b/mutalyzer/templates/downloads/textmining_sample.txt similarity index 100% rename from templates/downloads/textmining_sample.txt rename to mutalyzer/templates/downloads/textmining_sample.txt diff --git a/templates/exercise.html b/mutalyzer/templates/exercise.html similarity index 98% rename from templates/exercise.html rename to mutalyzer/templates/exercise.html index 4da5215af2fb9cb56139127dfc45279c33cf9465..07eef5207b8fe470bad23164a87c3dbb703a2dd9 100644 --- a/templates/exercise.html +++ b/mutalyzer/templates/exercise.html @@ -76,7 +76,7 @@ well-annotated genomic reference sequence. (See <a href="http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=Nucleotide&dopt=GenBank&val=16944057">AL449423.14</a> or NCBI's new <a href="http://www.ncbi.nlm.nih.gov/RefSeq/RSG/">RefSeqGene</a> - records for example). Alternatively, you can use the GenBank Uploader + records for example). Alternatively, you can use the Reference File Loader options described below to obtain a suitable genomic reference sequence. 
If the annotation of this record contains information about the transcript specified by the NM_ number, Mutalyzer can use the @@ -219,7 +219,7 @@ </p> <p> </p> <p> - 4) The Mutalyzer reference sequence uploader + 4) The Mutalyzer reference file loader </p> <p> <span style="font-size: 12.0pt; font-family: Times New Roman">Users can @@ -248,7 +248,7 @@ </p> <p> <i> - <span style="font-family: Times New Roman">Click the GenBank Uploader + <span style="font-family: Times New Roman">Click the Reference File Loader link in </span> the list on the left side of any Mutalyzer window to see its options. @@ -257,7 +257,7 @@ <p> <span style="font-family: Times New Roman"> If you already have a well-annotated GenBank file on your computer or - stored on a web server, the first two GenBank uploader options + stored on a web server, the first two Reference File Loader options facilitate easy uploading and will return the </span> <span style="font-size: 12.0pt; font-family: Times New Roman"> diff --git a/templates/faq.html b/mutalyzer/templates/faq.html similarity index 98% rename from templates/faq.html rename to mutalyzer/templates/faq.html index acf5217fe47ccdede8b1a6ed04fc46f8dbce92fe..d2aa9eceb5a5f27f4f4170246ea445ad49cb6846 100644 --- a/templates/faq.html +++ b/mutalyzer/templates/faq.html @@ -67,7 +67,7 @@ try to retrieve the underlying sequences to check the sequence variant, but it may lose track of the corresponding positions due to the different levels of assembly and return errors. Users are advised - circumvent this problem by using the Genbank uploader when (part of) + circumvent this problem by using the Reference File Loader when (part of) these NC_ or NT references are used. The <a href="http://www.humgen.nl/mutalyzer_exercise.html"> Mutalyzer exercise</a> provides more detailed information. 
@@ -116,7 +116,7 @@ </p> <p> If you are the curator of an LSDB in need of an appropriate genomic - reference sequence, you can use the options on the GenBank uploader + reference sequence, you can use the options on the Reference File Loader page to select a genomic reference sequence. More information about the selection and modification of reference sequences can be found in the <a href="http://www.humgen.nl/mutalyzer_exercise.html"> @@ -161,7 +161,7 @@ </p> <h3> - Why does the Genbank uploader not work with my local file or the URL + Why does the Reference File Loader not work with my local file or the URL provided? </h3> <p> diff --git a/templates/gbupload.html b/mutalyzer/templates/gbupload.html similarity index 96% rename from templates/gbupload.html rename to mutalyzer/templates/gbupload.html index 7ed6d0af66534179c05da1db57818bc9d65fe0b5..28eaa5276c30e7017973b776671e188f8745b9c9 100644 --- a/templates/gbupload.html +++ b/mutalyzer/templates/gbupload.html @@ -11,8 +11,8 @@ updateVisibility(); } </script> - <center><h3>Reference Sequence Uploader</h3></center> - The Reference Sequence Uploader allows you to use your own reference + <center><h3>Reference File Loader</h3></center> + The Reference File Loader allows you to use your own reference sequence when no appropriate RefSeq, GenBank or LRG file is available. 
<br> Please select one of the options below to upload or retrieve your reference @@ -126,7 +126,7 @@ <div tal:condition = "UD"> <b>Output:</b><br> <br> - Your reference sequence was uploaded successfully.<br> + Your reference sequence was loaded successfully.<br> You now can use mutalyzer with the following accession number as reference: <b tal:content = "UD"></b><br> <a tal:attributes = "href string:Reference/${UD}.gb"> diff --git a/templates/help.html b/mutalyzer/templates/help.html similarity index 98% rename from templates/help.html rename to mutalyzer/templates/help.html index e879d3c532450571b4bb561953b6fc32f40f1343..c6c7f4fdb17af28e089bd6cc6e87505cade9c30e 100644 --- a/templates/help.html +++ b/mutalyzer/templates/help.html @@ -63,7 +63,7 @@ with the Name checker </p> <p> - - <a href="#GenBankUploader">GenBank Uploader</a>: allows you to + - <a href="#GenBankUploader">Refrence File Loader</a>: allows you to upload and use your own reference sequence. </p> <p> @@ -112,7 +112,7 @@ user-defined files in GenBank format, including slices of chromosomal reference sequences. These files are specified by unique <i><a href="#UD">UD identifiers</a></i>, which are returned by Mutalyzer - after upload (See the <a href="#GenBankUploader">GenBank Uploader</a> + after upload (See the <a href="#GenBankUploader">Reference File Loader</a> section for more information).<br> </p> <p> @@ -132,7 +132,7 @@ LRG_1). The <a href="http://www.lrg-sequence.org/"> LRG website</a> lists existing LRG sequences and has an <a href="ftp://ftp.ebi.ac.uk/pub/databases/lrgex/">FTP site</a> for - downloading LRGs. To maintain LRG stability, Mutalyzer's uploader does + downloading LRGs. To maintain LRG stability, Mutalyzer's Reference File Loader does not accept user-defined LRG files. 
</p> @@ -734,7 +734,7 @@ <hr> - <h3>GenBank Uploader Help<a name="GenBankUploader"></a></h3> + <h3>Reference File Loader Help<a name="GenBankUploader"></a></h3> <p> Users can upload their own reference sequence file in <a href="http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html" @@ -754,7 +754,7 @@ >these instructions</a>. </p> <p> - Uploader options: + Loader options: </p> <p> <b>The reference sequence file is a local file</b> @@ -857,14 +857,14 @@ </p> <div style="padding-left:20px; width:400px"> <p> - Your reference sequence was uploaded successfully. You now can use + Your reference sequence was loaded successfully. You now can use mutalyzer with the following accession number as reference: UD_127955523176 <br> Download this reference sequence. </p> </div> <p> - The GenBank uploader uses JavaScript to change the form depending on + The Reference File Loader uses JavaScript to change the form depending on the selected option. In Internet Explorer, forms may not be displayed correctly. Adding Mutalyzer to your trusted sites is one option to solve this. @@ -895,7 +895,7 @@ <a href="http://www.hgvs.org/mutnomen/" >standard human sequence variation nomenclature</a>. When trying to retrieve genomic reference sequences using gene symbols with the - Genbank uploader or when specifying a particular gene in a genomic + Reference File Loader or when specifying a particular gene in a genomic reference sequence, the gene symbol should be similar to that used in the (genome) sequence annotation. </p> @@ -913,7 +913,7 @@ <p> Occasionally, Mutalyzer will display an Internal Server Error message due to unexpected behavior. You can use Mutalyzer's - <a href="https://www.mutalyzer.nl/projects/mutalyzer2/" + <a href="https://humgenprojects.lumc.nl/trac/mutalyzer" >bugtracking system</a> to report errors and send in feature requests. 
</p> diff --git a/templates/index.html b/mutalyzer/templates/index.html similarity index 96% rename from templates/index.html rename to mutalyzer/templates/index.html index 329d53a3875ac2055cc7749102e6c4507921b830..6cab60e15e89ef63448b1205595153cce3de1cf8 100644 --- a/templates/index.html +++ b/mutalyzer/templates/index.html @@ -32,7 +32,7 @@ versa. </li> <li> - The <a href = "upload">GenBank Uploader</a> allows you to upload and + The <a href = "upload">Reference File Loader</a> allows you to load and use your own reference sequence. </li> <li> diff --git a/templates/menu.html b/mutalyzer/templates/menu.html similarity index 97% rename from templates/menu.html rename to mutalyzer/templates/menu.html index 14bebd8d926a91635e53a4e0f10e426172c0868b..9d94389cb9f6569270d2d8321b86e7622fa97601 100644 --- a/templates/menu.html +++ b/mutalyzer/templates/menu.html @@ -298,7 +298,7 @@ 'upload');" onmouseout="navDeAct('base/images/bullitdonker.gif', 'upload');" - class="vertnavsub">GenBank Uploader</a> + class="vertnavsub">Reference File Loader</a> </td> </tr> @@ -391,7 +391,7 @@ <td colspan="3"> <a id="page_bugTrack" onclick="swapActive('bugTrack');" - href="https://www.mutalyzer.nl/projects/mutalyzer2" + href="https://humgenprojects.lumc.nl/trac/mutalyzer" onmouseover="navAct('base/images/bullitlicht1.gif', 'bugTrack');" onmouseout="navDeAct('base/images/bullitdonker.gif', @@ -468,7 +468,7 @@ <td colspan="2"> <a id="page_external_oldmut" onclick="swapActive('external_oldmut');" - href="http://www.mutalyzer.nl/1.0.4_old/" + href="http://132.229.137.14/1.0.4_old/" class="vertnavsub">Mutalyzer 1.0.4</a> </td> </tr> @@ -519,10 +519,16 @@ <center> - <h2 tal:content = "structure string:Mutalyzer ${version}<br> + <h2 tal:condition = "release" + tal:content = "structure string:Mutalyzer ${version}<br> <small><small><small><small> released on ${releaseDate} </small></small></small></small>"></h2> + <h2 tal:condition = "not:release" + tal:content = "structure string:Mutalyzer 
${version}<br> + <small><small><small><small> + development version + </small></small></small></small>"></h2> HGVS nomenclature version <span tal:content = "nomenclatureVersion"></span> </center> diff --git a/templates/nameGenerator.html b/mutalyzer/templates/nameGenerator.html similarity index 90% rename from templates/nameGenerator.html rename to mutalyzer/templates/nameGenerator.html index 7155d4faaba9133b388e127864b765fc714f0cdd..006b6d83cd2bd02e7bd23ec4bf684e189d70e2ee 100644 --- a/templates/nameGenerator.html +++ b/mutalyzer/templates/nameGenerator.html @@ -16,7 +16,7 @@ <tr id="refe"> <td>Reference</td> <td><input type="text" name="refe" size="20" value=""></td> - <td id="refeerror" class="error"></td> + <td id="refeerror" class="errors"></td> </tr> <tr id="seqT"> @@ -33,7 +33,7 @@ <option value="e">EST</option> </select> </td> - <td id="seqTerror" class="error"></td> + <td id="seqTerror" class="errors"></td> </tr> <tr id="tlc" style="display: none; "> @@ -50,7 +50,7 @@ <td> <input type="text" name="gSym" size="20" value=""> </td> - <td id="gSymerror" class="error"></td> + <td id="gSymerror" class="errors"></td> </tr> <tr id="tVar"> @@ -58,7 +58,7 @@ <td> <input type="text" name="tVar" size="20" value=""> </td> - <td id="tVarerror" class="error"></td> + <td id="tVarerror" class="errors"></td> </tr> </tbody> </table> @@ -100,7 +100,7 @@ <style type="text/css"> - .error{ + .errors{ color: #FF0000; font-size: 10px; } @@ -150,7 +150,7 @@ <option value="6">Inversion</option> </select> </td> - <td id="{NMBR}mutTerror" class="error"></td> + <td id="{NMBR}mutTerror" class="errors"></td> </tr> </tbody> @@ -160,7 +160,7 @@ <td> <input type="text" name="V{NMBR}P1" size="20" value=""> </td> - <td id="V{NMBR}P1error" class="error"></td> + <td id="V{NMBR}P1error" class="errors"></td> </tr> </tbody> @@ -170,7 +170,7 @@ <td> <input type="text" name="V{NMBR}P2" size="20" value=""> </td> - <td id="V{NMBR}P2error" class="error"></td> + <td id="V{NMBR}P2error" 
class="errors"></td> </tr> </tbody> @@ -181,7 +181,7 @@ <td> <input type="text" name="V{NMBR}S1" size="20" value=""> </td> - <td id="V{NMBR}S1error" class="error"></td> + <td id="V{NMBR}S1error" class="errors"></td> </tr> </tbody> @@ -191,7 +191,7 @@ <td> <input type="text" name="V{NMBR}S2" size="20" value=""> </td> - <td id="V{NMBR}S2error" class="error"></td> + <td id="V{NMBR}S2error" class="errors"></td> </tr> </tbody> diff --git a/templates/parse.html b/mutalyzer/templates/parse.html similarity index 82% rename from templates/parse.html rename to mutalyzer/templates/parse.html index 44cbdf67600f0f9745f7f69e72bd465d65e7e312..1a92d11caf6dfc8bc07a1a33eee2d8ba8b3c03e2 100644 --- a/templates/parse.html +++ b/mutalyzer/templates/parse.html @@ -29,12 +29,14 @@ onClick = "clearForm(this.form, 'variant');"> </form><br> <div tal:condition = "variant"> - <h3>Output:</h3> + <h3>Variant syntax checker results:</h3> <div tal:condition = "parseError"> - <div tal:repeat = "i messages" - tal:replace = "structure string:${i}<br>"> + <div class="messages"> + <p tal:repeat = "m messages" tal:content = "m/description" + tal:attributes = "class m/class; title string:${m/level} (origin: ${m/origin})"></p> + <p tal:content = "summary"></p> + <br> </div> - <br> <h4>Details of the parse error:</h4> <pre tal:content = "structure string:${parseError/0}<br>${parseError/1}"> diff --git a/templates/skel.html b/mutalyzer/templates/skel.html similarity index 100% rename from templates/skel.html rename to mutalyzer/templates/skel.html diff --git a/templates/snp.html b/mutalyzer/templates/snp.html similarity index 81% rename from templates/snp.html rename to mutalyzer/templates/snp.html index 5c4e5be74e4f4031f1ab714f6ebbc71f7a96a019..bd98c4754c099bf6c458009aa8aca09d8c471def 100644 --- a/templates/snp.html +++ b/mutalyzer/templates/snp.html @@ -32,11 +32,12 @@ </form> <div tal:condition = "lastpost"> <br> - <h3>Results:</h3><br> - <div tal:repeat = "i messages" - tal:replace = "structure 
string:${i}<br>"> + <h3>SNP converter results:</h3> + <div class="messages"> + <p tal:repeat = "m messages" tal:content = "m/description" + tal:attributes = "class m/class; title string:${m/level} (origin: ${m/origin})"></p> + <p tal:content = "summary"></p> </div> - <div tal:replace = "summary"></div><br> <br> <h4>dbSNP rs ID:</h4> <tt tal:content = "lastpost"></tt><br> diff --git a/templates/test.js b/mutalyzer/templates/test.js similarity index 100% rename from templates/test.js rename to mutalyzer/templates/test.js diff --git a/templates/textmining.py b/mutalyzer/templates/textmining.py similarity index 100% rename from templates/textmining.py rename to mutalyzer/templates/textmining.py diff --git a/templates/webservdoc.html b/mutalyzer/templates/webservdoc.html similarity index 100% rename from templates/webservdoc.html rename to mutalyzer/templates/webservdoc.html diff --git a/templates/webservices.html b/mutalyzer/templates/webservices.html similarity index 100% rename from templates/webservices.html rename to mutalyzer/templates/webservices.html diff --git a/templates/wsdl-viewer.xsl b/mutalyzer/templates/wsdl-viewer.xsl similarity index 100% rename from templates/wsdl-viewer.xsl rename to mutalyzer/templates/wsdl-viewer.xsl diff --git a/mutalyzer/util.py b/mutalyzer/util.py new file mode 100644 index 0000000000000000000000000000000000000000..0918d4c2af139fba75c0a71d2d12c5fc8e9aed31 --- /dev/null +++ b/mutalyzer/util.py @@ -0,0 +1,819 @@ +""" +General utility functions. + +@todo: All these functions come from the old Mutalyzer.py file. Try to find + general utility functions in other modules too. +@todo: Use exceptions for failure handling. +@todo: End vs stop. I guess we should use start/stop (end goes with beginning). + Or first/last, or acceptor/donor. Anyway, CDS is always denoted with + start/stop. Important thing is that the semantics should be clear. 
+ Idea: + * CDS -> use start/stop + * splice sites or exons -> acceptor/donor + * translation -> begin/end + * any range of bases -> first/last + * interbase position (if two numbers are used) -> before/after +@todo: We can also group this in separate files in a util/ directory, according + to function (e.g. util/sequences.py, util/positioning.py, etc). +@todo: Unit tests (some can directly be extracted from the docstring). +""" + + +import sys +import os +import math +import time +import inspect +from itertools import izip_longest + +import Bio.Seq +from Bio.Alphabet import IUPAC +from Bio.SeqUtils import seq3 + + +def grouper(iterable, n=2, fillvalue=None): + """ + Make an iterator that takes {n} elements at a time from {iterable}, using + {fillvalue} as default values where we don't have a multiple of {n}. + + >>> for g in grouper('ABCDEFG', 3, 'x'): + ... print g + ('A', 'B', 'C') + ('D', 'E', 'F') + ('G', 'x', 'x') + + >>> splice_sites = [1, 4, 5, 12, 13, 18] + >>> for acceptor, donor in grouper(splice_sites): + ... print 'Exon of length %d' % (donor - acceptor + 1) + Exon of length 4 + Exon of length 8 + Exon of length 6 + + Modified from the example at [1]. + + @arg iterable: Iterable to take groups of elements from. + @type iterable: any iterable type + @kwarg n: Number of elements to take at a time (default 2). + @type n: int + @kwarg fillvalue: Default value to use as padding if length of {iterable} + is not a multiple of {n} (default None). + @return: Iterator that gives elements of {iterable} as groups of {n}. + @rtype: tuple + + [1] http://docs.python.org/library/itertools.html#recipes + """ + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) +#grouper + + +def over_splice_site(first, last, splice_sites): + """ + Check wheter a genomic range {first}_{last} hits a splice site. Hitting + a splice site means that the range covers the exon boundary. 
+ + >>> splice_sites = [1, 4, 8, 12, 19, 28] + >>> over_splice_site(7, 8, splice_sites) + True + >>> over_splice_site(12, 13, splice_sites) + True + >>> over_splice_site(8, 9, splice_sites) + False + >>> over_splice_site(8, 8, splice_sites) + False + + @arg first: The first coordinate of the range in g. notation. + @type first: int + @arg last: The last coordinate of the range in g. notation. + @type last: int + @arg sites: A list of splice sites in g. notation. + @type sites: list(int) + + @return: True if one or more splice sites are hit, False otherwise. + @rtype: boolean + + @todo: Assert number of splice sites is even. + """ + for acceptor, donor in grouper(splice_sites): + if first < acceptor and last >= acceptor: + return True + if donor and first <= donor and last > donor: + return True + + return False +#over_splice_site + + +def splice(s, splice_sites): + """ + Construct the transcript or the coding sequence from a record and a list + of splice sites. + + >>> splice('abcdefghijklmnopqrstuvwxyz', [2, 4, 7, 16, 20, 23]) + 'bcdghijklmnoptuvw' + + @arg s: A DNA sequence. + @type s: string + @arg splice_sites: A list of even length of integers. + @type splice_sites: list + + @return: The concatenation of slices from the sequence that is present + in the GenBank record. + @rtype: string + + @todo: Assert length of splice_sites is even. + """ + transcript = '' + + for acceptor, donor in grouper(splice_sites): + transcript += s[acceptor - 1:donor] + + return transcript +#splice + + +# Todo: refactor +def __nsplice(string, splice_sites, CDS, orientation) : + """ + Just like _splice(), but it only keeps the parts between CDS[0] and + CDS[1] (in the right orientation). + + I guess we could easily do this as a separate step after _splice()? + + @todo: keep this function? 
+ @todo: documentation + """ + + transcript = "" + if orientation == 1 : + for i in range(0, len(splice_sites), 2) : + if CDS[0] >= splice_sites[i] and CDS[0] <= splice_sites[i + 1] : + transcript += string[CDS[0] - 1:splice_sites[i + 1]] + else : + if splice_sites[i] > CDS[0] : + transcript += \ + string[splice_sites[i] - 1:splice_sites[i + 1]] + #for + #if + else : + for i in range(0, len(splice_sites), 2) : + if CDS[1] >= splice_sites[i] and CDS[1] <= splice_sites[i + 1] : + transcript += string[splice_sites[i] - 1:CDS[1]] + else : + if splice_sites[i] < CDS[1] : + transcript += \ + string[splice_sites[i] - 1:splice_sites[i + 1]] + #for + #else + + return transcript +#__nsplice + + +def cds_length(splice_sites): + """ + Calculate the length of a CDS. + + >>> cds_length([2, 4, 7, 16, 20, 23]) + 17 + + @arg splice_sites: The coordinates of the CDS including internal splice + sites. + @type splice_sites: list + + @return: Length of the CDS. + @rtype: int + + @todo: Assert length of splice_sites is even. + """ + l = 0 + + for acceptor, donor in grouper(splice_sites): + l += donor - acceptor + 1 + + return l +#cds_length + + +def format_range(first, last): + """ + Simplify a range to one position when applicable. + + >>> format_range(3, 5) + '3_5' + >>> format_range(3, 3) + '3' + + @arg first: First coordinate of a range. + @type first: integer + @arg last: Second coordinate of a range. + @type last: integer + + @return: {first}_{last} in case of a real range, {first} otherwise. + @rtype: string + """ + if first == last: + return str(first) + + return '%i_%i' % (first, last) +#format_range + + +def roll(s, first, last): + """ + Determine the variability of a variant by looking at cyclic + permutations. Not all cyclic permutations are tested at each time, it + is sufficient to check ``aW'' if ``Wa'' matches (with ``a'' a letter, + ``W'' a word) when rolling to the left for example. 
+ + >>> roll('abbabbabbabb', 4, 6) + (3, 6) + >>> roll('abbabbabbabb', 5, 5) + (0, 1) + >>> roll('abcccccde', 4, 4) + (1, 3) + + @arg s: A reference sequence. + @type s: string + @arg first: First position of the pattern in the reference sequence. + @type first: int + @arg last: Last position of the pattern in the reference sequence. + @type last: int + + @return: tuple: + - left ; Amount of positions that the pattern can be shifted to + the left. + - right ; Amount of positions that the pattern can be shifted to + the right. + @rtype: tuple(int, int) + """ + pattern = s[first - 1:last] # Extract the pattern + pattern_length = len(pattern) + + # Keep rolling to the left as long as a cyclic permutation matches. + minimum = first - 2 + j = pattern_length - 1 + while minimum > -1 and s[minimum] == pattern[j % pattern_length]: + j -= 1 + minimum -= 1 + + # Keep rolling to the right as long as a cyclic permutation matches. + maximum = last + j = 0 + while maximum < len(s) and s[maximum] == pattern[j % pattern_length]: + j += 1 + maximum += 1 + + return first - minimum - 2, maximum - last +#roll + + +def palinsnoop(s): + """ + Check a sequence for a reverse-complement-palindromic prefix (and + suffix). If one is detected, return the length of this prefix. If the + string equals its reverse complement, return -1. + + >>> palinsnoop('TACGCTA') + 2 + >>> palinsnoop('TACGTA') + -1 + >>> palinsnoop('TACGCTT') + 0 + + @arg s: A nucleotide sequence. + @type s: string + + @return: The number of elements that are palindromic or -1 if the string + is a 'palindrome'. + @rtype: string + """ + s_revcomp = Bio.Seq.reverse_complement(s) + + for i in range(int(math.ceil(len(s) / 2.0))): + if s[i] != s_revcomp[i]: + # The first i elements are 'palindromic'. + return i + + # Perfect 'palindrome'. + return -1 +#palinsnoop + + +def longest_common_prefix(s1, s2): + """ + Calculate the longest common prefix of two strings. 
+ + >>> longest_common_prefix('abcdefg', 'abcabcdefg') + 'abc' + >>> longest_common_prefix('abcdefg', 'abcdefg') + 'abcdefg' + + @arg s1: The first string. + @type s1: string + @arg s2: The second string. + @type s2: string + + @return: The longest common prefix of s1 and s2. + @rtype: string + + @todo: This is mostly used just for the length of the returned string, + and we could also return that directly. + """ + pos = 0 + + while pos < min(len(s1), len(s2)) and s1[pos] == s2[pos]: + pos += 1 + + return s1[:pos] +#longest_common_prefix + + +def longest_common_suffix(s1, s2): + """ + Calculate the longest common suffix of two strings. + + >>> longest_common_suffix('abcdefg', 'abcabcdefg') + 'abcdefg' + >>> longest_common_suffix('abcdefg', 'abcefg') + 'efg' + + @arg s1: The first string. + @type s1: string + @arg s2: The second string. + @type s2: string + + @return: The longest common suffix of s1 and s2. + @rtype: string + """ + return longest_common_prefix(s1[::-1], s2[::-1])[::-1] +#longest_common_suffix + + +def trim_common(s1, s2): + """ + Given two strings, trim their longest common prefix and suffix. + + >>> trim_common('abcdefg', 'abcabcdefg') + ('', 'abc', 3, 4) + >>> trim_common('abcxyzefg', 'abcabcg') + ('xyzef', 'abc', 3, 1) + + @arg s1: A string. + @type s1: string + @arg s2: Another string. + @type s2: string + + @return: A tuple of: + - string: Trimmed version of s1. + - string: Trimmed version of s2. + - int: Length of longest common prefix. + - int: Length of longest common suffix. + + @todo: More intelligently handle longest_common_prefix(). + """ + lcp = len(longest_common_prefix(s1, s2)) + lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) + return s1[lcp:len(s1) - lcs], s2[lcp:len(s2) - lcs], lcp, lcs +#trim_common + + +def is_dna(s): + """ + Check whether a string is a DNA string. + + >>> is_dna('TACTGT') + True + >>> is_dna('TACUGT') + False + + @arg s: Any string or Bio.Seq.Seq instance. 
+ @type s: string + + @return: True if the string is a DNA string, False otherwise. + @rtype: boolean + """ + for i in str(s): + if not i in IUPAC.unambiguous_dna.letters: + return False + + return True +#is_dna + + +def in_frame_description(s1, s2) : + """ + Give a description of an inframe difference of two proteins. Also give + the position at which the proteins start to differ and the positions at + which they are the same again. + + >>> in_frame_description('MTAPQQMT', 'MTAQQMT') + ('p.(Pro4del)', 3, 4, 3) + >>> in_frame_description('MTAPQQMT', 'MTAQMT') + ('p.(Pro4_Gln5del)', 3, 5, 3) + >>> in_frame_description('MTAPQQT', 'MTAQQMT') + ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6) + + @arg s1: The original protein. + @type s1: string + @arg s2: The mutated protein. + @type s2: string + + @return: A tuple of: + - string ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(string, int, int, int) + + @todo: More intelligently handle longest_common_prefix(). + @todo: Refactor this code (too many return statements). + """ + if s1 == s2: + # Nothing happened. + return ('p.(=)', 0, 0, 0) + + lcp = len(longest_common_prefix(s1, s2)) + lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) + s1_end = len(s1) - lcs + s2_end = len(s2) - lcs + + # Insertion / Duplication / Extention. 
+ if not s1_end - lcp: + if len(s1) == lcp: + return ('p.(*%i%sext*%i)' % \ + (len(s1) + 1, seq3(s2[len(s1)]), abs(len(s1) - len(s2))), + len(s1), len(s1), len(s2)) + ins_length = s2_end - lcp + + if lcp - ins_length >= 0 and s1[lcp - ins_length:lcp] == s2[lcp:s2_end]: + if ins_length == 1: + return ('p.(%s%idup)' % \ + (seq3(s1[lcp - ins_length]), lcp - ins_length + 1), + lcp, lcp, lcp + 1) + return ('p.(%s%i_%s%idup)' % \ + (seq3(s1[lcp - ins_length]), + lcp - ins_length + 1, seq3(s1[lcp - 1]), lcp), + lcp, lcp, lcp + ins_length) + #if + return ('p.(%s%i_%s%iins%s)' % \ + (seq3(s1[lcp - 1]), lcp, seq3(s1[lcp]), + lcp + 1, seq3(s2[lcp:s2_end])), + lcp, lcp, s2_end) + #if + + # Deletion / Inframe stop. + if not s2_end - lcp: + if len(s2) == lcp: + return ('p.(%s%i*)' % (seq3(s1[len(s2)]), len(s2) + 1), + 0, 0, 0) + + if lcp + 1 == s1_end: + return ('p.(%s%idel)' % (seq3(s1[lcp]), lcp + 1), + lcp, lcp + 1, lcp) + return ('p.(%s%i_%s%idel)' % \ + (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end), + lcp, s1_end, lcp) + #if + + # Substitution. + if s1_end == s2_end and s1_end == lcp + 1: + return ('p.(%s%i%s)' % (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp])), + lcp, lcp + 1, lcp + 1) + + # InDel. + if lcp + 1 == s1_end: + return ('p.(%s%idelins%s)' % \ + (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp:s2_end])), + lcp, lcp + 1, s2_end) + return ('p.(%s%i_%s%idelins%s)' % \ + (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end, + seq3(s2[lcp:s2_end])), + lcp, s1_end, s2_end) +#in_frame_description + + +def out_of_frame_description(s1, s2): + """ + Give the description of an out of frame difference between two + proteins. Give a description of an inframe difference of two proteins. + Also give the position at which the proteins start to differ and the + end positions (to be compatible with the in_frame_description function). 
+ + >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT') + ('p.(Pro4Glnfs*5)', 3, 8, 7) + >>> out_of_frame_description('MTAPQQMT', 'MTAQMT') + ('p.(Pro4Glnfs*4)', 3, 8, 6) + >>> out_of_frame_description('MTAPQQT', 'MTAQQMT') + ('p.(Pro4Glnfs*5)', 3, 7, 7) + + @arg s1: The original protein. + @type s1: string + @arg s2: The mutated protein. + @type s2: string + + @return: A tuple of: + - string ; Protein description of the change. + - int ; First position of the change. + - int ; Last position of the first protein. + - int ; Last position of the second protein. + @rtype: tuple(string, int, int, int) + + @todo: More intelligently handle longest_common_prefix(). + """ + lcp = len(longest_common_prefix(s1, s2)) + + if lcp == len(s2): # NonSense mutation. + if lcp == len(s1): # Is this correct? + return ('p.(=)', 0, 0, 0) + return ('p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1), lcp, len(s1), lcp) + if lcp == len(s1) : + return ('p.(*%i%sext*%i)' % \ + (len(s1) + 1, seq3(s2[len(s1)]), abs(len(s1) - len(s2))), + len(s1), len(s1), len(s2)) + return ('p.(%s%i%sfs*%i)' % \ + (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), len(s2) - lcp + 1), + lcp, len(s1), len(s2)) +#out_of_frame_description + + +def protein_description(cds_stop, s1, s2) : + """ + Wrapper function for the in_frame_description() and + out_of_frame_description() functions. It uses the value cds_stop to + decide which one to call. + + >>> protein_description(34, 'MTAPQQMT', 'MTAQQMT') + ('p.(Pro4Glnfs*5)', 3, 8, 7) + >>> protein_description(33, 'MTAPQQMT', 'MTAQQMT') + ('p.(Pro4del)', 3, 4, 3) + >>> protein_description(33, 'MTAPQQMT', 'TTAQQMT') + ('p.?', 0, 4, 3) + + @arg cds_stop: Position of the stop codon in c. notation (CDS length). + @type cds_stop: int + @arg s1: The original protein. + @type s1: string + @arg s2: The mutated protein. + @type s2: string + + @return: A tuple of: + - string ; Protein description of the change. + - int ; First position of the change. 
+ - int ; Last position of the change in the first protein. + - int ; Last position of the change in the second protein. + @rtype: tuple(string, int, int, int) + """ + if cds_stop % 3: + description = out_of_frame_description(str(s1), str(s2)) + else: + description = in_frame_description(str(s1), str(s2)) + + if not s2 or str(s1[0]) != str(s2[0]): + # Mutation in start codon. + return 'p.?', description[1], description[2], description[3] + + return description +#protein_description + + +# Todo: cleanup +def _insert_tag(s, pos1, pos2, tag1, tag2): + """ + Insert two tags (tag1 and tag2) in string s at positions pos1 and pos2 + respectively if the positions are within the length of s. If not, + either insert one tag or do nothing. If pos1 equals pos2, don't do + anything either. + + @arg s: A sequence. + @type s: + @arg pos1: Position of tag1. + @type pos1: int + @arg pos2: Position of tag2. + @type pos2: int + @arg tag1: Content of tag1. + @type tag1: string + @arg tag2: Content of tag2. + @type tag2: string + + @return: The original sequence, or a sequence with eiter tag1, tag2 or + both tags inserted. + @rtype: string + + @todo: Cleanup (note: only used in print_protein_html). + """ + output = s + block = len(s) + + # Only do something if pos1 != pos2. + if pos1 != pos2: + if 0 <= pos1 < block: + # Insert tag1. + output = output[:pos1] + tag1 + output[pos1:] + if 0 <= pos2 < block: + # Insert tag2. + output = output[:-(block - pos2)] + tag2 \ + + output[-(block - pos2):] + + return output +#_insert_tag + + +# Todo: cleanup +def print_protein_html(s, first, last, O, where): + """ + Make a fancy representation of a protein and put it in the Output + object under the name 'where'. The representation contains HTML tags + and is suitable for viewing in a monospaced font. + + @arg s: A protein sequence. + @type s: string + @arg first: First position to highlight. + @type first: int + @arg last: Last position to highlight. + @type last: int + @arg O: The Output object. 
+ @type O: Modules.Output.Output + @arg where: Location in the {O} object to store the representation. + @type where: string + + @todo: Cleanup. + """ + if not s: return + + block = 10 # Each block consists of 10 amino acids. + line = 6 * block # Each line consists of 6 blocks. + + tag1 = '<b style="color:#FF0000">' # Use this tag for highlighting. + tag2 = '</b>' # And this one to end highlighting. + + # The maximum length for positions is the 10_log of the length of the + # protein. + m = int(math.floor(math.log(len(s), 10)) + 1) + o = 1 + + # Add the first position. + output = '%s ' % str(o).rjust(m) + + for i in range(0, len(s), block): + # Add the blocks. + output += ' ' + _insert_tag(s[i:i + block], first - i, last - i, + tag1, tag2) + if not (i + block) % line and i + block < len(s): + # One line done. + o += line + O.addOutput(where, output) + # Add the position (while escaping any potential highlighting). + output = '<tt style="color:000000;font-weight:normal">%s</tt> ' \ + % str(o).rjust(m) + + # Add last line. + O.addOutput(where, output) +#print_protein_html + + +def generate_id(): + """ + Generates a (somewhat) unique number, using time(). + + Note: Don't use this in very high frequencies, because it utilizes a + short time.sleep() call to get a higher uniqueness. + + @return: A somewhat unique number. + @rtype: int + """ + unique_per_second = 100 + time.sleep(1.0 / unique_per_second) + return int(time.time() * unique_per_second) +#generate_id + + +def nice_filename(filename): + """ + Strip the path and the extention from a filename. + + @arg filename: A complete path plus extention. + @type filename: string + + @return: The bare filename without a path and extention. + @rtype: string + """ + return filename.split('/')[-1].split('.')[0] +#nice_filename + + +def message_info(message): + """ + Construct a dictionary with information about {message}. + + @arg message: A message instance. 
def format_usage(usage=None, keywords=None):
    """
    Format a usage string suitable for printing to the console. Some magic
    is employed so you can usually just call this function without arguments
    to have the calling module's docstring pretty-printed.

    @kwarg usage: The string to format. If omitted, the calling module's
        docstring is used.
    @type usage: string
    @kwarg keywords: A dictionary of (keyword, value) pairs used to format
        the usage string. If it does not contain the key 'command', the
        value of sys.argv[0] is used for it. The dictionary itself is never
        modified.
    @type keywords: dictionary(string, string)

    @return: Formatted usage string. This is {usage} with any entries from
        {keywords} replaced and cut-off at the first occurrence of two
        consecutive empty lines.
    @rtype: string
    """
    if not usage:
        # Frame 1 on the stack is our direct caller.
        caller = inspect.stack()[1]
        usage = inspect.getmodule(caller[0]).__doc__

    # Work on a private copy, so neither a shared default value nor the
    # dictionary of the caller is modified by adding the 'command' key.
    keywords = dict(keywords or {})
    keywords.setdefault('command', sys.argv[0])

    return usage.split('\n\n\n')[0].strip().format(**keywords)
#format_usage
def monkey_patch_suds():
    """
    Apply our monkey-patch for the suds package.

    For some weird reason the location http://www.w3.org/2001/xml.xsd is used
    for the XML namespace, but the W3C seems to respond too slow on that url.
    We therefore use http://www.w3.org/2009/01/xml.xsd which fixes this.

    Call this function before importing anything from the suds package. For
    example, start your file with the following:

        import monkey; monkey.monkey_patch_suds()
        from suds.client import Client

    Calling this function more than once is harmless, the patch is only
    applied the first time.
    """
    from suds.xsd.sxbasic import Import

    # Only apply the patch once.
    already_patched = getattr(Import, 'MUTALYZER_MONKEY_PATCHED', False)
    if already_patched:
        return

    unpatched_open = Import.open

    def _import_open_patched(self, *args, **kwargs):
        # Redirect the slow schema location, then defer to the original
        # implementation.
        if self.location == 'http://www.w3.org/2001/xml.xsd':
            self.location = 'http://www.w3.org/2009/01/xml.xsd'
        return unpatched_open(self, *args, **kwargs)

    Import.MUTALYZER_MONKEY_PATCHED = True
    Import.open = _import_open_patched
#monkey_patch_suds
+ +Notes about naming positions: +* CDS -> use start/stop +* splice sites or exons -> acceptor/donor +* translation -> begin/end +* any range of bases -> first/last +* interbase position (if two numbers are used) -> before/after +""" + + +from operator import itemgetter, attrgetter + +import Bio +import Bio.Seq +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC + +from mutalyzer import util +from mutalyzer.grammar import Grammar +from mutalyzer.mutator import Mutator +from mutalyzer import Retriever +from mutalyzer import GenRecord +from mutalyzer import Db + + +# Exceptions used (privately) in this module. +class _VariantError(Exception): pass +class _RawVariantError(_VariantError): pass +class _UnknownPositionError(_RawVariantError): pass +class _NotDNAError(_RawVariantError): pass +class _PositionsNotConsecutiveError(_RawVariantError): pass +class _LengthMismatchError(_RawVariantError): pass +class _ReferenceMismatchError(_RawVariantError): pass +class _RangeInsertionError(_RawVariantError): pass +class _OffsetSignError(_RawVariantError): + def __init__(self, main, offset, acceptor): + self.main = main + self.offset = offset + self.acceptor = acceptor +class _OffsetNotFromBoundaryError(_RawVariantError): + def __init__(self, main): + self.main = main +class _InvalidExonError(_RawVariantError): + def __init__(self, exon): + self.exon = exon +class _InvalidIntronError(_RawVariantError): + def __init__(self, intron): + self.intron = intron + + +def _is_coding_intronic(loc): + """ + Check whether a location is an intronic c. position. + + @arg loc: A location from the Parser module. + @type loc: pyparsing.ParseResults + + @return: True if the location is c. intronic, False otherwise. + @rtype: boolean + """ + if not loc: + return False + if not loc.PtLoc: + return False + if not loc.PtLoc.Offset: + return False + return True +#_is_coding_intronic + + +def _check_intronic_position(main, offset, transcript): + """ + Check whether a c. 
position is really in an intron: The main coordinate + must be a splice site and the offset coordinate must have the correct + sign. Raise _RawVariantError exception if this check fails. + + @arg main: Main coordinate of the position. + @type main: int + @arg offset: Offset coordinate of the position. + @type offset: int + @arg transcript: Transcript under scrutiny. + @type transcript: object + + @raise _OffsetNotFromBoundary: An offset from a non-boundary position + was used. + @raise _OffsetSignError: Offset from exon boundary has the wrong sign. + + @todo: Check if the offset is really in the flanking intron. + """ + main_g = transcript.CM.x2g(main, 0) + sites = transcript.CM.RNA + + if offset: + oriented_offset = offset * transcript.CM.orientation + try: + i = sites.index(main_g) + if not i % 2: + # Splice acceptor, so sign must be -. + if oriented_offset > 0: + raise _OffsetSignError( + transcript.CM.int2main(main), + transcript.CM.int2offset((main, offset)), + True) + else: + # Splice donor, so sign must be +. + if oriented_offset < 0: + raise _OffsetSignError( + transcript.CM.int2main(main), + transcript.CM.int2offset((main, offset)), + False) + except ValueError: + # The main coordinate is not a splice site. + raise _OffsetNotFromBoundaryError(transcript.CM.int2main(main)) +#_check_intronic_position + + +def _check_argument(argument, reference, first, last, output): + """ + Do several checks for the optional argument of a variant. Raise a + _RawVariantError exception if the checks fail. + + @arg reference: The reference sequence. + @type reference: string + @arg first: Start position of the variant. + @type first: int + @arg last: End position of the variant. + @type last: int + @arg argument: The optional argument. + @type argument: string + @arg output: The Output object. + @type output: mutalyzer.Output.Output + + @raise _LengthMismatchError: The argument is a length, but it does not + match the given range length. 
+ @raise _NotDNAError: The argument should be DNA, but it is not. + @raise _ReferenceMismatchError: The argument is DNA, but it does not + match the given reference. + """ + if not argument: + # The argument is optional, if it is not present, it is correct. + return + + if argument.isdigit(): + # If it is a digit (3_9del7 for example), the digit must be equal to + # the length of the given range. + length = int(argument) + interval = first - last + 1 + if length != interval: + output.addMessage(__file__, 3, 'EARGLEN', + 'The length (%i) differed from that of the ' \ + 'range (%i).' % (length, interval)) + raise _LengthMismatchError() + else: + # If it is not a digit, it muse be DNA. + if not util.is_dna(argument): + output.addMessage(__file__, 4, 'ENODNA', + 'Invalid letters in argument.') + raise _NotDNAError() + # And the DNA must match the reference sequence. + reference_slice = str(reference[first - 1:last]) + if reference_slice != str(argument): + # Todo: Be more informative. + output.addMessage(__file__, 3, 'EREF', + '%s not found at position %s, found %s ' \ + 'instead.' % (argument, + util.format_range(first, last), + reference_slice)) + raise _ReferenceMismatchError() +#_check_argument + + +def _add_batch_output(O): + """ + Format the results to a batch output. + + Filter the mutalyzer output and reformat it for use in the batch system + as output object 'batchDone'. + + @arg O: The Output object + @type O: Modules.Output.Output + + @todo: More documentation. + """ + goi, toi = O.getOutput("geneSymbol")[-1] # Two strings [can be empty] + tList = [] # Temporary List + tDescr = [] # Temporary Descr + + reference = O.getOutput("reference")[-1] + recordType = O.getOutput("recordType")[0] + descriptions = O.getOutput("NewDescriptions") + #iName, jName, mType, cDescr, pDescr, gAcc, cAcc, pAcc, + #fullDescr, fullpDescr + + if len(descriptions) == 0: + #No descriptions generated [unlikely] + return + if O.Summary()[0]: + #There were errors during the run, return. 
def apply_substitution(position, original, substitute, mutator, record, O):
    """
    Do a semantic check for a substitution, do the actual substitution and
    give it a name. If the substitute equals the original, only a warning
    is issued and nothing is changed.

    @arg position: Genomic location of the substitution.
    @type position: int
    @arg original: Nucleotide in the reference sequence.
    @type original: string
    @arg substitute: Nucleotide in the mutated sequence.
    @type substitute: string
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output

    @raise _NotDNAError: Invalid (non-DNA) letter in input.
    """
    if not util.is_dna(substitute):
        O.addMessage(__file__, 4, 'ENODNA', 'Invalid letter in input')
        raise _NotDNAError()

    if original != substitute:
        # A real change: mutate first, then name the variant using the
        # nucleotide found in the original sequence.
        mutator.subM(position, substitute)
        record.name(position, position, 'subst',
                    mutator.orig[position - 1], substitute, None)
    else:
        warning = 'No mutation given (%c>%c) at position %i.' % (
            original, substitute, position)
        O.addMessage(__file__, 2, 'WNOCHANGE', warning)
#apply_substitution
(Effectively this always + # means we roll over two splice sites, since they are adjacent.) + # We only have to consider the forward roll, since RNA reference + # sequences are always orientated in correspondence with the transcript. + original_forward_roll = forward_roll + if record.record.molType != 'g': + # Todo: Do we assume .geneList[0].transcriptList[0] is the selected + # transcript here?? Why not use record.current_transcript? + splice_sites = record.record.geneList[0].transcriptList[0] \ + .mRNA.positionList + for acceptor, donor in util.grouper(splice_sites): + # Note that acceptor and donor splice sites both point to the + # first, respectively last, position of the exon, so they are + # both at different sides of the boundary. + if last < acceptor and last + forward_roll >= acceptor: + forward_roll = acceptor - 1 - last + break + if last <= donor and last + forward_roll > donor: + forward_roll = donor - last + break + + # Did we select a transcript on the reverse strand? + transcript = record.current_transcript() + reverse_strand = transcript and transcript.CM.orientation == -1 + + if forward_roll and not reverse_strand: + new_first = first + forward_roll + new_stop = last + forward_roll + O.addMessage(__file__, 2, 'WROLLFORWARD', + 'Sequence "%s" at position %s was given, however, ' \ + 'the HGVS notation prescribes that on the forward strand ' \ + 'it should be "%s" at position %s.' % ( + mutator.visualiseLargeString(str(mutator.orig[first - 1:last])), + util.format_range(first, last), + mutator.visualiseLargeString(str(mutator.orig[new_first - 1:new_stop])), + util.format_range(new_first, new_stop))) + + if forward_roll != original_forward_roll and not reverse_strand: + # The original roll was decreased because it crossed a splice site. 
def apply_inversion(first, last, mutator, record, O):
    """
    Do a semantic check for an inversion, do the actual inversion, and give
    it a name.

    A sequence that is its own reverse complement is left untouched (only a
    warning is issued). If only the flanks of the sequence are each others
    reverse complement, the inversion is narrowed to the part in between.
    An inversion of a single nucleotide is named as a substitution.

    @arg first: Genomic start position of the inversion.
    @type first: int
    @arg last: Genomic end position of the inversion.
    @type last: int
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
    """
    snoop = util.palinsnoop(mutator.orig[first - 1:last])

    if snoop == -1:
        # The entire selected sequence is a 'palindrome', so inverting it
        # changes nothing.
        O.addMessage(__file__, 2, 'WNOCHANGE',
            'Sequence "%s" at position %i_%i is a palindrome ' \
            '(its own reverse complement).' % (
            mutator.visualiseLargeString(str(mutator.orig[first - 1:last])),
            first, last))
        return

    if snoop:
        # We have a reverse-complement-palindromic prefix of length snoop,
        # drop it (and the matching suffix) from the range.
        minimal_first = first + snoop
        minimal_last = last - snoop
        O.addMessage(__file__, 2, 'WNOTMINIMAL',
            'Sequence "%s" at position %i_%i is a partial ' \
            'palindrome (the first %i nucleotide(s) are the reverse ' \
            'complement of the last one(s)), the HGVS notation ' \
            'prescribes that it should be "%s" at position %i_%i.' % (
            mutator.visualiseLargeString(str(mutator.orig[first - 1:last])),
            first, last, snoop,
            mutator.visualiseLargeString(
                str(mutator.orig[minimal_first - 1:minimal_last])),
            minimal_first, minimal_last))
        first, last = minimal_first, minimal_last

    mutator.invM(first, last)

    if first != last:
        record.name(first, last, 'inv', '', '', None)
        return

    # Inverting one nucleotide is the same as substituting it with its
    # complement.
    O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \
        '%i is actually a substitution.' % first)
    record.name(first, first, 'subst', mutator.orig[first - 1],
        Bio.Seq.reverse_complement(mutator.orig[first - 1]), None)
#apply_inversion
+ @type O: Modules.Output.Output + + @raise _NotDNAError: Invalid (non-DNA) letter in input. + @raise _PositionsNotConsecutiveError: Positions {before} and {after} are + not consecutive. + """ + if before + 1 != after: + O.addMessage(__file__, 3, 'EINSRANGE', + '%i and %i are not consecutive positions.' % (before, after)) + raise _PositionsNotConsecutiveError() + + if not s or not util.is_dna(s): + O.addMessage(__file__, 3, 'EUNKVAR', 'Although the syntax of this ' \ + 'variant is correct, the effect can not be analysed.') + raise _NotDNAError() + + insertion_length = len(s) + + # We don't go through the trouble of visualising the *corrected* variant + # and are happy with visualising what the user gave us. + mutator.insM(before, s) + + new_before = mutator.shiftpos(before) + new_stop = mutator.shiftpos(before) + insertion_length + + reverse_roll, forward_roll = util.roll(mutator.mutated, new_before + 1, new_stop) + + # In the case of RNA, check if we roll over a splice site. If so, make + # the roll shorter, just up to the splice site. (Effectively this always + # means we roll over two splice sites, since they are adjacent.) + # We only have to consider the forward roll, since RNA reference + # sequences are always orientated in correspondence with the transcript. + original_forward_roll = forward_roll + if record.record.molType != 'g' : + splice_sites = record.record.geneList[0].transcriptList[0] \ + .mRNA.positionList + for acceptor, donor in util.grouper(splice_sites): + # Note that acceptor and donor splice sites both point to the + # first, respectively last, position of the exon, so they are + # both at different sides of the boundary. 
+ if new_stop < acceptor and new_stop + forward_roll >= acceptor: + forward_roll = acceptor - 1 - new_stop + break + if new_stop <= donor and new_stop + forward_roll > donor: + forward_roll = donor - new_stop + break + + if reverse_roll + forward_roll >= insertion_length: + # Todo: Could there also be a IROLLBACK message in this case? + O.addMessage(__file__, 2, 'WINSDUP', + 'Insertion of %s at position %i_%i was given, ' \ + 'however, the HGVS notation prescribes that it should be a ' \ + 'duplication of %s at position %i_%i.' % ( + s, before, before + 1, + mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + before + forward_roll, + before + forward_roll + insertion_length - 1)) + after += forward_roll - 1 + before = after - insertion_length + 1 + record.name(before, after, 'dup', '', '', + (reverse_roll + forward_roll - insertion_length, 0)) + return + + # Did we select a transcript on the reverse strand? + transcript = record.current_transcript() + reverse_strand = transcript and transcript.CM.orientation == -1 + + if forward_roll and not reverse_strand: + O.addMessage(__file__, 2, 'WROLLFORWARD', 'Insertion of %s at position ' \ + '%i_%i was given, however, the HGVS notation prescribes ' \ + 'that on the forward strand it should be an insertion of %s ' \ + 'at position %i_%i.' % ( + s, before, before + 1, + mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + new_before + forward_roll, new_before + forward_roll + 1)) + + if forward_roll != original_forward_roll and not reverse_strand: + # The original roll was decreased because it crossed a splice site. + O.addMessage(__file__, 1, 'IROLLBACK', + 'Insertion of %s at position %i_%i was not corrected to an ' \ + 'insertion of %s at position %i_%i, since they reside in ' \ + 'different exons.' 
% ( + s, before, before + 1, + mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll], + new_before + original_forward_roll, new_before + original_forward_roll + 1)) + + if reverse_roll and reverse_strand: + O.addMessage(__file__, 2, 'WROLLREVERSE', 'Insertion of %s at position ' \ + '%i_%i was given, however, the HGVS notation prescribes ' \ + 'that on the reverse strand it should be an insertion of %s ' \ + 'at position %i_%i.' % ( + s, before, before + 1, + mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll], + new_before - reverse_roll, (new_before - reverse_roll) + 1)) + + record.name(before, before + 1, 'ins', + mutator.mutated[new_before + forward_roll:new_stop + forward_roll], + '', (reverse_roll, forward_roll), + mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]) +#apply_insertion + + +def apply_delins(first, last, delete, insert, mutator, record, output): + """ + Do a semantic check for an delins, do the actual delins, and give + it a name. + + @arg first: Genomic start position of the delins. + @type first: int + @arg last: Genomic end position of the delins. + @type last: int + @arg delete: Sequence to delete (may be None, in which case it will be + constructed from the reference sequence). + @type delete: string + @arg insert: Sequence to insert. + @type insert: string + @arg mutator: A Mutator instance. + @type mutator: mutalyzer.mutator.Mutator + @arg record: A GenRecord object. + @type record: Modules.GenRecord.GenRecord + @arg output: The Output object. + @type output: Modules.Output.Output + """ + if not delete: + delete = mutator.orig[first - 1:last] + + if str(delete) == str(insert): + output.addMessage(__file__, 2, 'WNOCHANGE', + 'Sequence "%s" at position %i_%i is identical to ' \ + 'the variant.' 
% ( + mutator.visualiseLargeString(str(mutator.orig[first - 1:last])), + first, last)) + return + + delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert) + + if not len(delete_trimmed): + output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ + 'is actually an insertion.') + apply_insertion(first + lcp - 1, first + lcp, insert_trimmed, mutator, + record, output) + return + + if len(delete_trimmed) == 1 and len(insert_trimmed) == 1: + output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ + 'is actually a substitution.') + apply_substitution(first + lcp, delete_trimmed, insert_trimmed, + mutator, record, output) + return + + if not len(insert_trimmed): + output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ + 'is actually a deletion.') + apply_deletion_duplication(first + lcp, last - lcs, 'del', + mutator, record, output) + return + + if str(Bio.Seq.reverse_complement(delete_trimmed)) == insert_trimmed: + output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \ + 'is actually an inversion.') + apply_inversion(first + lcp, last - lcs, mutator, + record, output) + return + + if len(insert) != len(insert_trimmed): + output.addMessage(__file__, 2, 'WNOTMINIMAL', + 'Sequence "%s" at position %i_%i has the same prefix or ' \ + 'suffix as the inserted sequence "%s". The HGVS notation ' \ + 'prescribes that it should be "%s" at position %i_%i.' % ( + mutator.visualiseLargeString(str(mutator.orig[first - 1:last])), + first, last, insert, insert_trimmed, first + lcp, last - lcs)) + + mutator.delinsM(first + lcp, last - lcs, insert_trimmed) + + record.name(first + lcp, last - lcs, 'delins', insert_trimmed, '', None) +#apply_delins + + +def _get_offset(location, main_genomic, sites, output): + """ + Convert the offset coordinate in a location (from the Parser) to an + integer. + + @arg location: A location. 
+ @type location: pyparsing.ParseResults + @arg main_genomic: Genomic main position to which the offset belongs. + @type main_genomic: int + @arg sites: List of splice sites. + @type sites: list + @arg output: The Output object. + @type output: Modules.Output.Output + + @return: Integer representation of the offset coordinate. + @rtype: int + """ + if location.Offset : + if location.Offset == '?' : + try: + # Todo: If it removes CDS start, don't do protein translation. + # Todo: Wrt orientation, perhaps always go to splice site + # locations via the crossmapper... + # Todo: Also check if +? and -? are correctly used. + # Todo: Exactly centering might not be so nice, since the center + # might be closer to a neighbouring exon, making a+xxx from b-? + # and vice versa. This might not be fixed directly by doing a + # center +/- 1 because there might be rolling. Ideally we + # disable rolling entirely for these positions... + # + # Note that the code below might be a bit confusing, especially + # considering reverse strand transcripts. Magically, it works + # for both orientations. + i = sites.index(main_genomic) + if i == 0: + # Before first exon (or last on the reverse strand). + offset = main_genomic / 2 + elif i == len(sites) - 1: + # After last exon (or first on the reverse strand). + # Todo: Get length of reference, and calculate a sensible + # offset. + # + # We now use that 2000 is the default downstream length, + # but of course this is bogus on the reverse strand and + # just a hack anyway. + offset = 1000 + elif i % 2 == 0: + # Acceptor site (or donor on the reverse strand). + offset = abs(main_genomic - sites[i - 1]) / 2 - 1 + else: + # Donor site (or acceptor on the reverse strand). + offset = abs(sites[i + 1] - main_genomic) / 2 - 1 + # Todo: We would like to use the c. position in this message. + output.addMessage(__file__, 1, "IUNKNOWNOFFSET", "Unknown offset " \ + "relative to %s interpreted as middle of " \ + "flanking intron." 
% main_genomic) + except ValueError: + # Todo: This means we don't get an error if the main position + # was not on an exon boundary. We should return something else + # than 0 I guess. + #return 0 # This is highly debatable. + # Any non-zero value will do. + return 1 + else: + offset = int(location.Offset) + if location.OffSgn == '-' : + return -offset + return offset + + return 0 +#_get_offset + + +def _intronic_to_genomic(location, transcript): + """ + Get genomic location from IVS location. + + @arg location: A location. + @type location: pyparsing.ParseResults + @arg transcript: todo + @type transcript: todo + + @return: Genomic location represented by given IVS location. + @rtype: int + + @raise _InvalidIntronError: Intron does not exist. + """ + ivs_number = int(location.IVSNumber) + + if ivs_number < 1 or ivs_number > transcript.CM.numberOfIntrons(): + raise _InvalidIntronError(ivs_number) + + if location.OffSgn == '+': + return transcript.CM.getSpliceSite(ivs_number * 2 - 1) + \ + transcript.CM.orientation * int(location.Offset) + else: + return transcript.CM.getSpliceSite(ivs_number * 2) - \ + transcript.CM.orientation * int(location.Offset) +#_intronic_to_genomic + + +def _exonic_to_genomic(location, transcript) : + """ + Get genomic range from EX location. + + @arg location: A location. + @type location: pyparsing.ParseResults + @arg transcript: todo + @type transcript: todo + + @return: A tuple of: + - first: Genomic start location represented by given EX location. + - last: Genomic end location represented by given EX location. + @rtype: tuple(int, int) + + @raise _InvalidExonError: Exon does not exist. + + @todo: We probably want to treat this as a-?_b+?, so take the centers of + flanking exons. 
+ """ + first_exon = int(location.EXNumberStart) + if first_exon < 1 or first_exon > transcript.CM.numberOfExons(): + raise _InvalidExonError(first_exon) + first = transcript.CM.getSpliceSite(first_exon * 2 - 2) + + if location.EXNumberStop: + last_exon = int(location.EXNumberStop) + if last_exon < 1 or last_exon > transcript.CM.numberOfExons(): + raise _InvalidExonError(last_exon) + last = transcript.CM.getSpliceSite(last_exon * 2 - 1) + else: + last = transcript.CM.getSpliceSite(first_exon * 2 - 1) + + return first, last +#_exonic_to_genomic + + +def _genomic_to_genomic(first_location, last_location): + """ + Get genomic range from parsed genomic location. + + @arg first_location: The start location (g.) of the variant. + @type first_location: pyparsing.ParseResults + @arg last_location: The start location (g.) of the variant. + @type last_location: pyparsing.ParseResults + + @return: A tuple of: + - first: Genomic start location represented by given location. + - last: Genomic end location represented by given location. + @rtype: tuple(int, int) + + @raise _UnknownPositionError: Unknown positions were used. + @raise _RawVariantError: Range cannot be intepreted. + """ + if not first_location.Main or not last_location.Main: + # Unknown positions are denoted by the '?' character. + raise _UnknownPositionError() + + if not first_location.Main.isdigit() or not last_location.Main.isdigit(): + raise _RawVariantError() + + first = int(first_location.Main) + last = int(last_location.Main) + + return first, last + + +def _coding_to_genomic(first_location, last_location, transcript, output): + """ + Get genomic range from parsed c. location. + + @arg first_location: The start location (c.) of the variant. + @type first_location: pyparsing.ParseResults + @arg last_location: The start location (c.) of the variant. + @type last_location: pyparsing.ParseResults + @arg transcript: todo + @type transcript: todo + @arg output: The Output object. 
+ @type output: Modules.Output.Output + + @return: A tuple of: + - first: Genomic start location represented by given location. + - last: Genomic end location represented by given location. + @rtype: tuple(int, int) + + @raise _UnknownPositionError: Unknown positions were used. + @raise _RawVariantError: Range cannot be interpreted. + @raise _OffsetNotFromBoundary: An offset from a non-boundary position + was used. + @raise _OffsetSignError: Offset from exon boundary has the wrong sign. + """ + if not first_location.Main or not last_location.Main: + # Unknown positions are denoted by the '?' character. + raise _UnknownPositionError() + + if not first_location.Main.isdigit() or not last_location.Main.isdigit(): + raise _RawVariantError() + + first_main = transcript.CM.main2int(first_location.MainSgn + \ + first_location.Main) + first_main_genomic = transcript.CM.x2g(first_main, 0) + first_offset = _get_offset(first_location, first_main_genomic, + transcript.CM.RNA, output) + + last_main = transcript.CM.main2int(last_location.MainSgn + \ + last_location.Main) + last_main_genomic = transcript.CM.x2g(last_main, 0) + last_offset = _get_offset(last_location, last_main_genomic, + transcript.CM.RNA, output) + + # These raise _RawVariantError exceptions on invalid positions. + _check_intronic_position(first_main, first_offset, transcript) + _check_intronic_position(last_main, last_offset, transcript) + + first = transcript.CM.x2g(first_main, first_offset) + last = transcript.CM.x2g(last_main, last_offset) + + if transcript.CM.orientation == -1: + first, last = last, first + + return first, last +#_coding_to_genomic + + +def process_raw_variant(mutator, variant, record, transcript, output): + """ + Process a raw variant. + + We raise _RawVariantError if there was something seriously in error + with the raw variant (but it is still okay to process other raw + variants). We might (don't at the moment) also raise _VariantError, + meaning to stop processing the entire variant. 
+ + @arg mutator: A Mutator instance. + @type mutator: mutalyzer.mutator.Mutator + @arg variant: A parsed raw (simple, noncompound) variant. + @type variant: pyparsing.ParseResults + @arg record: A GenRecord object. + @type record: Modules.GenRecord.GenRecord + @arg transcript: A transcript object. + @type transcript: Modules.GenRecord.Locus + @arg output: The Output object. + @type output: Modules.Output.Output + + @raise _RawVariantError: Cannot process this raw variant. + @raise _VariantError: Cannot further process the entire variant. + """ + # {argument} may be a number, or a subsequence of the reference. + # {sequence} is the variant subsequence. + argument = variant.Arg1 + sequence = variant.Arg2 + + # If we are on the reverse strand, subsequences must be in reverse + # complement. + if transcript and transcript.CM.orientation == -1: + sequence = Bio.Seq.reverse_complement(sequence) + if util.is_dna(argument): + argument = Bio.Seq.reverse_complement(argument) + + # Get genomic first and last positions for this variant. Below we handle + # the different ways of describing these positions. + + if variant.EXLoc: + # EX positioning. + try: + first, last = _exonic_to_genomic(variant.EXLoc, transcript) + except _InvalidExonError as e: + output.addMessage(__file__, 4, 'EINVALIDEXON', + 'Non-existing exon number %d given.' % e.exon) + raise + if last < first: + # Todo: Why could this ever happen? + first, last = last, first + + elif not variant.StartLoc: + # All non-EX positioning ways need a start location. + # Todo: Better message. + output.addMessage(__file__, 4, 'EUNKNOWN', + 'An unknown error occurred.') + raise _RawVariantError() + + elif variant.StartLoc.IVSLoc: + # IVS positioning. 
+ if record.record.molType != 'g': + output.addMessage(__file__, 3, 'ENOINTRON', 'Intronic ' \ + 'position given for a non-genomic reference sequence.') + raise _RawVariantError() + try: + first = last = _intronic_to_genomic(variant.StartLoc.IVSLoc, + transcript) + except _InvalidIntronError as e: + output.addMessage(__file__, 4, 'EINVALIDINTRON', + 'Non-existing intron number %d given.' % \ + e.intron) + raise + if variant.EndLoc and variant.EndLoc.IVSLoc: + try: + last = _intronic_to_genomic(variant.EndLoc.IVSLoc, transcript) + except _InvalidIntronError as e: + output.addMessage(__file__, 4, 'EINVALIDINTRON', + 'Non-existing intron number %d given.' % \ + e.intron) + raise + if last < first: + # Todo: Why could this ever happen? + first, last = last, first + + else: + # Genomic or coding positioning. + if record.record.molType != 'g' and \ + (_is_coding_intronic(variant.StartLoc) or + _is_coding_intronic(variant.EndLoc)): + output.addMessage(__file__, 3, 'ENOINTRON', 'Intronic ' \ + 'position given for a non-genomic reference sequence.') + raise _RawVariantError() + + first_location = last_location = variant.StartLoc.PtLoc + if variant.EndLoc: + last_location = variant.EndLoc.PtLoc + + # Todo: Check these error messages for minus strand variants etc. + try: + if transcript: + # Coding positioning. + first, last = _coding_to_genomic(first_location, last_location, + transcript, output) + else: + # Genomic positioning. + first, last = _genomic_to_genomic(first_location, last_location) + except _UnknownPositionError: + output.addMessage(__file__, 4, 'EUNKNOWN', + 'Unknown positions (denoted by "?") are ' \ + 'not supported.') + raise + except _OffsetSignError as e: + output.addMessage(__file__, 3, 'EOFFSETSIGN', 'Offset %s from ' \ + 'position %s is in %s direction but should ' \ + 'be in %s direction.' 
% \ + (e.offset, e.main, + 'downstream' if e.acceptor else 'upstream', + 'upstream' if e.acceptor else 'downstream')) + raise + except _OffsetNotFromBoundaryError as e: + output.addMessage(__file__, 3, 'EOFFSETFROMBOUNDARY', + 'Offset may not be from position %s because ' \ + ' this is not an exon boundary.' % e.main) + raise + except _RawVariantError: + # Todo: What does this situation really mean? I don't think + # this is the right message. + #output.addMessage(__file__, 3, 'ESPLICE', 'Invalid intronic ' \ + # 'position given.') + output.addMessage(__file__, 4, 'EPOSITION', + 'Could not determine position.') + raise + + if last < first: + output.addMessage(__file__, 3, 'ERANGE', 'End position is smaller than ' \ + 'the begin position.') + raise _RawVariantError() + + if first < 1: + output.addMessage(__file__, 4, 'ERANGE', 'Position %i is out of range.' % + first) + raise _RawVariantError() + + if last > len(mutator.orig): + output.addMessage(__file__, 4, 'ERANGE', 'Position %s is out of range.' % + last) + raise _RawVariantError() + + splice_abort = False + + # If we hit a splice site, issue a warning. Later on we decide if we + # can still process this variant in any way (e.g. if it deletes an + # entire exon). + if transcript and util.over_splice_site(first, last, transcript.CM.RNA): + splice_abort = True + output.addMessage(__file__, 2, 'WOVERSPLICE', + 'Variant hits one or more splice sites in ' \ + 'selected transcript.') + + # If we have a deletion, and it covers exactly an even number of splice + # sites, remove these splice sites. + # Note, this is not the same as util.over_splice_site(). Here we collect + # sites where the deletion borders the exon/intron boundary. + # Todo: Special cases for first/last exon? Upstream/downstream exons? 
+ # Todo: This still goes horribly wrong in some cases, example: + # NM_000088.3(COL1A1_v001):c.588del + if transcript and variant.MutationType == 'del': + removed_sites = [] + for acceptor, donor in util.grouper(transcript.CM.RNA): + + # If we have introns, we match splice sites in a fuzzy way. This + # Means that in the case of + # + # a b + # ===========------------============= + # + # with splice sites a and b, a deletion a+1_b-1 of the entire + # intron gets treated as a deletion of both splice sites. + # + # We don't want this behaviour on e.g. RNA, where we only have + # exons. In the case of + # + # a b c d + # ========== ============= =========== + # + # with splice sites a b c d, a deletion b_c of the middle exon + # should only remove splice sites b and c, not a and d. + if record.record.molType == 'g': + fuzzy = 1 + else: + fuzzy = 0 + + if first <= acceptor <= last + fuzzy: + removed_sites.append(acceptor) + if first - fuzzy <= donor <= last: + removed_sites.append(donor) + + if len(removed_sites) and not len(removed_sites) % 2: + # An even number of splice sites was removed. We can deal with + # this, but issue a warning. + # However, don't do this trick if we end up removing an odd number + # of sites from the CDS. + # Todo: We might cripple the start codon, fix the translation code + # (further down) to deal with this. + # Todo: Bit unrelated, but find out the difference between + # - transcript.CM.RNA + # - transcript.mRNA.positionList + # and what we should use (likewise for CDS). + removed_cds_sites = filter(lambda s: s in transcript.CDS.positionList, + removed_sites) if transcript.CDS else [] + if not len(removed_cds_sites) % 2: + # Todo: splice_abort=False undoes the warning (sort of), but + # the warning might (also) be about other sites... + splice_abort = False + mutator.add_removed_sites(removed_sites) + output.addMessage(__file__, 1, 'IDELSPLICE', + 'Removed %i splice sites from selected ' \ + 'transcript.' 
% len(removed_sites)) + # This is primarily for use in unittests. + output.addOutput('removedSpliceSites', len(removed_sites)) + + # If splice_abort is set, this basically means WOVERSPLICE was called and + # IDELSPLICE was not called. + # I guess in that case we do want to generate the visualisation, the + # genomic description, and affected transcripts. But NOT the predicted + # protein. + # The following solution is a bit of a hack. By setting the .translate + # field of the transcript to False, we force that no protein is predicted. + if splice_abort: + transcript.translate = False + + # The following functions can raise _RawVariantError exceptions, but we + # just let them flow through to the caller. + + # Check if the (optional) argument is valid. + if variant.MutationType in ['del', 'dup', 'subst', 'delins']: + _check_argument(argument, mutator.orig, first, last, output) + + # Substitution. + if variant.MutationType == 'subst': + apply_substitution(first, argument, sequence, mutator, record, output) + + # Deletion or duplication. + if variant.MutationType in ['del', 'dup']: + # The fuzzy flags are to support deletions of the form c.a-?_b+?del. + first_fuzzy = variant.StartLoc.PtLoc.Offset == '?' + last_fuzzy = variant.EndLoc and variant.EndLoc.PtLoc.Offset == '?' + apply_deletion_duplication(first, last, variant.MutationType, mutator, + record, output, first_fuzzy=first_fuzzy, + last_fuzzy=last_fuzzy) + + # Inversion. + if variant.MutationType == 'inv': + apply_inversion(first, last, mutator, record, output) + + # Insertion. + if variant.MutationType == 'ins': + # Check if the inserted sequence is not a range. + # Todo: Implement this feature. + if not argument: + output.addMessage(__file__, 4, 'ENOTIMPLEMENTED', + 'Insertion of a range is not implemented yet.') + raise _RangeInsertionError() + apply_insertion(first, last, argument, mutator, record, output) + + # DelIns. 
+ if variant.MutationType == 'delins': + # Check if the inserted sequence is not a range. + # Todo: Implement this feature. + if not sequence: + output.addMessage(__file__, 4, 'ENOTIMPLEMENTED', + 'Insertion of a range is not implemented yet.') + raise _RangeInsertionError() + apply_delins(first, last, argument, sequence, mutator, record, output) +#process_raw_variant + + +def _add_static_transcript_info(transcript, output): + """ + Add static (unrelated to the variant) transcript-specific information to + the {output} object. + + @arg transcript: A transcript object. + @type transcript: Modules.GenRecord.Locus + @arg output: The Output object. + @type output: Modules.Output.Output + """ + output.addOutput('hasTranscriptInfo', True) + + # Add exon table to output. + for i in range(0, transcript.CM.numberOfExons() * 2, 2): + acceptor = transcript.CM.getSpliceSite(i) + donor = transcript.CM.getSpliceSite(i + 1) + output.addOutput('exonInfo', [acceptor, donor, + transcript.CM.g2c(acceptor), + transcript.CM.g2c(donor)]) + + # Add CDS info to output. + cds_stop = transcript.CM.info()[2] + output.addOutput('cdsStart_g', transcript.CM.x2g(1, 0)) + output.addOutput('cdsStart_c', 1) + output.addOutput('cdsStop_g', transcript.CM.x2g(cds_stop, 0)) + output.addOutput('cdsStop_c', cds_stop) + + # Is this transcript coding? + # Example of non-coding transcript variant: + # AL449423.14(CDKN2A_v004):n.42_437del + output.addOutput('transcriptCoding', bool(transcript.CM.CDS)) + + # Is this transcript on the reverse strand? + output.addOutput('transcriptReverse', transcript.CM.orientation == -1) + + +def _add_transcript_info(mutator, transcript, output): + """ + Add transcript-specific information (including protein prediction) to + the {output} object. + + @arg mutator: A Mutator instance. + @type mutator: mutalyzer.mutator.Mutator + @arg transcript: A transcript object. + @type transcript: Modules.GenRecord.Locus + @arg output: The Output object. 
+ @type output: Modules.Output.Output + + @todo: Documentation. + @todo: Don't generate the fancy HTML protein descriptions here. + @todo: Add mutated transcript and CDS info. + """ + # Add transcript info to output. + if transcript.transcribe: + output.addOutput('myTranscriptDescription', transcript.description) + output.addOutput('origMRNA', + str(util.splice(mutator.orig, transcript.mRNA.positionList))) + output.addOutput('mutatedMRNA', + str(util.splice(mutator.mutated, + mutator.newSplice(transcript.mRNA.positionList)))) + + # Add protein prediction to output. + if transcript.translate: + cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), + IUPAC.unambiguous_dna) + cds_variant = Seq(str(util.__nsplice(mutator.mutated, + mutator.newSplice(transcript.mRNA.positionList), + mutator.newSplice(transcript.CDS.location), + transcript.CM.orientation)), + IUPAC.unambiguous_dna) + + #output.addOutput('origCDS', cds_original) + + if transcript.CM.orientation == -1: + cds_original = Bio.Seq.reverse_complement(cds_original) + cds_variant = Bio.Seq.reverse_complement(cds_variant) + + if not util.is_dna(cds_original): + output.addMessage(__file__, 4, 'ENODNA', + 'Invalid letters in reference sequence.') + return + + if '*' in cds_original.translate(table=transcript.txTable)[:-1]: + output.addMessage(__file__, 3, 'ESTOP', + 'In frame stop codon found.') + return + + protein_original = cds_original.translate(table=transcript.txTable, + to_stop=True) + protein_variant = cds_variant.translate(table=transcript.txTable, + to_stop=True) + + # Note: addOutput('origCDS', ...) was first before the possible + # reverse complement operation above. + output.addOutput('origCDS', cds_original) + output.addOutput("newCDS", cds_variant[:(len(str(protein_variant)) + 1) * 3]) + + output.addOutput('oldprotein', protein_original + '*') + + # Todo: Don't generate the fancy HTML protein views here, do this in + # website.py. 
+ # I think it would also be nice to include the mutated list of splice + # sites. + if not protein_variant or protein_variant[0] != 'M': + # Todo: Protein differences are not color-coded, + # use something like below in protein_description(). + util.print_protein_html(protein_original + '*', 0, 0, output, + 'oldProteinFancy') + if str(cds_variant[0:3]) in \ + Bio.Data.CodonTable.unambiguous_dna_by_id \ + [transcript.txTable].start_codons: + output.addOutput('newprotein', '?') + util.print_protein_html('?', 0, 0, output, 'newProteinFancy') + output.addOutput('altStart', str(cds_variant[0:3])) + if str(protein_original[1:]) != str(protein_variant[1:]): + output.addOutput('altProtein', + 'M' + protein_variant[1:] + '*') + util.print_protein_html('M' + protein_variant[1:] + '*', 0, 0, + output, 'altProteinFancy') + else : + output.addOutput('newprotein', '?') + util.print_protein_html('?', 0, 0, output, 'newProteinFancy') + + else: + cds_length = util.cds_length( + mutator.newSplice(transcript.CDS.positionList)) + descr, first, last_original, last_variant = \ + util.protein_description(cds_length, protein_original, + protein_variant) + + # This is never used. + output.addOutput('myProteinDescription', descr) + + util.print_protein_html(protein_original + '*', first, last_original, + output, 'oldProteinFancy') + if str(protein_original) != str(protein_variant): + output.addOutput('newprotein', protein_variant + '*') + util.print_protein_html(protein_variant + '*', first, last_variant, + output, 'newProteinFancy') +#_add_transcript_info + + +def process_variant(mutator, description, record, output): + """ + @arg mutator: A Mutator instance. + @type mutator: mutalyzer.mutator.Mutator + @arg description: Parsed HGVS variant description. + @type description: pyparsing.ParseResults + @arg record: A GenRecord object. + @type record: Modules.GenRecord.GenRecord + @arg output: The Output object. 
+ @type output: Modules.Output.Output + + @raise _VariantError: Cannot process this variant. + + @todo: Documentation. + """ + if not description.RawVar and not description.SingleAlleleVarSet: + output.addMessage(__file__, 4, 'ENOVARIANT', + 'Variant description contains no mutation.') + raise _VariantError() + + if description.RefType == 'r': + output.addMessage(__file__, 4, 'ERNA', + 'Descriptions on RNA level are not supported.') + raise _VariantError() + + transcript = None + + if description.RefType in ['c', 'n']: + + gene = None + gene_symbol, transcript_id = output.getOutput('geneSymbol')[-1] + + if description.LrgAcc: + # LRG case, pick the top gene. + gene = record.record.geneList[0] + if transcript_id: + transcript = gene.findLocus(transcript_id) + if not transcript: + # Todo: Incorrect error message, it might also be that + # there are no transcripts at all (e.g. N4BP2L1 on + # NG_012772.1). + output.addMessage(__file__, 4, "ENOTRANSCRIPT", + "Multiple transcripts found for gene %s. Please " \ + "choose from: %s" %(gene.name, + ", ".join(gene.listLoci()))) + else: + # No transcript id given. + if len(gene.transcriptList) == 1: + # No transcript given, only 1 found, pick that. + transcript = gene.transcriptList[0] + else: + output.addMessage(__file__, 4, "ENOTRANSCRIPT", + "No transcript given for gene %s. Please " \ + "choose from: %s" %(gene.name, + ", ".join(gene.listLoci()))) + + else: + # Not an LRG, find our gene manually. + genes = record.record.listGenes() + transcript_id = transcript_id and "%.3i" % int(transcript_id) + + if gene_symbol in genes: + # We found our gene. + gene = record.record.findGene(gene_symbol) + elif (len(genes) == 1) and not(gene_symbol): + # No gene given and there is only one gene in the record. + # Todo: message? + gene = record.record.geneList[0] + else: + output.addMessage(__file__, 4, "EINVALIDGENE", + "Gene %s not found. Please choose from: %s" % ( + gene_symbol, ", ".join(genes))) + + if gene: + # Find transcript. 
+ transcripts = gene.listLoci() + if transcript_id in transcripts: + # Found our transcript. + transcript = gene.findLocus(transcript_id) + elif (len(transcripts) == 1) and not(transcript_id): + # No transcript given and there is only one transcript for + # this gene. + transcript = gene.transcriptList[0] + else: + # Todo: Incorrect error message, it might also be that + # there are no transcripts at all (e.g. N4BP2L1 on + # NG_012772.1). + output.addMessage(__file__, 4, "ENOTRANSCRIPT", + "Multiple transcripts found for gene %s. Please " \ + "choose from: %s" %(gene.name, + ", ".join(gene.listLoci()))) + + # Add selected gene symbol to output + output.addOutput('geneSymbol', (gene and gene.name or '', + transcript and transcript.name or '')) + + # Return if no transcript is selected + if not transcript: + # Skip all BatchJobs with the same preColon data. + output.addOutput('BatchFlags', + ('S2', output.getOutput('preColon')[-1])) + raise _VariantError() + elif not transcript.transcribe: + # Todo: Shouldn't we add some message here? + raise _VariantError() + + # Mark this as the current transcript we work with. + transcript.current = True + + # Add static transcript-specific information. + if transcript and record.record.geneList: + _add_static_transcript_info(transcript, output) + + # Now process all raw variants (or just the only one). The function + # process_raw_variant might raise a _VariantError exception. + if description.SingleAlleleVarSet: + for var in description.SingleAlleleVarSet: + try: + process_raw_variant(mutator, var.RawVar, record, transcript, + output) + except _RawVariantError: + #output.addMessage(__file__, 2, 'WSKIPRAWVARIANT', + # 'Ignoring raw variant "%s".' % var[0]) + output.addMessage(__file__, 1, 'IRAWVARIANTERROR', + 'Aborted variant check due to error in ' \ + 'raw variant "%s".' % var[0]) + raise + else: + process_raw_variant(mutator, description.RawVar, record, + transcript, output) + + # Add transcript-specific variant information. 
+ if transcript and record.record.geneList: + _add_transcript_info(mutator, transcript, output) +#process_variant + + +def check_variant(description, config, output): + """ + Check the variant described by {description} according to the HGVS variant + nomenclature and populate the {output} object with various information + about the variant and its reference sequence. + + @arg description: Variant description in HGVS notation. + @type description: string + @arg config: A configuration object. + @type config: Modules.Config.Config + @arg output: An output object. + @type output: Modules.Output.Output + + @todo: Documentation. + @todo: Raise exceptions on failure instead of just return. + """ + output.addOutput('inputvariant', description) + + grammar = Grammar(output) + parsed_description = grammar.parse(description) + + if not parsed_description: + # Parsing went wrong. + return + + # Add GeneSymbol and Transcript Var to the Output object for batch. + if parsed_description.Gene: + output.addOutput('geneOfInterest', + dict(parsed_description.Gene.items())) + else: + output.addOutput('geneOfInterest', dict()) + + if parsed_description.Version: + record_id = parsed_description.RefSeqAcc + '.' 
+ parsed_description.Version + else: + record_id = parsed_description.RefSeqAcc + + gene_symbol = transcript_id = '' + + database = Db.Cache(config.Db) + if parsed_description.LrgAcc: + filetype = 'LRG' + record_id = parsed_description.LrgAcc + transcript_id = parsed_description.LRGTranscriptID + retriever = Retriever.LRGRetriever(config.Retriever, output, database) + else: + filetype = 'GB' + if parsed_description.Gene: + gene_symbol = parsed_description.Gene.GeneSymbol or '' + transcript_id = parsed_description.Gene.TransVar or '' + if parsed_description.Gene.ProtIso: + output.addMessage(__file__, 4, 'EPROT', 'Indexing by ' \ + 'protein isoform is not supported.') + retriever = Retriever.GenBankRetriever(config.Retriever, output, + database) + + retrieved_record = retriever.loadrecord(record_id) + + if not retrieved_record: + return + + # Add recordType to output for output formatting. + output.addOutput('recordType', filetype) + + output.addOutput('reference', record_id) + + # Note: geneSymbol[0] is used as a filter for batch runs. + output.addOutput('geneSymbol', (gene_symbol, transcript_id)) + + # Note: preColon is used to filter out Batch entries that will result in + # identical errors. + output.addOutput('preColon', description.split(':')[0]) + output.addOutput('variant', description.split(':')[-1]) + + record = GenRecord.GenRecord(output, config.GenRecord) + record.record = retrieved_record + record.checkRecord() + + # Create the legend. 
+ for gene in record.record.geneList: + for transcript in sorted(gene.transcriptList, key=attrgetter('name')): + if not transcript.name: + continue + output.addOutput('legends', + ['%s_v%s' % (gene.name, transcript.name), + transcript.transcriptID, transcript.locusTag, + transcript.transcriptProduct, + transcript.linkMethod]) + if transcript.translate: + output.addOutput('legends', + ['%s_i%s' % (gene.name, transcript.name), + transcript.proteinID, transcript.locusTag, + transcript.proteinProduct, + transcript.linkMethod]) + + # Note: The GenRecord instance is carrying the sequence in .record.seq. + # So is the Mutator instance in .mutator.orig. + + mutator = Mutator(record.record.seq, config.Mutator, output) + + # Todo: If processing of the variant fails, we might still want to show + # information about the record, gene, transcript. + + try: + process_variant(mutator, parsed_description, record, output) + except _VariantError: + return + + output.addOutput('original', str(mutator.orig)) + output.addOutput('mutated', str(mutator.mutated)) + + # Protein. + for gene in record.record.geneList: + for transcript in gene.transcriptList: + + if not (transcript.CDS and transcript.translate) \ + or ';' in transcript.description \ + or transcript.description == '?': + # Default value is '?', but later on we don't prefix a 'p.' + # string, so we include it here. If there's no good reason + # for this, I think we should only add the 'p.' later (so + # __toProtDescr should also not add it). + transcript.proteinDescription = 'p.?' 
+ continue + + cds_original = Seq(str(util.splice(mutator.orig, transcript.CDS.positionList)), + IUPAC.unambiguous_dna) + cds_variant = Seq(str(util.__nsplice(mutator.mutated, + mutator.newSplice(transcript.mRNA.positionList), + mutator.newSplice(transcript.CDS.location), + transcript.CM.orientation)), + IUPAC.unambiguous_dna) + + if transcript.CM.orientation == -1: + cds_original = Bio.Seq.reverse_complement(cds_original) + cds_variant = Bio.Seq.reverse_complement(cds_variant) + + #if '*' in cds_original.translate()[:-1]: + # output.addMessage(__file__, 3, "ESTOP", + # "In frame stop codon found.") + # return + ##if + + # Todo: Figure out if this is all ok, even if the CDS stop is + # somehow removed, if the sequence is really short, etc. + + if not len(cds_original) % 3: + try: + # FIXME this is a bit of a rancid fix. + protein_original = cds_original.translate(table=transcript.txTable, + cds=True, + to_stop=True) + except Bio.Data.CodonTable.TranslationError: + output.addMessage(__file__, 4, "ETRANS", "Original " \ + "CDS could not be translated.") + return + protein_variant = cds_variant.translate(table=transcript.txTable, + to_stop=True) + try: + cds_length = util.cds_length( + mutator.newSplice(transcript.CDS.positionList)) + transcript.proteinDescription = util.protein_description( + cds_length, protein_original, protein_variant)[0] + except IndexError: + # Todo: Probably CDS start was hit by removal of exon... + transcript.proteinDescription = 'p.?' + + else: + output.addMessage(__file__, 2, "ECDS", "CDS length is " \ + "not a multiple of three in gene %s, transcript " \ + "variant %s." % (gene.name, transcript.name)) + transcript.proteinDescription = 'p.?' 
+ + reference = output.getOutput('reference')[-1] + if ';' in record.record.description: + generated_description = '[' + record.record.description + ']' + else: + generated_description = record.record.description + + output.addOutput('genomicDescription', '%s:%c.%s' % \ + (reference, record.record.molType, generated_description)) + output.addOutput('gDescription', '%c.%s' % \ + (record.record.molType, generated_description)) + output.addOutput('molType', record.record.molType) + + if record.record.chromOffset: + if ';' in record.record.chromDescription: + chromosomal_description = '[' + record.record.chromDescription + ']' + else: + chromosomal_description = record.record.chromDescription + output.addOutput('genomicChromDescription', '%s:%c.%s' % \ + (record.record.recordId, + record.record.molType, chromosomal_description)) + + # Now we add variant descriptions for all transcripts, including protein + # level descriptions. + for gene in record.record.geneList: + for transcript in sorted(gene.transcriptList, key=attrgetter('name')): + + # Note: I don't think genomic_id is ever used, because it is + # always ''. 
+ coding_description = '' + protein_description = '' + full_description = '' + full_protein_description = '' + genomic_id = coding_id = protein_id = '' + + if ';' in transcript.description: + generated_description = '[' + transcript.description + ']' + else: + generated_description = transcript.description + + if record.record._sourcetype == 'LRG': + if transcript.name: + full_description = '%st%s:%c.%s' % \ + (reference, transcript.name, + transcript.molType, + generated_description) + output.addOutput('descriptions', full_description) + else: + output.addOutput('descriptions', gene.name) + else: + full_description = '%s(%s_v%s):%c.%s' % \ + (reference, gene.name, transcript.name, + transcript.molType, + generated_description) + output.addOutput('descriptions', full_description) + + if transcript.molType == 'c': + coding_description = 'c.%s' % generated_description + protein_description = transcript.proteinDescription + if record.record._sourcetype == 'LRG': + full_protein_description = '%sp%s:%s' % \ + (reference, transcript.name, + protein_description) + else: + full_protein_description = '%s(%s_i%s):%s' % \ + (reference, gene.name, + transcript.name, + protein_description) + + coding_id, protein_id = \ + transcript.transcriptID, transcript.proteinID + output.addOutput('protDescriptions', + full_protein_description) + + # The 'NewDescriptions' field is used in _add_batch_output. 
+ output.addOutput('NewDescriptions', + (gene.name, transcript.name, + transcript.molType, coding_description, + protein_description, genomic_id, coding_id, + protein_id, full_description, + full_protein_description)) + + _add_batch_output(output) +#check_variant diff --git a/src/webservice.py b/mutalyzer/webservice.py similarity index 74% rename from src/webservice.py rename to mutalyzer/webservice.py index 2e74230af43e742568c8617ae3948a1f5eb69a27..2feecd256ef36e4f5ba652eae8cc0192aa5b57a9 100644 --- a/src/webservice.py +++ b/mutalyzer/webservice.py @@ -1,19 +1,6 @@ -#!/usr/bin/env python - """ Mutalyzer webservices. -The SOAP webservice is exposed through a WSGI interface. - -Example Apache/mod_wsgi configuration: - - WSGIScriptAlias /services /var/www/mutalyzer/src/webservice.py - -Be sure to have this line first if you also define a / alias, like this: - - WSGIScriptAlias /services /var/www/mutalyzer/src/webservice.py - WSGIScriptAlias / /var/www/mutalyzer/src/wsgi.py - @todo: Do we really use namespaces correctly? @todo: For some reason, the server exposes its location including ?wsdl. @todo: More thourough input checking. The @soap decorator does not do any @@ -22,6 +9,7 @@ Be sure to have this line first if you also define a / alias, like this: use __checkBuild.) """ + # WSGI applications should never print anything to stdout. We redirect to # stderr, but eventually Mutalyzer should be fixed to never just 'print' # anything. 
@@ -32,47 +20,42 @@ sys.stdout = sys.stderr # Log exceptions to stdout import logging; logging.basicConfig() -# We now use very current soaplib: -# $ git clone https://github.com/soaplib/soaplib.git -# $ cd soaplib -# $ sudo python setup.py install - from soaplib.core import Application from soaplib.core.service import soap from soaplib.core.service import DefinitionBase -from soaplib.core.model.primitive import String, Integer, Boolean +from soaplib.core.model.primitive import String, Integer, Boolean, DateTime from soaplib.core.model.clazz import Array from soaplib.core.model.exception import Fault from soaplib.core.server import wsgi import os -import site +import socket from operator import itemgetter, attrgetter -# Add /src to Python path -site.addsitedir(os.path.dirname(__file__)) +import mutalyzer +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.grammar import Grammar +from mutalyzer.sync import CacheSync +from mutalyzer import variantchecker +from mutalyzer import Db +from mutalyzer.mapping import Converter +from mutalyzer import Retriever +from mutalyzer import GenRecord +from mutalyzer.models import * -# Todo: fix Mutalyzer to not depend on working directory -os.chdir(os.path.split(os.path.dirname(__file__))[0]) -import Mutalyzer -from Modules import Db -from Modules import Output -from Modules import Config -from Modules import Parser -from Modules import Mapper -from Modules import Retriever -from Modules import GenRecord -from Modules.Serializers import Mapping, Transcript, MutalyzerOutput, Mandatory, TranscriptNameInfo, CheckSyntaxOutput, SoapMessage, TranscriptInfo, ExonInfo, ProteinTranscript, RawVariant - - -class MutalyzerService(DefinitionBase) : +class MutalyzerService(DefinitionBase): """ Mutalyzer webservices. These methods are made public via a SOAP interface. 
""" + def __init__(self, environ=None): + self._config = Config() + super(MutalyzerService, self).__init__(environ) + #__init__ - def __checkBuild(self, L, build, config) : + def __checkBuild(self, L, build) : """ Check if the build is supported (hg18 or hg19). @@ -83,11 +66,9 @@ class MutalyzerService(DefinitionBase) : @type L: object @arg build: The human genome build name that needs to be checked. @type build: string - @arg config: Configuration object of the Db module. - @type config: object """ - if not build in config.dbNames : + if not build in self._config.Db.dbNames : L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -159,7 +140,7 @@ class MutalyzerService(DefinitionBase) : @soap(Mandatory.String, Mandatory.String, Mandatory.Integer, Boolean, _returns = Array(Mandatory.String)) - def getTranscripts(self, build, chrom, pos, versions = False) : + def getTranscripts(self, build, chrom, pos, versions=False) : """ Get all the transcripts that overlap with a chromosomal position. @@ -174,23 +155,21 @@ class MutalyzerService(DefinitionBase) : @arg chrom: A chromosome encoded as "chr1", ..., "chrY". @type chrom: string @arg pos: A position on the chromosome. - @type pos: integer - @arg versions: Also return version numbers. - @type versions: boolean + @type pos: int + @kwarg versions: If set to True, also include transcript versions. + @type versions: bool @return: A list of transcripts. 
@rtype: list """ - - C = Config.Config() - L = Output.Output(__file__, C.Output) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", - "Received request getTranscripts(%s %s %s)" % (build, - chrom, pos)) + "Received request getTranscripts(%s %s %s %s)" % (build, + chrom, pos, versions)) - self.__checkBuild(L, build, C.Db) - D = Db.Mapping(build, C.Db) + self.__checkBuild(L, build) + D = Db.Mapping(build, self._config.Db) self.__checkChrom(L, D, chrom) self.__checkPos(L, pos) @@ -198,37 +177,35 @@ class MutalyzerService(DefinitionBase) : ret = D.get_Transcripts(chrom, pos, pos, True) #filter out the accNo - if versions : + if versions: ret = [r[0] + '.' + str(r[-1]) for r in ret] - else : + else: ret = [r[0] for r in ret] L.addMessage(__file__, -1, "INFO", - "Finished processing getTranscripts(%s %s %s)" % (build, - chrom, pos)) + "Finished processing getTranscripts(%s %s %s %s)" % (build, + chrom, pos, versions)) L.addMessage(__file__, -1, "INFO", "We return %s" % ret) - del D, L, C + del D, L return ret #getTranscripts @soap(Mandatory.String, Mandatory.String, _returns = Array(Mandatory.String)) - def getTranscriptsByGeneName(self, build, name) : + def getTranscriptsByGeneName(self, build, name): """ Todo: documentation. """ - - C = Config.Config() - L = Output.Output(__file__, C.Output) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request getTranscriptsByGene(%s %s)" % (build, name)) - self.__checkBuild(L, build, C.Db) - D = Db.Mapping(build, C.Db) + self.__checkBuild(L, build) + D = Db.Mapping(build, self._config.Db) ret = D.get_TranscriptsByGeneName(name) @@ -261,16 +238,14 @@ class MutalyzerService(DefinitionBase) : @return: A list of transcripts. 
@rtype: list """ - - C = Config.Config() - L = Output.Output(__file__, C.Output) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request getTranscriptsRange(%s %s %s %s %s)" % (build, chrom, pos1, pos2, method)) - D = Db.Mapping(build, C.Db) - self.__checkBuild(L, build, C.Db) + D = Db.Mapping(build, self._config.Db) + self.__checkBuild(L, build) ret = D.get_Transcripts(chrom, pos1, pos2, method) @@ -281,7 +256,7 @@ class MutalyzerService(DefinitionBase) : "Finished processing getTranscriptsRange(%s %s %s %s %s)" % ( build, chrom, pos1, pos2, method)) - del D, L, C + del D, L return ret #getTranscriptsRange @@ -298,22 +273,20 @@ class MutalyzerService(DefinitionBase) : @return: The name of the associated gene. @rtype: string """ - - C = Config.Config() - L = Output.Output(__file__, C.Output) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request getGeneName(%s %s)" % (build, accno)) - D = Db.Mapping(build, C.Db) - self.__checkBuild(L, build, C.Db) + D = Db.Mapping(build, self._config.Db) + self.__checkBuild(L, build) ret = D.get_GeneName(accno.split('.')[0]) L.addMessage(__file__, -1, "INFO", "Finished processing getGeneName(%s %s)" % (build, accno)) - del D, L, C + del D, L return ret #getGeneName @@ -359,22 +332,20 @@ class MutalyzerService(DefinitionBase) : - type ; The mutation type. 
@rtype: object """ - - C = Config.Config() - L = Output.Output(__file__, C.Output) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Reveived request mappingInfo(%s %s %s %s)" % ( LOVD_ver, build, accNo, variant)) - conv = Mapper.Converter(build, C, L) + conv = Converter(build, self._config, L) result = conv.mainMapping(accNo, variant) L.addMessage(__file__, -1, "INFO", "Finished processing mappingInfo(%s %s %s %s)" % ( LOVD_ver, build, accNo, variant)) - del L, C + del L return result #mappingInfo @@ -399,15 +370,13 @@ class MutalyzerService(DefinitionBase) : - CDS_stop ; CDS stop in I{c.} notation. @rtype: object """ - - C = Config.Config() - O = Output.Output(__file__, C.Output) + O = Output(__file__, self._config.Output) O.addMessage(__file__, -1, "INFO", "Received request transcriptInfo(%s %s %s)" % (LOVD_ver, build, accNo)) - converter = Mapper.Converter(build, C, O) + converter = Converter(build, self._config, O) T = converter.mainTranscript(accNo) O.addMessage(__file__, -1, "INFO", @@ -429,14 +398,13 @@ class MutalyzerService(DefinitionBase) : @return: The accession number of a chromosome. @rtype: string """ - C = Config.Config() # Read the configuration file. - D = Db.Mapping(build, C.Db) - L = Output.Output(__file__, C.Output) + D = Db.Mapping(build, self._config.Db) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request chromAccession(%s %s)" % (build, name)) - self.__checkBuild(L, build, C.Db) + self.__checkBuild(L, build) self.__checkChrom(L, D, name) result = D.chromAcc(name) @@ -445,7 +413,7 @@ class MutalyzerService(DefinitionBase) : "Finished processing chromAccession(%s %s)" % (build, name)) - del D,L,C + del D,L return result #chromAccession @@ -462,14 +430,13 @@ class MutalyzerService(DefinitionBase) : @return: The name of a chromosome. @rtype: string """ - C = Config.Config() # Read the configuration file. 
- D = Db.Mapping(build, C.Db) - L = Output.Output(__file__, C.Output) + D = Db.Mapping(build, self._config.Db) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request chromName(%s %s)" % (build, accNo)) - self.__checkBuild(L, build, C.Db) + self.__checkBuild(L, build) # self.__checkChrom(L, D, name) result = D.chromName(accNo) @@ -478,7 +445,7 @@ class MutalyzerService(DefinitionBase) : "Finished processing chromName(%s %s)" % (build, accNo)) - del D,L,C + del D,L return result #chromosomeName @@ -495,14 +462,13 @@ class MutalyzerService(DefinitionBase) : @return: The name of a chromosome. @rtype: string """ - C = Config.Config() # Read the configuration file. - D = Db.Mapping(build, C.Db) - L = Output.Output(__file__, C.Output) + D = Db.Mapping(build, self._config.Db) + L = Output(__file__, self._config.Output) L.addMessage(__file__, -1, "INFO", "Received request getchromName(%s %s)" % (build, acc)) - self.__checkBuild(L, build, C.Db) + self.__checkBuild(L, build) # self.__checkChrom(L, D, name) result = D.get_chromName(acc) @@ -511,7 +477,7 @@ class MutalyzerService(DefinitionBase) : "Finished processing getchromName(%s %s)" % (build, acc)) - del D,L,C + del D,L return result #chromosomeName @@ -530,14 +496,12 @@ class MutalyzerService(DefinitionBase) : @return: The variant(s) in either I{g.} or I{c.} notation. @rtype: list """ - - C = Config.Config() # Read the configuration file. - D = Db.Mapping(build, C.Db) - O = Output.Output(__file__, C.Output) + D = Db.Mapping(build, self._config.Db) + O = Output(__file__, self._config.Output) O.addMessage(__file__, -1, "INFO", "Received request cTogConversion(%s %s)" % ( build, variant)) - converter = Mapper.Converter(build, C, O) + converter = Converter(build, self._config, O) variant = converter.correctChrVariant(variant) if "c." in variant : @@ -567,26 +531,28 @@ class MutalyzerService(DefinitionBase) : - messages: List of (error) messages as strings. 
@rtype: object """ - C = Config.Config() # Read the configuration file. - O = Output.Output(__file__, C.Output) - O.addMessage(__file__, -1, "INFO", - "Received request checkSyntax(%s)" % (variant)) + output = Output(__file__, self._config.Output) + output.addMessage(__file__, -1, "INFO", + "Received request checkSyntax(%s)" % (variant)) result = CheckSyntaxOutput() - self.__checkVariant(O, variant) + self.__checkVariant(output, variant) - P = Parser.Nomenclatureparser(O) - parsetree = P.parse(variant) - del C, P + grammar = Grammar(output) + parsetree = grammar.parse(variant) result.valid = bool(parsetree) - O.addMessage(__file__, -1, "INFO", - "Finished processing checkSyntax(%s)" % (variant)) + output.addMessage(__file__, -1, "INFO", + "Finished processing checkSyntax(%s)" % (variant)) - result.messages = O.getSoapMessages() + result.messages = [] + for message in output.getMessages(): + soap_message = SoapMessage() + soap_message.errorcode = message.code + soap_message.message = message.description + result.messages.append(soap_message) - del O return result #checkSyntax @@ -595,11 +561,10 @@ class MutalyzerService(DefinitionBase) : """ Todo: documentation. """ - C = Config.Config() # Read the configuration file. 
- O = Output.Output(__file__, C.Output) + O = Output(__file__, self._config.Output) O.addMessage(__file__, -1, "INFO", "Received request runMutalyzer(%s)" % (variant)) - Mutalyzer.process(variant, C, O) + variantchecker.check_variant(variant, self._config, O) result = MutalyzerOutput() @@ -642,7 +607,12 @@ class MutalyzerService(DefinitionBase) : O.addMessage(__file__, -1, "INFO", "Finished processing runMutalyzer(%s)" % (variant)) - result.messages = O.getSoapMessages() + result.messages = [] + for message in O.getMessages(): + soap_message = SoapMessage() + soap_message.errorcode = message.code + soap_message.message = message.description + result.messages.append(soap_message) return result #runMutalyzer @@ -652,17 +622,16 @@ class MutalyzerService(DefinitionBase) : """ Todo: documentation. """ - C = Config.Config() - O = Output.Output(__file__, C.Output) - D = Db.Cache(C.Db) + O = Output(__file__, self._config.Output) + D = Db.Cache(self._config.Db) O.addMessage(__file__, -1, "INFO", "Received request getGeneAndTranscript(%s, %s)" % (genomicReference, transcriptReference)) - retriever = Retriever.GenBankRetriever(C.Retriever, O, D) + retriever = Retriever.GenBankRetriever(self._config.Retriever, O, D) record = retriever.loadrecord(genomicReference) - GenRecordInstance = GenRecord.GenRecord(O, C.GenRecord) + GenRecordInstance = GenRecord.GenRecord(O, self._config.GenRecord) GenRecordInstance.record = record GenRecordInstance.checkRecord() @@ -719,24 +688,23 @@ class MutalyzerService(DefinitionBase) : - id - product """ - C = Config.Config() - O = Output.Output(__file__, C.Output) - D = Db.Cache(C.Db) + O = Output(__file__, self._config.Output) + D = Db.Cache(self._config.Db) O.addMessage(__file__, -1, "INFO", "Received request getTranscriptsAndInfo(%s)" % genomicReference) - retriever = Retriever.GenBankRetriever(C.Retriever, O, D) + retriever = Retriever.GenBankRetriever(self._config.Retriever, O, D) record = retriever.loadrecord(genomicReference) # Todo: If 
loadRecord failed (e.g. DTD missing), we should abort here. - GenRecordInstance = GenRecord.GenRecord(O, C.GenRecord) + GenRecordInstance = GenRecord.GenRecord(O, self._config.GenRecord) GenRecordInstance.record = record GenRecordInstance.checkRecord() transcripts = [] # The following loop is basically the same as building the legend in - # the name checker web interface (wsgi.Check). + # the name checker web interface (website.Check). for gene in GenRecordInstance.record.geneList: # Only return transcripts for requested gene (if there was one) @@ -816,7 +784,7 @@ class MutalyzerService(DefinitionBase) : """ Not implemented yet. """ - raise Exception('Not implemented yet') + raise Fault('ENOTIMPLEMENTED', 'Not implemented yet') #upLoadGenBankLocalFile @soap(Mandatory.String, _returns = Mandatory.String) @@ -824,7 +792,7 @@ class MutalyzerService(DefinitionBase) : """ Not implemented yet. """ - raise Exception('Not implemented yet') + raise Fault('ENOTIMPLEMENTED', 'Not implemented yet') #upLoadGenBankRemoteFile @soap(Mandatory.String, Mandatory.String, Mandatory.Integer, @@ -834,11 +802,9 @@ class MutalyzerService(DefinitionBase) : """ Todo: documentation, error handling, argument checking, tests. """ - - C = Config.Config() - O = Output.Output(__file__, C.Output) - D = Db.Cache(C.Db) - retriever = Retriever.GenBankRetriever(C.Retriever, O, D) + O = Output(__file__, self._config.Output) + D = Db.Cache(self._config.Db) + retriever = Retriever.GenBankRetriever(self._config.Retriever, O, D) O.addMessage(__file__, -1, "INFO", "Received request sliceChromosomeByGene(%s, %s, %s, %s)" % ( @@ -850,9 +816,10 @@ class MutalyzerService(DefinitionBase) : "Finished processing sliceChromosomeByGene(%s, %s, %s, %s)" % ( geneSymbol, organism, upStream, downStream)) + # Todo: use SOAP Fault object here (see Trac issue #41). 
if not UD: error = 'The request could not be completed\n' \ - + '\n'.join(O.getMessages()) + + '\n'.join(map(lambda m: str(m), O.getMessages())) raise Exception(error) return UD @@ -864,11 +831,9 @@ class MutalyzerService(DefinitionBase) : """ Todo: documentation, error handling, argument checking, tests. """ - - C = Config.Config() - O = Output.Output(__file__, C.Output) - D = Db.Cache(C.Db) - retriever = Retriever.GenBankRetriever(C.Retriever, O, D) + O = Output(__file__, self._config.Output) + D = Db.Cache(self._config.Db) + retriever = Retriever.GenBankRetriever(self._config.Retriever, O, D) O.addMessage(__file__, -1, "INFO", "Received request sliceChromosome(%s, %s, %s, %s)" % ( @@ -883,26 +848,126 @@ class MutalyzerService(DefinitionBase) : return UD #sliceChromosome + @soap(_returns = InfoOutput) + def info(self): + """ + Gives some static application information, such as the current running + version. + + @return: Object with fields: + - version: A string of the current running version. + - versionParts: The parts of the current running version as a list + of strings. + - releaseDate: The release date for the running version as a + string, or the empty string in case of a development version. + - nomenclatureVersion: Version of the HGVS nomenclature used. + - nomenclatureVersionParts: The parts of the HGVS nomenclature + version as a list of strings. + - serverName: The name of the server that is being queried. + - contactEmail: The email address to contact for more information. 
+ @rtype: object + """ + output = Output(__file__, self._config.Output) + output.addMessage(__file__, -1, 'INFO', 'Received request info') + + result = InfoOutput() + result.version = mutalyzer.__version__ + result.versionParts = mutalyzer.__version_info__ + if mutalyzer.RELEASE: + result.releaseDate = mutalyzer.__date__ + else: + result.releaseDate = '' + result.nomenclatureVersion = mutalyzer.NOMENCLATURE_VERSION + result.nomenclatureVersionParts = mutalyzer.NOMENCLATURE_VERSION_INFO + result.serverName = socket.gethostname() + result.contactEmail = mutalyzer.__contact__ + + output.addMessage(__file__, -1, 'INFO', 'Finished processing info') + return result + #info + @soap(_returns = Mandatory.String) - def ping(self) : + def ping(self): """ Simple function to test the interface. - """ - return "PONG" + @return: Always the value 'pong'. + @rtype: string + """ + return 'pong' #ping + + @soap(DateTime, _returns = Array(CacheEntry)) + def getCache(self, created_since=None): + """ + Get a list of entries from the local cache created since given date. + + This method is intended to be used by Mutalyzer itself to synchronize + the cache between installations on different servers. 
+ """ + output = Output(__file__, self._config.Output) + + output.addMessage(__file__, -1, 'INFO', + 'Received request getCache') + + database = Db.Cache(self._config.Db) + sync = CacheSync(self._config.Retriever, output, database) + + cache = sync.local_cache(created_since) + + def cache_entry_to_soap(entry): + e = CacheEntry() + for attr in ('name', 'gi', 'hash', 'chromosomeName', + 'chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation', 'url', 'created', 'cached'): + setattr(e, attr, entry[attr]) + return e + + output.addMessage(__file__, -1, 'INFO', + 'Finished processing getCache') + + return map(cache_entry_to_soap, cache) + #getCache + + @soap(Mandatory.String, _returns = Array(Mandatory.String)) + def getdbSNPDescriptions(self, rs_id): + """ + Lookup HGVS descriptions for a dbSNP rs identifier. + + @arg rs_id: The dbSNP rs identifier, e.g. 'rs9919552'. + @type rs_id: string + + @return: List of HGVS descriptions. + @rtype: list(string) + """ + output = Output(__file__, self._config.Output) + + output.addMessage(__file__, -1, 'INFO', + 'Received request getdbSNPDescription(%s)' % rs_id) + + retriever = Retriever.Retriever(self._config.Retriever, output, None) + descriptions = retriever.snpConvert(rs_id) + + output.addMessage(__file__, -1, 'INFO', + 'Finished processing getdbSNPDescription(%s)' % rs_id) + + # Todo: use SOAP Fault object here (see Trac issue #41). + messages = output.getMessages() + if messages: + error = 'The request could not be completed\n' \ + + '\n'.join(map(lambda m: str(m), output.getMessages())) + raise Exception(error) + + return descriptions + #getdbSNPDescriptions #MutalyzerService + # WSGI application for use with e.g. 
Apache/mod_wsgi -soap_application = Application([MutalyzerService], - 'http://mutalyzer.nl/2.0/services', # namespace +soap_application = Application([MutalyzerService], mutalyzer.SOAP_NAMESPACE, 'Mutalyzer') +# Note: We would like to create the wsgi.Application instance only in the +# bin/mutalyzer-webservice.wsgi script, but unfortunately this breaks the +# get_wsdl method of soap_application which we use to generate API +# documentation in website.py. application = wsgi.Application(soap_application) - -# We can also use the built-in webserver by executing this file directly -if __name__ == '__main__': - # Todo: Setting the working directory probably doesn't work - from wsgiref.simple_server import make_server - print 'Listening to http://localhost:8081/' - print 'WDSL file is at http://localhost:8081/?wsdl' - make_server('localhost', 8081, application).serve_forever() diff --git a/src/wsgi.py b/mutalyzer/website.py similarity index 66% rename from src/wsgi.py rename to mutalyzer/website.py index 663c3451c1b142ab816c2c91c32b2dda95cb1576..03e946a009d4f57862e9102551c3e537b4bf63e5 100644 --- a/src/wsgi.py +++ b/mutalyzer/website.py @@ -1,46 +1,8 @@ -#!/usr/bin/env python - """ -General WSGI interface. - -The WSGI interface is exposed through the module variable 'application'. -Static files are not handled by this interface and should be served through -the '/base' url prefix separately. - -Example Apache/mod_wsgi configuration: - - WSGIScriptAlias / /var/www/mutalyzer/src/wsgi.py - Alias /base /var/www/mutalyzer/templates/base - -You can also use the built-in HTTP server by running this file directly. -Note, however, that static files are not served by this server. A common -pattern is to use Nginx as a proxy and static file server. 
- -Example Nginx configuration (assumes the built-in HTTP server is running on -port 8080): - - server { - listen 80; - location /base/ { - root /var/www/mutalyzer/templates/base; - if (-f $request_filename) { - rewrite ^/base/(.*)$ /base/$1 break; - } - } - location / { - proxy_read_timeout 300; # 5 minutes - proxy_pass http://127.0.0.1:8080; - } - } - -@todo: Integrate webservice.py (http://webpy.org/cookbook/webservice/). -@todo: Move /templates/base to /static for web.py compatibility. +General Mutalyzer website interface. """ -VERSION = '2.0 β-8' -NOMENCLATURE_VERSION = '2.0' -RELEASE_DATE = '31 Jan 2011' WEBSERVICE_LOCATION = '/services' WSDL_VIEWER = 'templates/wsdl-viewer.xsl' @@ -60,44 +22,37 @@ import os import bz2 import web import urllib -import site from lxml import etree from cStringIO import StringIO from simpletal import simpleTALES from simpletal import simpleTAL -# Add /src to Python path -site.addsitedir(os.path.dirname(__file__)) - -# Todo: Get this from the configuration file -root_dir = os.path.split(os.path.dirname(__file__))[0] -# Todo: Fix Mutalyzer to not depend on working directory -if not __name__ == '__main__': - os.chdir(root_dir) +import mutalyzer +from mutalyzer import util +from mutalyzer.config import Config +from mutalyzer.grammar import Grammar +from mutalyzer import webservice +from mutalyzer import variantchecker +from mutalyzer.output import Output +from mutalyzer.mapping import Converter +from mutalyzer import Db +from mutalyzer import Scheduler +from mutalyzer import Retriever +from mutalyzer import File -import webservice -import Mutalyzer -import VarInfo -from Modules import Config -from Modules import Output -from Modules import Parser -from Modules import Mapper -from Modules import Db -from Modules import Scheduler -from Modules import Retriever -from Modules import File - -web.config.debug = False +# Load configuration from configuration file +config = Config() -# Load configuration from configuration file -C = 
Config.Config() +# Show web.py debugging information. +web.config.debug = config.Output.debug # URL dispatch table urls = ( + '', 'RedirectHome', '/(index)?', 'Static', '/(about)', 'Static', '/(help)', 'Static', @@ -131,8 +86,8 @@ class render_tal: Example to render /templates/hello.html with parameter 'alice': - render = render_tal('templates') - render.hello('alice') + >>> render = render_tal('templates') + >>> render.hello('alice') """ def __init__(self, path, globals={}): """ @@ -206,19 +161,33 @@ class render_tal: # TAL template render -render = render_tal(os.path.join(root_dir, 'templates'), - globals={'version': VERSION, - 'nomenclatureVersion': NOMENCLATURE_VERSION, - 'releaseDate': RELEASE_DATE, - 'contactEmail': C.Retriever.email}) +render = render_tal(os.path.join(mutalyzer.package_root(), 'templates'), + globals={ + 'version': mutalyzer.__version__, + 'nomenclatureVersion': mutalyzer.NOMENCLATURE_VERSION, + 'releaseDate': mutalyzer.__date__, + 'release': mutalyzer.RELEASE, + 'contactEmail': config.Retriever.email}) # web.py application app = web.application(urls, globals(), autoreload=False) -# Sessions are only used by CheckForward (as a hack) -session = web.session.Session(app, - web.session.DiskStore(os.path.join(root_dir, 'var', 'sessions')), - initializer={'variant': None}) + +class RedirectHome: + """ + Permanent redirect to the homepage. + """ + def GET(self): + """ + Redirect to / and include the query string. + """ + raise web.redirect('/' + web.ctx.query) + + def POST(self): + """ + Redirect to / and include the query string. + """ + raise web.redirect('/' + web.ctx.query) class Download: @@ -237,13 +206,14 @@ class Download: The url routing currently makes sure to only call this with filenames of the form [a-zA-Z-]+\.(?:py|cs). 
""" - if not os.path.isfile("templates/" + file): + file_path = os.path.join(mutalyzer.package_root(), 'templates', file) + if not os.path.isfile(file_path): raise web.notfound() - content = open('templates/' + file, 'r').read() + content = open(file_path, 'r').read() # Force downloading web.header('Content-Type', 'text/plain') web.header('Content-Disposition', 'attachment; filename="%s"' % file) - # We use new style string formatting (available from Python 2.6 + # We use new style string formatting (available from Python 2.6) # http://www.python.org/dev/peps/pep-3101/ return content.format(path=web.ctx.homedomain + web.ctx.homepath) #Download @@ -265,10 +235,12 @@ class Downloads: The url routing currently makes sure to only call this with filenames of the form [a-zA-Z\._-]+. """ - if not os.path.isfile("templates/downloads/" + file): + file_path = os.path.join(mutalyzer.package_root(), + 'templates', 'downloads', file) + if not os.path.isfile(file_path): raise web.notfound() - handle = open("templates/downloads/" + file) - F = File.File(C.File, None) + handle = open(file_path) + F = File.File(config.File, None) web.header('Content-Type', F.getMimeType(handle)[0]) web.header('Content-Disposition', 'attachment; filename="%s"' % file) return handle.read() @@ -291,13 +263,44 @@ class Reference: The url routing currently makes sure to only call this with filenames of the form [a-zA-Z\._-]+. """ - fileName = "%s/%s.bz2" % (C.Retriever.cache, file) - if not os.path.isfile(fileName): + file_path = os.path.join(config.Retriever.cache, '%s.bz2' % file) + if not os.path.isfile(file_path): raise web.notfound() - handle = bz2.BZ2File(fileName, 'r') + handle = bz2.BZ2File(file_path, 'r') web.header('Content-Type', 'text/plain') web.header('Content-Disposition', 'attachment; filename="%s"' % file) return handle.read() + + def HEAD(self, file): + """ + Do the same as in the GET case, but don't actually bunzip and send the + file, just check if it exists. 
+ + @arg file: Filename to download from cache. + @type file: string + + This is used by LOVD to quickly check if a reference file is in the + cache. If it isn't, it will resubmit it. + Of course a more proper solution here would be to have some webservice + method which checks if the GenBank file is in the cache *or* can be + reconstructed from the information in the database. Because if the + latter is the case, Mutalyzer will add it to the cache on the fly. + """ + file_path = os.path.join(config.Retriever.cache, '%s.bz2' % file) + if not os.path.isfile(file_path): + # The following is a hack to return a 404 not found status with + # empty body (as is checked by our unit test framework, WebTest). + # Just passing nothing, or the empty string, causes web.py to + # insert some default 'not found' message. + class TrueEmptyString(object): + def __str__(self): + return '' + def __nonzero__( self): + return True + raise web.notfound(message=TrueEmptyString()) + web.header('Content-Type', 'text/plain') + web.header('Content-Disposition', 'attachment; filename="%s"' % file) + return '' #Reference @@ -322,16 +325,17 @@ class GetGS: @return: Output of name checker if forward is set, otherwise the GeneSymbol with the variant notation as string. """ - O = Output.Output(__file__, C.Output) + O = Output(__file__, config.Output) i = web.input(mutationName=None, variantRecord=None, forward=None) # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. # We are only interested in the legend - Mutalyzer.process(str(i.mutationName), C, O) + #Mutalyzer.process(str(i.mutationName), config, O) + variantchecker.check_variant(str(i.mutationName), config, O) legends = O.getOutput("legends") @@ -374,25 +378,24 @@ class SyntaxCheck: Parameters: - variant: Variant name to check. 
""" - O = Output.Output(__file__, C.Output) + output = Output(__file__, config.Output) i = web.input() variant = i.variant if variant.find(',') >= 0: - O.addMessage(__file__, 2, "WCOMMASYNTAX", - "Comma's are not allowed in the syntax, autofixed") + output.addMessage(__file__, 2, "WCOMMASYNTAX", + "Comma's are not allowed in the syntax, autofixed.") variant = variant.replace(',', '') #args["variant"]=variant - P = Parser.Nomenclatureparser(O) - parsetree = P.parse(variant) - pe = O.getOutput("parseError") + grammar = Grammar(output) + grammar.parse(variant) + pe = output.getOutput("parseError") if pe: pe[0] = pe[0].replace('<', "<") args = { "variant" : variant, - "messages" : O.getMessages(), + "messages" : map(util.message_info, output.getMessages()), "parseError" : pe, "debug" : "" } - del O return render.parse(args) #SyntaxCheck @@ -415,32 +418,35 @@ class Snp: Convert to HGVS description(s) and render SNP converter HTML form. Parameters: - - rsId: The dbSNP rs number. + - rsId: The dbSNP rs number. """ i = web.input(rsId=None) return self.snp(i.rsId) - def snp(self, rsId=None): + def snp(self, rs_id=None): """ - Convert to HGVS description(s) and render SNP converter HTML form. + Convert {rs_id} to HGVS description(s) and render SNP converter HTML + form. - @kwarg rsId: The dbSNP rs number. + @kwarg rs_id: The dbSNP rs number (including 'rs' prefix). 
+ @type rs_id: string """ - O = Output.Output(__file__, C.Output) - - if rsId : - O.addMessage(__file__, -1, "INFO", "Received rs%s" % rsId) - R = Retriever.Retriever(C.Retriever, O, None) - R.snpConvert(rsId, O) - O.addMessage(__file__, -1, "INFO", - "Finished processing rs%s" % rsId) - #if + output = Output(__file__, config.Output) + + descriptions = [] + + if rs_id: + output.addMessage(__file__, -1, 'INFO', 'Received %s' % rs_id) + retriever = Retriever.Retriever(config.Retriever, output, None) + descriptions = retriever.snpConvert(rs_id) + output.addMessage(__file__, -1, 'INFO', + 'Finished processing %s' % rs_id) args = { - "snp" : O.getOutput("snp"), - "messages" : O.getMessages(), - "summary" : O.Summary()[2], - "lastpost" : rsId + 'snp' : descriptions, + 'messages' : map(util.message_info, output.getMessages()), + 'summary' : output.Summary()[2], + 'lastpost' : rs_id } return render.snp(args) @@ -468,7 +474,7 @@ class PositionConverter: i = web.input(build='', variant='') # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. return self.position_converter(i.build, str(i.variant)) def position_converter(self, build='', variant=''): @@ -478,9 +484,9 @@ class PositionConverter: @kwarg build: Human genome build (currently 'hg18' or 'hg19'). @kwarg variant: Variant to convert. 
""" - O = Output.Output(__file__, C.Output) + output = Output(__file__, config.Output) - avail_builds = C.Db.dbNames[::-1] + avail_builds = config.Db.dbNames[::-1] if build : avail_builds.remove(build) @@ -493,12 +499,11 @@ class PositionConverter: "gName" : "", "cNames" : [], "messages" : [], - "errors" : [], - "debug" : [] + "posted" : build and variant } if build and variant: - converter = Mapper.Converter(build, C, O) + converter = Converter(build, config, output) #Convert chr accNo to NC number variant = converter.correctChrVariant(variant) @@ -506,8 +511,8 @@ class PositionConverter: if variant : if not(":c." in variant or ":g." in variant): #Bad name - P = Parser.Nomenclatureparser(O) - parsetree = P.parse(variant) + grammar = Grammar(output) + grammar.parse(variant) #if if ":c." in variant: @@ -520,14 +525,14 @@ class PositionConverter: # Do the g2c dance variants = converter.chrom2c(variant, "dict") if variants: - output = ["%-10s:\t%s" % (key[:10], "\n\t\t".join(value))\ - for key, value in variants.items()] - attr["cNames"].extend(output) + out = ["%-10s:\t%s" % (key[:10], "\n\t\t".join(value))\ + for key, value in variants.items()] + attr["cNames"].extend(out) #if #if #if - attr["errors"].extend(O.getMessages()) + attr['messages'] = map(util.message_info, output.getMessages()) return render.converter(attr) #PositionConverter @@ -535,16 +540,48 @@ class PositionConverter: class VariantInfo: """ The I{g.} to I{c.} and vice versa interface for LOVD. + + Search for an NM number in the MySQL database, if the version number + matches, get the start and end positions in a variant and translate these + positions to I{g.} notation if the variant is in I{c.} notation and vice + versa. + - If no end position is present, the start position is assumed to be the + end position. + - If the version number is not found in the database, an error message is + generated and a suggestion for an other version is given. 
+ - If the reference sequence is not found at all, an error is returned. + - If no variant is present, the transcription start and end and CDS end + in I{c.} notation is returned. + - If the variant is not accepted by the nomenclature parser, a parse error + will be printed. """ def GET(self): """ - Run VarInfo and return the result as plain text. + Get variant info and return the result as plain text. Parameters: - LOVD_ver: The version of the calling LOVD. - build: The human genome build (hg19 assumed). - acc: The accession number (NM number). - var: A description of the variant. + + Returns: + - start_main ; The main coordinate of the start position in I{c.} + (non-star) notation. + - start_offset ; The offset coordinate of the start position in I{c.} + notation (intronic position). + - end_main ; The main coordinate of the end position in I{c.} + (non-star) notation. + - end_offset ; The offset coordinate of the end position in I{c.} + notation (intronic position). + - start_g ; The I{g.} notation of the start position. + - end_g ; The I{g.} notation of the end position. + - type ; The mutation type. + + Returns (alternative): + - trans_start ; Transcription start in I{c.} notation. + - trans_stop ; Transcription stop in I{c.} notation. + - CDS_stop ; CDS stop in I{c.} notation. """ i = web.input(var='') LOVD_ver = i.LOVD_ver @@ -552,14 +589,46 @@ class VariantInfo: acc = i.acc var = i.var - result = VarInfo.main(LOVD_ver, build, acc, var) + output = Output(__file__, config.Output) + + output.addMessage(__file__, -1, 'INFO', + 'Received %s:%s (LOVD_ver %s, build %s)' \ + % (acc, var, LOVD_ver, build)) + + converter = Converter(build, config, output) + + result = '' + + # If no variant is given, return transcription start, transcription + # end and CDS stop in c. notation. 
+ if var: + ret = converter.mainMapping(acc, var) + else: + ret = converter.giveInfo(acc) + if ret: + result = '%i\n%i\n%i' % ret + + if not result and not getattr(ret, 'startmain', None): + out = output.getOutput('LOVDERR') + if out: + result = out[0] + else: + result = 'Unknown error occured' + + output.addMessage(__file__, -1, 'INFO', + 'Finished processing %s:%s (LOVD_ver %s, build %s)' \ + % (acc, var, LOVD_ver, build)) + + if not result and getattr(ret, 'startmain', None): + result = '%i\n%i\n%i\n%i\n%i\n%i\n%s' \ + % (ret.startmain, ret.startoffset, ret.endmain, + ret.endoffset, ret.start_g, ret.end_g, ret.mutationType) web.header('Content-Type', 'text/plain') if LOVD_ver == "2.0-23" : # Obsoleted error messages, remove when possible. - import re return re.sub("^Error \(.*\):", "Error:", result) - #if + return result #VariantInfo @@ -576,7 +645,7 @@ class Check: 1. Provide the 'mutationName' parameter. In this case, the checker is called non-interactively, meaning the result is rendered without the HTML form, site layout, and menu. - 2. By having a 'variant' value in the session. The value is removed. + 2. By having a 'variant' value in the cookie. The value is removed. Parameters: - mutationName: Variant to check. @@ -588,9 +657,9 @@ class Check: interactive = False variant = i.mutationName else: - # Run checker if session.variant is not None - variant = session.variant - session.variant = None + # Run checker if cookie variant is not None + variant = web.cookies().get('variant') + web.setcookie('variant', '', 60) return self.check(variant, interactive=interactive) def POST(self): @@ -611,101 +680,101 @@ class Check: @kwarg name: Variant to check. @kwarg interactive: Run interactively, meaning we wrap the result in - the site layout and include the HTML form. + the site layout and include the HTML form. 
""" - O = Output.Output(__file__, C.Output) - - if name: - O.addMessage(__file__, -1, "INFO", "Received variant %s" % name) - # Todo: The following is probably a problem elsewhere too. - # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. - RD = Mutalyzer.process(str(name), C, O) - O.addMessage(__file__, -1, "INFO", "Finished processing variant %s" % \ - name) - - errors, warnings, summary = O.Summary() - recordType = O.getIndexedOutput("recordType",0) - reference = O.getIndexedOutput("reference", 0) - if recordType == "LRG" : - reference = reference + ".xml" if reference else "" - else : - reference = reference + ".gb" if reference else "" - - pe = O.getOutput("parseError") - if pe : - pe[0] = pe[0].replace('<', "<") - - genomicDNA = True - if O.getIndexedOutput("molType", 0) == 'n' : - genomicDNA = False - - genomicDescription = O.getIndexedOutput("genomicDescription", 0) - - def urlEncode(descriptions): - """ - @todo: This should probably be done in the template. + output = Output(__file__, config.Output) - @arg descriptions: - @type descriptions: list + args = { + 'lastpost' : name + } - @return: urlEncode descriptions??????????????? - @rtype: list - """ - newDescr = [] - for i in descriptions : - newDescr.append([i, urllib.quote(i)]) - return newDescr + if not name: + return render.check(args, standalone=not interactive) + output.addMessage(__file__, -1, 'INFO', 'Received variant %s' % name) + # Todo: The following is probably a problem elsewhere too. + # We stringify the variant, because a unicode string crashes + # Bio.Seq.reverse_complement in mapping.py:607. 
+ variantchecker.check_variant(str(name), config, output) + output.addMessage(__file__, -1, 'INFO', + 'Finished processing variant %s' % name) + + errors, warnings, summary = output.Summary() + record_type = output.getIndexedOutput('recordType', 0, '') + reference = output.getIndexedOutput('reference', 0, '') + + if reference: + if record_type == 'LRG': + reference = reference + '.xml' + else: + reference = reference + '.gb' + + # This is a tuple (variant, position) + parse_error = output.getOutput('parseError') + if parse_error: + parse_error[0] = parse_error[0].replace('<', '<') + + genomic_dna = output.getIndexedOutput('molType', 0) != 'n' + + genomic_description = output.getIndexedOutput('genomicDescription', 0, '') + + # Create a tuple (description, link) from a description + def description_to_link(description): + link = None + if description[-1] != '?': + link = urllib.quote(description) + return description, link + + # Todo: Generate the fancy HTML views for the proteins here instead + # of in mutalyzer/variantchecker.py. 
args = { - "lastpost" : name, - "messages" : O.getMessages(), - "summary" : summary, - "parseError" : pe, - "errors" : errors, - "genomicDescription" : urlEncode([genomicDescription])[0] if genomicDescription else "", - "chromDescription" : O.getIndexedOutput("genomicChromDescription", 0), - "genomicDNA" : genomicDNA, - "visualisation" : O.getOutput("visualisation"), - "descriptions" : urlEncode(O.getOutput("descriptions")), - "protDescriptions" : O.getOutput("protDescriptions"), - "oldProtein" : O.getOutput("oldProteinFancy"), - "altStart" : O.getIndexedOutput("altStart", 0), - "altProtein" : O.getOutput("altProteinFancy"), - "newProtein" : O.getOutput("newProteinFancy"), - "exonInfo" : O.getOutput("exonInfo"), - "cdsStart_g" : O.getIndexedOutput("cdsStart_g", 0), - "cdsStart_c" : O.getIndexedOutput("cdsStart_c", 0), - "cdsStop_g" : O.getIndexedOutput("cdsStop_g", 0), - "cdsStop_c" : O.getIndexedOutput("cdsStop_c", 0), - "restrictionSites" : O.getOutput("restrictionSites"), - "legends" : O.getOutput("legends"), - "reference" : reference + 'lastpost' : name, + 'messages' : map(util.message_info, output.getMessages()), + 'summary' : summary, + 'parseError' : parse_error, + 'errors' : errors, + 'genomicDescription' : (genomic_description, urllib.quote(genomic_description)), + 'chromDescription' : output.getIndexedOutput('genomicChromDescription', 0), + 'genomicDNA' : genomic_dna, + 'visualisation' : output.getOutput('visualisation'), + 'descriptions' : map(description_to_link, output.getOutput('descriptions')), + 'protDescriptions' : output.getOutput('protDescriptions'), + 'oldProtein' : output.getOutput('oldProteinFancy'), + 'altStart' : output.getIndexedOutput('altStart', 0), + 'altProtein' : output.getOutput('altProteinFancy'), + 'newProtein' : output.getOutput('newProteinFancy'), + 'transcriptInfo' : output.getIndexedOutput('hasTranscriptInfo', 0, False), + 'transcriptCoding' : output.getIndexedOutput('transcriptCoding', 0, False), + 'exonInfo' : 
output.getOutput('exonInfo'), + 'cdsStart_g' : output.getIndexedOutput('cdsStart_g', 0), + 'cdsStart_c' : output.getIndexedOutput('cdsStart_c', 0), + 'cdsStop_g' : output.getIndexedOutput('cdsStop_g', 0), + 'cdsStop_c' : output.getIndexedOutput('cdsStop_c', 0), + 'restrictionSites' : output.getOutput('restrictionSites'), + 'legends' : output.getOutput('legends'), + 'reference' : reference } - # Todo: This shouldn't really be necessary - del O - return render.check(args, standalone=not interactive) #Check class CheckForward: """ - Set the given variant in the session and redirect to the name checker. + Set the given variant in the cookie and redirect to the name checker. - @todo: Cleaner solution (one without using a session variable). + @todo: Cleaner solution (one without using a cookie). """ def GET(self): """ - Set the 'variant' session value to the given variant and redirect + Set the 'variant' cookie value to the given variant and redirect to the name checker (where we will arrive by a GET request). Parameters: - - mutationName: Variant to set in the session. + - mutationName: Variant to set in the cookie. """ i = web.input(mutationName=None) - session.variant = i.mutationName + web.setcookie('variant', i.mutationName, 5 * 60) # Five minutes raise web.seeother('check') #CheckForward @@ -727,18 +796,25 @@ class BatchProgress: - ajax: If set, return plain text result. @todo: The 'progress' template does not exist. + @todo: Actually, signaling 'OK' here only means the last entry was + taken from the database queue. It might still be processing, in + which case not all output is yet written to the result file. + For the standard use case, this is no big deal, since any user + will take more than a few milliseconds to actually click the + download link. + However, if we imagine some scripted batch uploader, it might get + bitten by this bug. (This includes our unit tests, where we work + around it by explicitly waiting a second.) 
""" - O = Output.Output(__file__, C.Output) - attr = {"percentage": 0} i = web.input(ajax=None) try: jobID = int(i.jobID) total = int(i.totalJobs) - except Exception, e: + except ValueError: return - D = Db.Batch(C.Db) + D = Db.Batch(config.Db) left = D.entriesLeftForJob(jobID) percentage = int(100 - (100 * left / float(total))) if i.ajax: @@ -800,9 +876,9 @@ class BatchChecker: (default), 'SyntaxChecker', 'PositionConverter', or 'SnpConverter'. """ - O = Output.Output(__file__, C.Output) + O = Output(__file__, config.Output) - maxUploadSize = C.Batch.batchInputMaxSize + maxUploadSize = config.Batch.batchInputMaxSize attr = {"messages" : [], "errors" : [], @@ -815,7 +891,7 @@ class BatchChecker: "hideTypes" : batchType and 'none' or '', "selected" : "0", "batchType" : batchType or "", - "avail_builds" : C.Db.dbNames[::-1], + "avail_builds" : config.Db.dbNames[::-1], "jobID" : None, "totalJobs" : None } @@ -847,29 +923,29 @@ class BatchChecker: web.ctx.status = '413 Request entity too large' return 'Sorry, only files up to %s megabytes are accepted.' 
% (float(maxUploadSize) / 1048576) - D = Db.Batch(C.Db) - S = Scheduler.Scheduler(C.Scheduler, D) - FileInstance = File.File(C.File, O) + D = Db.Batch(config.Db) + S = Scheduler.Scheduler(config.Scheduler, D) + FileInstance = File.File(config.File, O) # Generate the fromhost URL from which the results can be fetched fromHost = web.ctx.homedomain + web.ctx.homepath + '/' #fromHost = "http://%s%s" % ( # req.hostname, req.uri.rsplit("/", 1)[0]+"/") - job = FileInstance.parseBatchFile(inFile.file) + job, columns = FileInstance.parseBatchFile(inFile.file) if job is None: O.addMessage(__file__, 4, "PRSERR", "Could not parse input" " file, please check your file format.") else: #TODO: Add Binair Switches to toggle some events - attr["jobID"] =\ - S.addJob("BINSWITHCES", email, job, fromHost, batchType, arg1) + attr["jobID"] = S.addJob("BINSWITHCES", email, job, columns, + fromHost, batchType, arg1) attr["totalJobs"] = len(job) or 1 attr["messages"].append("Your file has been parsed and the job" " is scheduled, you will receive an email when the job is " "finished.") - attr["errors"].extend(O.getMessages()) + attr["errors"].extend(map(util.message_info, O.getMessages())) return render.batch(attr) #BatchChecker @@ -888,15 +964,16 @@ class BatchResult: Be very careful to not call this with anything but an ordinary filename. A possible security issue is allowing this method to be - called with file='../../mutalyzer.conf' for example. + called with result='../../mutalyzer.conf' for example. The url routing currently makes sure to only call this with filenames of the form \d+. 
""" - file = 'Results_%s.txt' % result - handle = open(os.path.join(C.Scheduler.resultsDir, file)) + filename = 'Results_%s.txt' % result + handle = open(os.path.join(config.Scheduler.resultsDir, filename)) web.header('Content-Type', 'text/plain') - web.header('Content-Disposition', 'attachment; filename="%s"' % file) + web.header('Content-Disposition', + 'attachment; filename="%s"' % filename) return handle.read() #BatchResult @@ -918,7 +995,7 @@ def _checkInt(inpv, refname): inpv = inpv.replace(',','').replace('.','').replace('-','') try: return int(inpv) - except ValueError, e: + except ValueError: raise InputException("Expected an integer in field: %s" % refname) #_checkInt @@ -943,7 +1020,7 @@ class Uploader: """ Render reference sequence uploader form. """ - maxUploadSize = C.Retriever.maxDldSize + maxUploadSize = config.Retriever.maxDldSize UD, errors = "", [] args = { "UD" : UD, @@ -986,11 +1063,11 @@ class Uploader: - stop: Stop position. - orientation: Orientation. """ - maxUploadSize = C.Retriever.maxDldSize + maxUploadSize = config.Retriever.maxDldSize - O = Output.Output(__file__, C.Output) - D = Db.Cache(C.Db) - R = Retriever.GenBankRetriever(C.Retriever, O, D) + O = Output(__file__, config.Output) + D = Db.Cache(config.Db) + R = Retriever.GenBankRetriever(config.Retriever, O, D) UD, errors = "", [] @@ -1054,7 +1131,7 @@ class Uploader: if not UD: #Something went wrong errors += ["The request could not be completed"] - errors.extend(O.getMessages()) + errors.extend(map(lambda m: str(m), O.getMessages())) args = { "UD" : UD, @@ -1091,7 +1168,8 @@ class Documentation: """ url = web.ctx.homedomain + web.ctx.homepath + WEBSERVICE_LOCATION wsdl_handle = StringIO(webservice.soap_application.get_wsdl(url)) - xsl_handle = open(WSDL_VIEWER, 'r') + xsl_handle = open(os.path.join(mutalyzer.package_root(), WSDL_VIEWER), + 'r') wsdl_doc = etree.parse(wsdl_handle) xsl_doc = etree.parse(xsl_handle) transform = etree.XSLT(xsl_doc) @@ -1120,13 +1198,3 @@ class Static: 
if not page: page = 'index' return getattr(render, page)() - - -if __name__ == '__main__': - # Todo: Setting the working directory probably doesn't work - # Usage: - # ./src/wsgi.py [port] - app.run() -else: - # WSGI application - application = app.wsgifunc() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2899ae3d0673eb0b2cfae85305df8e294270a744 --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +import sys +from setuptools import setup, find_packages + +if sys.version_info < (2, 6): + raise Exception('Mutalyzer requires Python 2.6 or higher.') + +import mutalyzer as distmeta + +setup( + name='mutalyzer', + version=distmeta.__version__, + description=distmeta.__doc__, + author=distmeta.__author__, + author_email=distmeta.__contact__, + url=distmeta.__homepage__, + license='Not distributable', + platforms=['any'], + packages=find_packages(exclude=['doc', 'extras', 'tests']), + include_package_data=True, + scripts=['bin/mutalyzer', + 'bin/mutalyzer-batchd', + 'bin/mutalyzer-cache-sync', + 'bin/mutalyzer-mapping-update', + 'bin/mutalyzer-webservice.wsgi', + 'bin/mutalyzer-website.wsgi'], + zip_safe=False +) + +# Things not handled by this setup.py: +# - Copy extras/config.example to /etc/mutalyzer/config +# - Database setup +# - Chown /var/log/mutalyzer.log and /var/cache/mutalyzer +# - Copy extras/init.d/mutalyzer-batchd to /etc/init.d/mutalyzer-batchd +# - Copy doc to /usr/share/doc +# Check extras/post-install.sh for these. 
diff --git a/src/BatchChecker.py b/src/BatchChecker.py deleted file mode 100644 index 6241b88c9506e8856522e5b1537cc48e02887ad7..0000000000000000000000000000000000000000 --- a/src/BatchChecker.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/python - -""" -@requires: os -@requires: sys -@requires: daemon -@requires: signal -@requires: fcntl -@requires: ftplib -@requires: Modules.Config -@requires: Modules.Db.Batch -@requires: Modules.Scheduler -""" -import os -import sys -import daemon -import signal -import fcntl - -from Modules import Config -from Modules.Db import Batch -from Modules import Scheduler - -def sigusr1_daemon_notified(*args): - """ - Stop the Daemon with SIGUSR1 signal: kill -10 PID - """ - sys.exit() - -# Change dir -if len(sys.argv[0].split('/')) > 2 : - os.chdir(sys.argv[0].rsplit('/', 2)[0]) - -C = Config.Config() - -batchconfig = C.Batch -cwd = os.getcwd() -pidfile_path = os.path.realpath(batchconfig.PIDfile) - -pidfile = open(pidfile_path, 'w') - -try: - fcntl.flock(pidfile.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) -except IOError,e: - #process is already running and file is locked - print "Can't lock: %s\nBatchChecker already running\n" % pidfile_path - sys.exit(2) - -#If we get here the file is not locked and no Daemon is running. 
- -# Write PID to pidfile -pidfile.write(`os.getpid()`) - -# Populate signal map -sigmap ={signal.SIGUSR1: sigusr1_daemon_notified} -stdout = sys.stdout -DaemonInst = daemon.DaemonContext(signal_map = sigmap, - files_preserve = [ pidfile ], working_directory = cwd) -DaemonInst.__enter__() -# stdout = stdout, stderr = stdout -C = Config.Config() -D = Batch(C.Db) -S = Scheduler.Scheduler(C.Scheduler, D) -S.process() -DaemonInst.__exit__() diff --git a/src/Modules/Config.py b/src/Modules/Config.py deleted file mode 100644 index 62f329c4fa2a70ff3e447edaba6a23e574a1116c..0000000000000000000000000000000000000000 --- a/src/Modules/Config.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/python - -""" -Module for reading the config file and splitting up the variables into -subclasses. Each of these subclasses are used to configure a specific -module. -""" - -class Config() : - """ - Read the configuration file and store the data in subclasses. - - Special Methods: - - __init__ ; Read the configuration file and initialise the - subclasses. - """ - # Public subclasses: - # - Retriever ; Container for the Retriever configuration variables. - # - Db ; Container for the Db configuration variables. - # - Output ; Container for the Output configuration variables. - # - Mutator ; Container for the Mutator configuration variables. - # - Scheduler ; Container for the Scheduler configuration variables. - # - File ; Container for the File configuration variables. - # - GBparser ; Container for the File configuration variables. - - - class Retriever() : - """ - Container class for the Retriever configuration variables. - - Public variables: - - email ; Email address used for Entrez. - - cache ; Location of the cache directory. - - cachesize ; Maximum size of the cache directory in bytes. - - maxDldSize ; Maximum size of a GenBank record in bytes. - - minDldSize ; Minimum size of a GenBank record in bytes. - - lrgURL ; base URL of LRG files. 
- """ - - pass - #Retriever - - class Db() : - """ - Container class for the Db configuration variables. - - Public variables: - - internalDb ; Name of the internal database. - - dbNames ; Name of the mapping databases - - LocalMySQLuser ; Username for the local databases. - - LocalMySQLhost ; Hostname of the local databases. - - - RemoteMySQLuser ; Username for the remote UCSC database. - - RemoteMySQLhost ; Hostname of the UCSC database server. - - UpdateInterval ; Time window (in days) to search for - updates. - - TempFile ; Location for downloaded updates. - """ - #Db - - class Output() : - """ - Container class for the Output configuration variables. - - Public variables: - - log ; Name and location of the logfile. - - datestring ; Prefix for log messages. - - loglevel ; Default level for logging. - - outputlevel ; Default level for output. - """ - - pass - #Output - - class Mutator() : - """ - Container class for the Mutator configuration variables. - - Public variables: - - flanksize ; Length of the flanking sequences in the - visualisation. - - maxvissize ; Maximum length of the variation in the - visualisation. - - flankclipsize ; Length of the inserted/deleted flanks. - """ - - pass - #Mutator - - class Scheduler() : - """ - Container class for the Scheduler configuration variables. - - Public variables: - - processName ; Name of the scheduler in the process list. - - mailFrom ; Return e-mail address. - - mailMessage ; Template e-mail. - - mailSubject ; Subject of the e-mail. - - resultsDir ; Location of the results. - """ - - pass - #Scheduler - - class Batch() : - """ - Container class for the Scheduler configuration variables. - - Public variables: - - PIDfile ; Location of the PID file. - - batchInputMaxSize ; Max size for batch input files in bytes. - """ - - pass - #Batch - - class File() : - """ - Container class for the File configuration variables. - - Public variables: - - bufSize ; Amount of bytes to be read for determining the file - type. 
- - header ; The obligatory header in batch request files. - - tempDir ; Directory for temporary files. - - threshold ; The threshold under which the percentage of errors - is allowed in a batchfile. - """ - - pass - #File - - class GBparser() : - """ - Container class for the GBparser configuration variables. - - Public variables: - - upstream ; Number of upstream nucleotides when searching for a - transcript. - - downstream ; Number of downstream nucleotides when searching for a - transcript. - """ - - pass - #GBparser - - class GenRecord() : - pass - - def __init__(self) : - """ - Initialise the class with variables read from the configuration - file. In principle, this is the only place in the code where a - hard coded constant is used (the name and path to the configuration - file). - - Public subclasses (altered): - - Retriever ; Initialised with Retriever configuration variables. - - Db ; Initialised with Db configuration variables. - - Output ; Initialised with Output configuration variables. - - Mutator ; Initialised with Mutator configuration variables. - - Scheduler ; Initialised with Scheduler configuration variables. - - @requires: ConfigObj - """ - from configobj import ConfigObj # ConfigObj() - - config = ConfigObj("./mutalyzer.conf") - - # Set the variables needed by the Retriever module. - self.Retriever.email = config["email"] - self.Retriever.cache = config["cache"] - self.Retriever.cachesize = int(config["cachesize"]) * 1048576 - self.Retriever.maxDldSize = int(config["maxDldSize"]) * 1048576 - self.Retriever.minDldSize = int(config["minDldSize"]) - self.Retriever.lrgURL = config["lrgurl"] - - # Set the variables needed by the Db module. 
- self.Db.internalDb = config["internalDb"] - self.Db.dbNames = config["dbNames"] - self.Db.LocalMySQLuser = config["LocalMySQLuser"] - self.Db.LocalMySQLhost = config["LocalMySQLhost"] - self.Db.RemoteMySQLuser = config["RemoteMySQLuser"] - self.Db.RemoteMySQLhost = config["RemoteMySQLhost"] - self.Db.UpdateInterval = int(config["UpdateInterval"]) - self.Db.TempFile = config["TempFile"] - - # Set the variables needed by the Output module. - self.Output.log = config["log"] - self.Output.datestring = config["datestring"] - self.Output.loglevel = int(config["loglevel"]) - self.Output.outputlevel = int(config["outputlevel"]) - - # Set the variables needed by the Mutator module. - self.Mutator.flanksize = int(config["flanksize"]) - self.Mutator.maxvissize = int(config["maxvissize"]) - self.Mutator.flankclipsize = int(config["flankclipsize"]) - - # Set the variables needed by the Scheduler module. - self.Scheduler.processName = config["processName"] - self.Scheduler.mailFrom = config["mailFrom"] - self.Scheduler.mailMessage = config["mailMessage"] - self.Scheduler.mailSubject = config["mailSubject"] - self.Scheduler.resultsDir = config["resultsDir"] - self.Scheduler.nameCheckOutHeader = config["nameCheckOutHeader"] - self.Scheduler.syntaxCheckOutHeader= config["syntaxCheckOutHeader"] - self.Scheduler.positionConverterOutHeader= config["positionConverterOutHeader"] - self.Scheduler.snpConverterOutHeader= config["snpConverterOutHeader"] - - # Set thte variables neede for the Batch module. - self.Batch.PIDfile = config["PIDfile"] - self.Batch.batchInputMaxSize = int(config["batchInputMaxSize"]) * 1048576 - - # Set the variables needed by the File module. - self.File.bufSize = int(config["bufSize"]) - self.File.header = config["header"] - self.File.tempDir = config["tempDir"] - self.File.threshold = float(config["threshold"]) - - # Set the variables needed by the GBparser module. - self.GBparser.email = config["email"] - - ## Set the variables needed by the File module. 
- #self.File.upstream = int(config["upstream"]) - #self.File.downstream = int(config["downstream"]) - self.GenRecord.spliceAlarm = int(config["spliceAlarm"]) - self.GenRecord.spliceWarn = int(config["spliceWarn"]) - #__init__ -#Config - -# -# Unit test. -# -if __name__ == "__main__" : - C = Config() # Will crash if the config file is not found. - del C -#if diff --git a/src/Modules/Misc.py b/src/Modules/Misc.py deleted file mode 100644 index 69ab2cb4044222848809e626ff448918d1db9e75..0000000000000000000000000000000000000000 --- a/src/Modules/Misc.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/python - -""" -@todo: documentation -""" - -import time - -class Misc() : - """ - @todo: documentation - """ - - def ID(self) : - """ - Generates an ID using time() - @todo: documentation - - @return: - @rtype: - """ - - IDsPerSec = 100 - - time.sleep(1.0 / IDsPerSec) - return int(time.time() * IDsPerSec) - #ID -#Misc diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py deleted file mode 100644 index 9d9af1c47e829cdb8c7c0e58bb111143af9bb6ad..0000000000000000000000000000000000000000 --- a/src/Modules/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -@organization: Leiden University Medical Center (LUMC) -@copyright: 2010, Jeroen Laros, LUMC -""" -# Public modules: -# - Config ; -# - Crossmap ; -# - Db ; -# - GenRecord ; -# - Misc ; -# - Mutator ; -# - Output ; -# - Parser ; -# - Retriever ; -# - Scheduler ; diff --git a/src/Mutalyzer.py b/src/Mutalyzer.py deleted file mode 100644 index 3cf6657be22052471fa253a054445d649c79049a..0000000000000000000000000000000000000000 --- a/src/Mutalyzer.py +++ /dev/null @@ -1,1884 +0,0 @@ -#!/usr/bin/python - -""" -The nomenclature checker. 
- -@requires: sys -@requires: math -@requires: types -@requires: itertools.izip_longest -@requires: Bio -@requires: Bio.Seq -@requires: Bio.Seq.Seq -@requires: Bio.Alphabet.IUPAC -@requires: Bio.SeqUtils.seq3 -@requires: Bio.Restriction -@requires: Modules.Retriever -@requires: Modules.GenRecord -@requires: Modules.Crossmap -@requires: Modules.Parser -@requires: Modules.Db -@requires: Modules.Mutator -@requires: Modules.Output -@requires: Modules.Config -@requires: operator.itemgetter -@requires: operator.attrgetter - -@todo: SET TO FALSE DEBUG FLAG -""" - -import sys -import math -import types -from itertools import izip_longest -import Bio - -import Bio.Seq -from Bio.Seq import Seq -from Bio.Alphabet import IUPAC -from Bio.SeqUtils import seq3 -from Bio import Restriction - -from Modules import Retriever -from Modules import GenRecord -from Modules import Crossmap -from Modules import Parser -from Modules import Db -from Modules import Mutator -from Modules import Output -from Modules import Config - -from operator import itemgetter, attrgetter - -#TODO: SET TO FALSE DEBUG FLAG -DEBUG = False - -def __formatRange(pos1, pos2) : - """ - Simplify a range to one position when applicable. - - @arg pos1: First coordinate of a range - @type pos1: integer - @arg pos2: Second coordinate of a range - @type pos2: integer - - @return: pos1_pos2 in case of a real range, pos1 otherwise - @rtype: string - """ - - if pos1 == pos2 : - return str(pos1) - return "%i_%i" % (pos1, pos2) -#__formatRange - -def __intronicPosition(Loc) : - """ - Check whether a location is intronic. - - @arg Loc: A location from the Parser module - @type Loc: - - @return: True if the location is intronic, False otherwise - @rtype: boolean - """ - - if not Loc : - return False - if not Loc.PtLoc : - return False - if not Loc.PtLoc.Offset : - return False - return True -#__intronicPosition - -def __checkIntronPosition(main, offset, transcript) : - """ - Check whether a c. 
position is really in an intron: The main coordinate - must be a splice site and the offset coordinate must have the correct - sign. - - @arg main: Main coordinate of the position - @type main: integer - @arg offset: Offset coordinate of the position - @type offset: integer - @arg transcript: Transcript under scrutiny - @type transcript: object - - @return: True if the combination (main, offset) is valid for this - transcript, False otherwise - @rtype: boolean - """ - - main_g = transcript.CM.x2g(main, 0) - rnaList = transcript.CM.RNA - - if offset : - #print main_g, offset, rnaList - orientedOffset = offset * transcript.CM.orientation - if main_g in rnaList : # The main coordinate is a splice site. - if rnaList.index(main_g) % 2 == 0 : # Splice donor. - if orientedOffset > 0 : # So the sign must be '+'. - return False - else : # Splice acceptor. - if orientedOffset < 0 : # So the sign must be '-'. - return False - #if - else : - return False - #if - - return True -#__checkIntronPosition - -def __roll(ref, start, stop) : - """ - Determine the variability of a variant by looking at cyclic - permutations. Not all cyclic permutations are tested at each time, it - is sufficient to check ``aW'' if ``Wa'' matches (with ``a'' a letter, - ``W'' a word) when rolling to the left for example. - - @arg ref: A reference sequence - @type ref: string - @arg start: Start position of the pattern in the reference sequence - @type start: integer - @arg stop: End position of the pattern in the reference sequence. - @type stop: integer - - @return: tuple: - - left ; Amount of positions that the pattern can be shifted to the left - - right ; Amount of positions that the pattern can be shifted to the - right - @rtype: tuple (integer, integer) - """ - - pattern = ref[start - 1:stop] # Extract the pattern. - patternLength = len(pattern) - - # Keep rolling to the left as long as a cyclic permutation matches. 
- minimum = start - 2 - j = patternLength - 1 - while minimum > -1 and ref[minimum] == pattern[j % patternLength] : - j -= 1 - minimum -= 1 - #while - - # Keep rolling to the right as long as a cyclic permutation matches. - maximum = stop - j = 0 - while maximum < len(ref) and ref[maximum] == pattern[j % patternLength] : - j += 1 - maximum += 1 - #while - - return start - minimum - 2, maximum - stop -#__roll - -def __palinsnoop(string) : - """ - Check a sequence for a reverse-complement-palindromic prefix (and - suffix). If one is detected, return the length of this prefix. If the - string equals its reverse complement, return -1. - - @arg string: A nucleotide sequence - @type string: string - - @return: The number of elements that are palindromic or -1 if the string is - a "palindrome". - @rtype: string - """ - - revcomp = Bio.Seq.reverse_complement(string) - - for i in range(int(math.ceil(len(string) / 2.0))) : - if string[i] != revcomp[i] : - return i # The first i elements are ``palindromic''. - return -1 # Perfect ``palindrome''. -#__palinsnoop - -def __bprint(s, O, where) : - # FIXME obsoleted function (replaced by __bprint2()), but still used. - """ - @todo: FIXME obsoleted function (replaced by __bprint2()), but still used. - """ - - if not s : - return - - block = 10 - line = 6 * block - - m = int(math.floor(math.log(len(s), 10)) + 1) - o = 1 - output = "%s " % str(o).rjust(m) - for i in range(0, len(s), block) : - output += ' ' + s[i:i + block] - if not (i + block) % line and i + block < len(s) : - o += line - O.addOutput(where, output) - output = "%s " % str(o).rjust(m) - #if - #for - O.addOutput(where, output) -#__bprint - -def __insertTag(s, pos1, pos2, tag1, tag2) : - """ - Insert two tags (tag1 and tag2) in string s at positions pos1 and pos2 - respectively if the positions are within the length of s. If not, - either insert one tag or do nothing. If pos1 equals pos2, don't do - anything either. 
- - @arg s: A sequence - @type s: - @arg pos1: Position of tag1 - @type pos1: - @arg pos2: Position of tag2 - @type pos2: - @arg tag1: Content of tag1 - @type tag1: - @arg tag2: Content of tag2 - @type tag2: - - @return: The original sequence, or a sequence with eiter tag1, tag2 or both - tags inserted. - @rtype: string - """ - - output = s - block = len(s) - - if pos1 != pos2 : # Only do something if pos1 != pos2. - if 0 <= pos1 < block : - output = output[:pos1] + tag1 + output[pos1:] # Insert tag1. - if 0 <= pos2 < block : - output = output[:-(block - pos2)] + tag2 + \ - output[-(block - pos2):] # Insert tag2. - #if - - return output -#__insertTag - -def __bprint2(s, pos1, pos2, O, where) : - """ - Make a fancy representation of a protein and put it in the Output - object under the name "where". - - @arg s: A protein sequence - @type s: string - @arg pos1: First position to highlight - @type pos1: - @arg pos2: Last position to highlight - @type pos2: - @arg O: The Output object - @type O: object - @arg where: Location in the Output object to store the representation - @type where: - """ - - if not s : - return - - block = 10 # Each block consists of 10 amino acids. - line = 6 * block # Each line consists of 6 blocks. - - tag1 = "<b style=\"color:#FF0000\">" # Use this tag for highlighting. - tag2 = "</b>" # And this one to end highlighting. - - # The maximum length for positions is the 10_log of the length of the - # protein. - m = int(math.floor(math.log(len(s), 10)) + 1) - o = 1 - output = "%s " % str(o).rjust(m) # Add the first position. - for i in range(0, len(s), block) : # Add the blocks. - output += ' ' + __insertTag(s[i:i + block], pos1 - i, - pos2 - i, tag1, tag2) - if not (i + block) % line and i + block < len(s) : - o += line # One line done. - O.addOutput(where, output) # Add it to the output. - # And add the next line (while escaping any potential highlighting). 
- output = \ - "<tt style = \"color:000000;font-weight:normal\">%s</tt> " % \ - str(o).rjust(m) - #if - #for - O.addOutput(where, output) -#__bprint2 - -def __PtLoc2main(Loc) : - """ - Convert the main coordinate in a location (from the Parser) to an - integer. - - @arg Loc: A location - @type Loc: object - - @return: Integer representation of the main coordinate - @rtype: integer - """ - - main = int(Loc.Main) - if Loc.MainSgn == '-' : - return -main - - return main -#__PtLoc2main - -def __PtLoc2offset(Loc) : - """ - Convert the offset coordinate in a location (from the Parser) to an - integer. - - @arg Loc: A location. - @type Loc: object - - @return: Integer representation of the offset coordinate - @rtype: integer - """ - - if Loc.Offset : - if Loc.Offset == '?' : # This is highly debatable. - return 0 - offset = int(Loc.Offset) - if Loc.OffSgn == '-' : - return -offset - return offset - #if - - return 0 -#__PtLoc2offset - -def __splice(string, splice_sites) : - """ - Construct the transcript or the coding sequence from a record and - a list of splice sites. - - @arg string: a DNA sequence - @type string: string - @arg splice_sites: A list of even length of integers. - @type splice_sites: list - - @return: The concatenation of slices from the sequence that is present in - the GenBank record - @rtype: string - """ - - transcript = "" - - for i in range(0, len(splice_sites), 2) : - transcript += string[splice_sites[i] - 1:splice_sites[i + 1]] - - return transcript -#__splice - -def __nsplice(string, splice_sites, CDS, orientation) : - #FIXME document this. 
- """ - @todo: documentation - """ - - transcript = "" - if orientation == 1 : - for i in range(0, len(splice_sites), 2) : - if CDS[0] >= splice_sites[i] and CDS[0] <= splice_sites[i + 1] : - transcript += string[CDS[0] - 1:splice_sites[i + 1]] - else : - if splice_sites[i] > CDS[0] : - transcript += \ - string[splice_sites[i] - 1:splice_sites[i + 1]] - #for - #if - else : - for i in range(0, len(splice_sites), 2) : - if CDS[1] >= splice_sites[i] and CDS[1] <= splice_sites[i + 1] : - transcript += string[splice_sites[i] - 1: CDS[1]] - else : - if splice_sites[i] < CDS[1] : - transcript += \ - string[splice_sites[i] - 1:splice_sites[i + 1]] - #for - #else - - return transcript -#__nsplice - -def __cdsLen(splice_sites) : - """ - Calculate the length of a CDS. - - @arg splice_sites: The coordinates of the CDS including internal splice - sites. - @type splice_sites: list - - @return: Length of the CDS - @rtype: integer - """ - - l = 0 - - for i in range(0, len(splice_sites), 2) : - l += splice_sites[i + 1] - splice_sites[i] + 1 - return l -#__cdsLen - -def __checkDNA(arg) : - """ - Check whether a string is a DNA string. - - @arg arg: Any string - @type arg: string - - @return: True if the string is a DNA string, False otherwise - @rtype: boolean - """ - - for i in str(arg) : - if not i in IUPAC.unambiguous_dna.letters : - return False - return True -#__checkDNA - -def __checkOptArg(ref, p1, p2, arg, O) : - """ - Do several checks for the optional argument of a variant. - - - @arg ref: The reference sequence - @type ref: string - @arg p1: Start position of the variant - @type p1: integer - @arg p2: End position of the variant - @type p2: integer - @arg arg: The optional argument - @type arg: - @arg O: The Output object - @type O: object - - @return: True if the optional argument is correct, False otherwise. - @rtype: boolean - """ - - if arg : # The argument is optional, if it is not present, it is correct. 
- if arg.isdigit() : # If it is a digit (3_9del7 for example), - length = int(arg) # the digit must be equal to the length - interval = p2 - p1 + 1 # of the given range. - if length != interval : - O.addMessage(__file__, 3, "EARGLEN", - "The length (%i) differed from that of the range (%i)." % ( - length, interval)) - return False - #if - #if - else : - if not __checkDNA(arg) : # If it is not a digit, it muse be DNA. - O.addMessage(__file__, 4, "ENODNA", - "Invalid letters in argument.") - return False - #if - # And the DNA must match the reference sequence. - ref_slice = str(ref[p1 - 1:p2]) - if ref_slice != str(arg) : # FIXME more informative. - O.addMessage(__file__, 3, "EREF", - "%s not found at position %s, found %s instead." % ( - arg, __formatRange(p1, p2), ref_slice)) - return False - #if - #else - #if - return True -#__checkOptArg - -def __lcp(str1, str2) : - """ - Calculate the length of the longest common prefix of two strings. - - @arg str1: The first string - @type str1: string - @arg str2: The second string - @type str2: string - - @return: The length of the longest common prefix of str1 and str2 - @rtype: integer - """ - - pos = 0 - s1l = len(str1) # Use the lengths to make sure we don't exceed the length - s2l = len(str2) # of the strings. - - while pos < s1l and pos < s2l and str1[pos] == str2[pos] : - pos += 1 - - return pos -#__lcp - -def __lcs(str1, str2) : - """ - Calculate the length of the longest common suffix of two strings. - - @arg str1: The first string - @type str1: string - @arg str2: The second string - @type str2: string - - @return: The length of the longest common suffix of str1 and str2 - @rtype: integer - """ - - t1 = str1[::-1] # Invert str1. - t2 = str2[::-1] # Invert str2. - - # The lcp of the two inverted strings is the lcs of the original strings. - return __lcp(t1, t2) -#__lcs - -def __overSplice(pos1, pos2, sites) : - """ - Check wheter a genomic range (pos1_pos2) hits a splice site. 
- - @arg pos1: The first coordinate of the range in g. notation. - @type pos1: integer - @arg pos2: The first coordinate of the range in g. notation. - @type pos2: integer - @arg sites: A list of splice sites in g. notation. - @type sites: list(integer) - - @return: True if one or more splice sites are hit, False otherwise. - @rtype: boolean - """ - - for i in range(len(sites)) : - if i % 2 : - if (pos1 <= sites[i] and pos2 > sites[i]) : - return True - else : - if (pos1 < sites[i] and pos2 >= sites[i]) : - return True - #for - - return False -#__overSplice - - -def findInFrameDescription(str1, str2) : - """ - Give a description of an inframe difference of two proteins. Also give - the position at which the proteins start to differ and the positions at - which they are the same again. - - @arg str1: The original protein - @type str1: string - @arg str2: The mutated protein - @type str2: string - - @return: vector: - - string ; Protein description of the change - - integer ; Start position of the change - - integer ; End position of the change in the first protein - - integer ; End position of the change in the second protein - @rtype: string - """ - - # Nothing happened. - if str1 == str2 : - return ("p.(=)", 0, 0, 0) - - lcp = __lcp(str1, str2) - lcs = __lcs(str1[lcp:], str2[lcp:]) - str1_end = len(str1) - lcs - str2_end = len(str2) - lcs - - # Insertion / Duplication / Extention. 
- if not str1_end - lcp : - if len(str1) == lcp : - return ("p.(*%i%sext*%i)" % (len(str1) + 1, seq3(str2[len(str1)]), - abs(len(str1) - len(str2))), len(str1), len(str1), len(str2)) - inLen = str2_end - lcp - - if lcp - inLen >= 0 and str1[lcp - inLen:lcp] == str2[lcp:str2_end] : - if inLen == 1 : - return ("p.(%s%idup)" % (seq3(str1[lcp - inLen]), - lcp - inLen + 1), - lcp, lcp, lcp + 1) - return ("p.(%s%i_%s%idup)" % (seq3(str1[lcp - inLen]), - lcp - inLen + 1, seq3(str1[lcp - 1]), lcp), lcp, lcp, - lcp + inLen) - #if - return ("p.(%s%i_%s%iins%s)" % (seq3(str1[lcp - 1]), lcp, - seq3(str1[lcp]), lcp + 1, seq3(str2[lcp:str2_end])), lcp, lcp, - str2_end) - #if - - # Deletion / Inframe stop. - if not str2_end - lcp : - if len(str2) == lcp : - return ("p.(%s%i*)" % (seq3(str1[len(str2)]), len(str2) + 1), - 0, 0, 0) - - if lcp + 1 == str1_end : - return ("p.(%s%idel)" % (seq3(str1[lcp]), lcp + 1), - lcp, lcp + 1, lcp) - return ("p.(%s%i_%s%idel)" % (seq3(str1[lcp]), lcp + 1, - seq3(str1[str1_end - 1]), str1_end), lcp, str1_end, lcp) - #if - - # Substitution. - if str1_end == str2_end and str1_end == lcp + 1 : - return ("p.(%s%i%s)" % (seq3(str1[lcp]), lcp + 1, seq3(str2[lcp])), - lcp, lcp + 1, lcp + 1) - - # InDel. - if lcp + 1 == str1_end : - return ("p.(%s%idelins%s)" % (seq3(str1[lcp]), lcp + 1, - seq3(str2[lcp:str2_end])), lcp, lcp + 1, str2_end) - return ("p.(%s%i_%s%idelins%s)" % (seq3(str1[lcp]), lcp + 1, - seq3(str1[str1_end - 1]), str1_end, seq3(str2[lcp:str2_end])), lcp, - str1_end, str2_end) -#findInFrameDescription - -def findFrameShift(str1, str2) : - """ - Give the description of an out of frame difference between two - proteins. Give a description of an inframe difference of two proteins. - Also give the position at which the proteins start to differ and the - end positions (to be compatible with the findInFrameDescription() - function). 
- - @arg str1: The original protein - @type str1: string - @arg str2: The mutated protein - @type str2: string - - @return: vector: - - string ; Protein description of the change. - - integer ; Start position of the change. - - integer ; End position of the first protein. - - integer ; End position of the second protein. - @rtype: string - """ - - lcp = __lcp(str1, str2) - - if lcp == len(str2) : # NonSense mutation. - if lcp == len(str1) : # Is this correct? - return ("p.(=)", 0, 0, 0) - return ("p.(%s%i*)" % (seq3(str1[lcp]), lcp + 1), lcp, len(str1), lcp) - if lcp == len(str1) : - return ("p.(*%i%sext*%i)" % (len(str1) + 1, seq3(str2[len(str1)]), - abs(len(str1) - len(str2))), len(str1), len(str1), len(str2)) - return ("p.(%s%i%sfs*%i)" % (seq3(str1[lcp]), lcp + 1, seq3(str2[lcp]), - len(str2) - lcp + 1), lcp, len(str1), len(str2)) -#findFrameShift - -def __toProtDescr(CDSStop, orig, trans) : - """ - Wrapper function for the findInFrameDescription() and findFrameShift() - functions. It uses the value CDSStop to decide which one to call. - - @arg CDSStop: Position of the stop codon in c. notation (CDS length) - @type CDSStop: integer - @arg orig: The original protein - @type orig: string - @arg trans: The mutated protein - @type trans: string - - @return: vector: - - string ; Protein description of the change. - - integer ; Start position of the change. - - integer ; End position of the change in the first protein. - - integer ; End position of the change in the second protein. - @rtype: tuple (string, integer, integer, integer) - """ - - if CDSStop % 3 : - ret = findFrameShift(str(orig), str(trans)) - else : - ret = findInFrameDescription(str(orig), str(trans)) - if not trans or str(orig[0]) != str(trans[0]) : # Mutation in start codon. - return ("p.?", ret[1], ret[2], ret[3]) - return ret -#__toProtDescr - -def __trim2(str1, str2) : - """ - Given two strings, trim the lcp and the lcs. 
- - @arg str1: A string - @type str1: string - @arg str2: An other string - @type str2: string - - @return: tuple: - - string: Trimmed version of str1. - - string: Trimmed version of str2. - """ - - lcp = __lcp(str1, str2) - lcs = __lcs(str1[lcp:], str2[lcp:]) - return str1[lcp:len(str1) - lcs], str2[lcp:len(str2) - lcs], lcp, lcs -#__trim2 - -def __rangeToC(M, g1, g2) : - # FIXME apparently obsolete. - """ - Convert a genomic range to a CDS oriented range. - - @arg M: - @type M: - @arg g1: - @type g1: - @arg g2: - @type g2: - - @return: tuple (string, string) - @rtype: tuple - @todo: FIXME apparently obsolete. - """ - - if M.orientation == -1 : - return M.g2c(g2), M.g2c(g1) - return M.g2c(g1), M.g2c(g2) -#__rangeToC - -def _createBatchOutput(O): - #TODO More documentation. - """ - Format the results to a batch output. - - Filter the mutalyzer output - - @arg O: - @type O: - - @todo: More documentation. - """ - goi, toi = O.getOutput("geneSymbol")[-1] # Two strings [can be empty] - tList = [] # Temporary List - tDescr = [] # Temporary Descr - - reference = O.getOutput("reference")[-1] - recordType = O.getOutput("recordType")[0] - descriptions = O.getOutput("NewDescriptions") - #iName, jName, mType, cDescr, pDescr, gAcc, cAcc, pAcc, - #fullDescr, fullpDescr - - if len(descriptions) == 0: - #No descriptions generated [unlikely] - return - if O.Summary()[0]: - #There were errors during the run, return. 
- return - for descr in descriptions: - if goi in descr[0] and toi in descr[1]: # Gene and Transcript - if tDescr: - # Already inserted a value in the tDescr - tDescr, tList = [], descriptions - break - tDescr = descr - - tList = descriptions - - var = O.getOutput("variant")[-1] - - # Generate output - outputline = "" - if tDescr: #Filtering worked, only one Description left - (gName, trName, mType, cDescr, - pDescr, gAcc, cAcc, pAcc, fullD, fullpD) = tDescr - - gene = "%s_v%.3i" % (gName, int(trName)) - - outputline += "%s\t%s\t%s\t" % (reference, gene, var) - - #Add genomic Description - outputline += "%s\t" % O.getOutput("gDescription")[0] - - #Add coding Description & protein Description - outputline += "%s\t%s\t" % (cDescr, pDescr) - - gc = cDescr and "%s:%s" % (gene, cDescr) - gp = pDescr and "%s:%s" % (gene, pDescr) - - #Add mutation with GeneSymbols - outputline += "%s\t%s\t" % (gc, gp) - - #Add References, should get genomic ref from parsed data - if recordType == "LRG": - gAcc = reference - if recordType == "GB": - geno = ["NC", "NG", "AC", "NT", "NW", "NZ", "NS"] - for g in geno: - if reference.startswith(g): - gAcc = reference - break - outputline += "%s\t%s\t%s\t" % (gAcc or "", cAcc or "", pAcc or "") - - else: - outputline += "\t"*11 - - #Add list of affected transcripts "|" seperator - if tList: - outputline += "%s\t" % "|".join(e[-2] for e in tList) - outputline += "%s\t" % "|".join(e[-1] for e in tList) - else: - outputline += "\t"*2 - - #Link naar additional info: - #outputline+="http://localhost/mutalyzer2/redirect?mutationName=%s" %\ - # "todovariant" - - - O.addOutput("batchDone", outputline) -#_createBatchOutput - -def checkSubstitution(start_g, Arg1, Arg2, MUU, GenRecordInstance, O) : - """ - Do a semantic check for substitutions, do the actual substitution - and give it a name. - - @arg start_g: Genomic location of the substitution - @type start_g: integer - @arg Arg1: Nucleotide in the reference sequence. 
- @type Arg1: string - @arg Arg2: Nucleotide in the mutated sequence. - @type Arg2: string - @arg MUU: A Mutator object. - @type MUU: object - @arg GenRecordInstance: A GenRecord object. - @type GenRecordInstance: object - @arg O: The Output object. - @type O: object - """ - - if not __checkDNA(Arg2) : # It must be DNA. - #O.addMessage(__file__, 4, "ENODNA", "Invalid letter in input") - return - if Arg1 == Arg2 : # And there must be a real change. - O.addMessage(__file__, 3, "ENOVAR", - "No mutation given (%c>%c) at position %i." % ( - Arg1, Arg1, start_g)) - - MUU.subM(start_g, Arg2) - GenRecordInstance.name(start_g, start_g, "subst", MUU.orig[start_g - 1], - Arg2, None) -#checkSubstitution - -def checkDeletionDuplication(start_g, end_g, mutationType, MUU, - GenRecordInstance, O) : - """ - Do a semantic check for a deletion or duplication, do the actual - deletion/duplication and give it a name. - - @arg start_g : Genomic start position of the del/dup - @type start_g: integer - @arg end_g: Genomic end position of the del/dup - @type end_g: integer - @arg mutationType: The type (del or dup) - @type mutationType: string - @arg MUU: A Mutator object - @type MUU: object - @arg GenRecordInstance: A GenRecord object - @type GenRecordInstance: object - @arg O: The Output object - @type O: object - """ - - roll = __roll(MUU.orig, start_g, end_g) - - # In the case of RNA, check if we roll over a splice site. If so, make - # the roll shorter, just up to the splice site. - shift = roll[1] - if GenRecordInstance.record.molType == 'n' : - mRNA = iter(GenRecordInstance.record.geneList[0].transcriptList[0] \ - .mRNA.positionList) - for acceptor, donor in izip_longest(mRNA, mRNA): - # Do a shorter roll, just up to the splice site. - # Note that acceptor and donor splice sites both point to the - # first, respectively last, position of the exon, so they are - # both at different sides of the boundary. 
- if end_g < acceptor and end_g + roll[1] >= acceptor: - shift = acceptor - 1 - end_g - break - #if - if end_g <= donor and end_g + roll[1] > donor: - shift = donor - end_g - break - #if - #for - #if - - if shift : # FIXME, The warning may not be apropriate. - newStart = start_g + shift - newStop = end_g + shift - O.addMessage(__file__, 2, "WROLL", - "Sequence \"%s\" at position %s was given, however, " \ - "the HGVS notation prescribes that it should be \"%s\" at " \ - "position %s." % ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - __formatRange(start_g, end_g), - MUU.visualiseLargeString(str(MUU.orig[newStart - 1:newStop])), - __formatRange(newStart, newStop))) - #if - if shift != roll[1]: - incorrectStart = start_g + roll[1] - incorrectStop = end_g + roll[1] - O.addMessage(__file__, 1, "IROLLBACK", - "Sequence \"%s\" at position %s was not corrected to \"%s\" at " \ - "position %s, since they reside in different exons." % ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - __formatRange(start_g, end_g), - MUU.visualiseLargeString(str(MUU.orig[incorrectStart - 1:incorrectStop])), - __formatRange(incorrectStart, incorrectStop))) - #if - if mutationType == "del" : - MUU.delM(start_g, end_g) - else : - MUU.dupM(start_g, end_g) - GenRecordInstance.name(start_g, end_g, mutationType, "", "", - (roll[0], shift)) -#checkDeletionDuplication - -def checkInversion(start_g, end_g, MUU, GenRecordInstance, O) : - """ - @todo: documentation - """ - - snoop = __palinsnoop(MUU.orig[start_g - 1:end_g]) - if snoop : - if snoop == -1 : - O.addMessage(__file__, 2, "WNOCHANGE", - "Sequence \"%s\" at position %i_%i is a palindrome " \ - "(its own reverse complement)." 
% ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - start_g, end_g)) - return - #if - else : - O.addMessage(__file__, 2, "WNOTMINIMAL", - "Sequence \"%s\" at position %i_%i is a partial " \ - "palindrome (the first %i nucleotide(s) are the reverse " \ - "complement of the last one(s)), the HGVS notation " \ - "prescribes that it should be \"%s\" at position %i_%i." % ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - start_g, end_g, snoop, - MUU.visualiseLargeString( - str(MUU.orig[start_g + snoop - 1: end_g - snoop])), - start_g + snoop, end_g - snoop)) - start_g += snoop - end_g -= snoop - #else - #if - MUU.invM(start_g, end_g) - if start_g == end_g : - O.addMessage(__file__, 2, "WWRONGTYPE", "Inversion at position "\ - "%i is actually a substitution." % start_g) - GenRecordInstance.name(start_g, start_g, "subst", MUU.orig[start_g - 1], - Bio.Seq.reverse_complement(MUU.orig[start_g - 1]), None) - #if - else : - GenRecordInstance.name(start_g, end_g, "inv", "", "", None) -#checkInversion - -def checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) : - """ - @todo: documentation - """ - - if start_g + 1 != end_g : - O.addMessage(__file__, 3, "EINSRANGE", - "%i and %i are not consecutive positions." % (start_g, end_g)) - return - #if - if not Arg1 or not __checkDNA(Arg1) : - O.addMessage(__file__, 3, "EUNKVAR", "Although the syntax of this " \ - "variant is correct, the effect can not be analysed.") - return - #if - - MUU.insM(start_g, Arg1) - insertionLength = len(Arg1) - newStart = MUU.shiftpos(start_g) - newStop = MUU.shiftpos(start_g) + insertionLength - roll = __roll(MUU.mutated, newStart + 1, newStop) - - # In the case of RNA, check if we roll over a splice site. If so, make - # the roll shorter, just up to the splice site. 
- shift = roll[1] - if GenRecordInstance.record.molType == 'n' : - mRNA = iter(GenRecordInstance.record.geneList[0].transcriptList[0] \ - .mRNA.positionList) - for acceptor, donor in izip_longest(mRNA, mRNA): - # Do a shorter roll, just up to the splice site. - # Note that acceptor and donor splice sites both point to the - # first, respectively last, position of the exon, so they are - # both at different sides of the boundary. - if newStop < acceptor and newStop + roll[1] >= acceptor: - shift = acceptor - 1 - newStop - break - #if - if newStop <= donor and newStop + roll[1] > donor: - shift = donor - newStop - break - #if - #for - #if - - if roll[0] + shift >= insertionLength : - # Todo: could there also be a IROLLBACK message in this case? - O.addMessage(__file__, 2, "WINSDUP", - "Insertion of %s at position %i_%i was given, " \ - "however, the HGVS notation prescribes that it should be a " \ - "duplication of %s at position %i_%i." % ( - Arg1, start_g, start_g + 1, - MUU.mutated[newStart + shift:newStop + shift], start_g + shift, - start_g + shift + insertionLength - 1)) - end_g += shift - 1 - start_g = end_g - insertionLength + 1 - GenRecordInstance.name(start_g, end_g, "dup", "", "", - (roll[0] + shift - insertionLength, 0)) - #if - else : - if shift : - O.addMessage(__file__, 2, "WROLL", "Insertion of %s at position " \ - "%i_%i was given, however, the HGVS notation prescribes " \ - "that it should be an insertion of %s at position %i_%i." % ( - Arg1, start_g, start_g + 1, - MUU.mutated[newStart + shift:newStop + shift], - newStart + shift, newStart + shift + 1)) - if shift != roll[1]: - O.addMessage(__file__, 1, "IROLLBACK", - "Insertion of %s at position %i_%i was not corrected to an " \ - "insertion of %s at position %i_%i, since they reside in " \ - "different exons." 
% ( - Arg1, start_g, start_g + 1, - MUU.mutated[newStart + roll[1]:newStop + roll[1]], - newStart + roll[1], newStart + roll[1] + 1)) - #if - GenRecordInstance.name(start_g, start_g + 1, "ins", - MUU.mutated[newStart + shift:newStop + shift] , "", - (roll[0], shift)) -#checkInsertion - -def __ivs2g(location, transcript) : - """ - @todo: documentation - """ - - ivsNumber = int(location.IVSNumber) - - if ivsNumber < 1 or ivsNumber > transcript.CM.numberOfIntrons() : - return None - - if location.OffSgn == '+' : - return transcript.CM.getSpliceSite(ivsNumber * 2 - 1) + \ - transcript.CM.orientation * int(location.Offset) - return transcript.CM.getSpliceSite(ivsNumber * 2) - \ - transcript.CM.orientation * int(location.Offset) -#__ivs2g - -def __ex2g(location, transcript) : - """ - @todo: documentation - """ - - numberOfExons = transcript.CM.numberOfExons() - - exNumberStart = int(location.EXNumberStart) - if exNumberStart < 1 or exNumberStart > transcript.CM.numberOfExons() : - return None - start_g = transcript.CM.getSpliceSite(exNumberStart * 2 - 2) - - if location.EXNumberStop : - exNumberStop = int(location.EXNumberStop) - if exNumberStop < 1 or exNumberStop > transcript.CM.numberOfExons() : - return None - stop_g = transcript.CM.getSpliceSite(exNumberStop * 2 - 1) - else : - stop_g = transcript.CM.getSpliceSite(exNumberStart * 2 - 1) - - return start_g, stop_g -#__ex2g - -def __normal2g(RawVar, transcript) : - """ - @todo: documentation - """ - - if not RawVar.StartLoc.PtLoc.Main.isdigit() : - return None, None # For ? in a position. - - start_g = int(RawVar.StartLoc.PtLoc.Main) - end_g = start_g - if RawVar.EndLoc : - if not RawVar.EndLoc.PtLoc.Main.isdigit() : # For ? in a position. - return None, None - #end_g = transcript.CM.main2int( - # RawVar.EndLoc.PtLoc.MainSgn + RawVar.EndLoc.PtLoc.Main) - end_g = int(RawVar.EndLoc.PtLoc.Main) - #if - - - # If it is not, convert it to g. notation. 
- if transcript : - start_main = transcript.CM.main2int(RawVar.StartLoc.PtLoc.MainSgn + \ - RawVar.StartLoc.PtLoc.Main) - #if not RawVar.StartLoc.PtLoc.Offset.isdigit() : - # return - - start_offset = __PtLoc2offset(RawVar.StartLoc.PtLoc) - - if not __checkIntronPosition(start_main, start_offset, transcript) : - return None, None - - start_g = transcript.CM.x2g(start_main, start_offset) - end_g = start_g - if RawVar.EndLoc : - end_main = transcript.CM.main2int(RawVar.EndLoc.PtLoc.MainSgn + \ - RawVar.EndLoc.PtLoc.Main) - #if not RawVar.EndLoc.PtLoc.Offset.isdigit() : - # return - end_offset = __PtLoc2offset(RawVar.EndLoc.PtLoc) - if not __checkIntronPosition(end_main, end_offset, transcript) : - return None, None - end_g = transcript.CM.x2g(end_main, end_offset) - #if - if transcript.CM.orientation == -1 : - start_g, end_g = end_g, start_g - #if - - return start_g, end_g -#__normal2g - -def __rv(MUU, RawVar, GenRecordInstance, parts, O, transcript) : - """ - @todo: documentation - """ - - # FIXME check this - # First assume that the variant is given in g. notation. 
- #print RawVar.StartLoc.PtLoc.MainSgn + RawVar.StartLoc.PtLoc.Main - #print __PtLoc2offset(RawVar.StartLoc.PtLoc) - - Arg1 = RawVar.Arg1 - Arg2 = RawVar.Arg2 - - if RawVar.EXLoc : - start_g, end_g = __ex2g(RawVar.EXLoc, transcript) - if not start_g : - O.addMessage(__file__, 3, "EPOS", "Invalid EX position given.") - return - #if - if end_g < start_g : # FIXME - start_g, end_g = end_g, start_g - #if - else : - if RawVar.StartLoc : - if RawVar.StartLoc.IVSLoc : - if GenRecordInstance.record.molType != 'g' : - O.addMessage(__file__, 3, "ENOINTRON", "Intronic " \ - "position given for a non-genomic reference sequence.") - return - start_g = __ivs2g(RawVar.StartLoc.IVSLoc, transcript) - if not start_g : - O.addMessage(__file__, 3, "EPOS", - "Invalid IVS position given.") - return - #if - end_g = start_g - if RawVar.EndLoc and RawVar.EndLoc.IVSLoc : # FIXME - end_g = __ivs2g(RawVar.EndLoc.IVSLoc, transcript) - if end_g < start_g : - start_g, end_g = end_g, start_g - #if - #if - else : - if GenRecordInstance.record.molType != 'g' and \ - (__intronicPosition(RawVar.StartLoc) or - __intronicPosition(RawVar.EndLoc)) : - O.addMessage(__file__, 3, "ENOINTRON", "Intronic " \ - "position given for a non-genomic reference sequence.") - return - start_g, end_g = __normal2g(RawVar, transcript) - if not start_g : - O.addMessage(__file__, 3, "ESPLICE", "Invalid intronic " \ - "position given.") - return - #else - #if - else : - O.addMessage(__file__, 4, "EUNKNOWN", "An unknown error occurred.") - return - #else - #else - if end_g < start_g : - O.addMessage(__file__, 3, "ERANGE", "End position is smaller than " \ - "the begin position.") - return - #if - - if start_g < 1 : - O.addMessage(__file__, 4, "ERANGE", "Position %i is out of range." % - start_g) - return - #if - if end_g > len(MUU.orig) : - O.addMessage(__file__, 4, "ERANGE", "Position %s is out of range." 
% - end_g) - return - #if - - if transcript and transcript.CM.orientation == -1 : - Arg1 = Bio.Seq.reverse_complement(RawVar.Arg1) - Arg2 = Bio.Seq.reverse_complement(RawVar.Arg2) - - if transcript and __overSplice(start_g, end_g, transcript.CM.RNA) : - O.addMessage(__file__, 2, "WOVERSPLICE", - "Variant hits one or more splice sites.") - - if RawVar.MutationType in ["del", "dup", "subst", "delins"] : - __checkOptArg(MUU.orig, start_g, end_g, Arg1, O) - - if RawVar.MutationType == "subst" : - checkSubstitution(start_g, Arg1, Arg2, MUU, GenRecordInstance, O) - if RawVar.MutationType in ["del", "dup"] : - checkDeletionDuplication(start_g, end_g, RawVar.MutationType, MUU, - GenRecordInstance, O) - if RawVar.MutationType == "inv" : - checkInversion(start_g, end_g, MUU, GenRecordInstance, O) - - # TODO implement this feature. - if RawVar.MutationType in ["delins", "ins"] : - if not Arg1 : - O.addMessage(__file__, 4, "ENOTIMPLEMENTED", - "Insertion of a range is not implemented yet.") - return - #if - #if - - if RawVar.MutationType == "ins" : - checkInsertion(start_g, end_g, Arg1, MUU, GenRecordInstance, O) - - # DelIns. - if RawVar.MutationType == "delins" : - if not Arg1 : - Arg1 = MUU.orig[start_g - 1:end_g] - - if str(Arg1) == str(Arg2) : - O.addMessage(__file__, 2, "WNOCHANGE", - "Sequence \"%s\" at position %i_%i is identical to the " \ - "variant." 
% ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - start_g, end_g)) - return - #if - - del_part, ins_part, lcp, lcs = __trim2(Arg1, Arg2) - if not len(del_part) : - O.addMessage(__file__, 2, "WWRONGTYPE", "The given DelIns " \ - "is actually an insertion.") - checkInsertion(start_g + lcp - 1, start_g + lcp, ins_part, MUU, - GenRecordInstance, O) - return - #if - if len(del_part) == 1 and len(ins_part) == 1 : - O.addMessage(__file__, 2, "WWRONGTYPE", "The given DelIns " \ - "is actually a substitution.") - checkSubstitution(start_g + lcp, del_part, ins_part, MUU, - GenRecordInstance, O) - return - #if - if not len(ins_part) : - O.addMessage(__file__, 2, "WWRONGTYPE", "The given DelIns " \ - "is actually a deletion.") - checkDeletionDuplication(start_g + lcp, end_g - lcs, "del", - MUU, GenRecordInstance, O) - return - #if - if str(Bio.Seq.reverse_complement(del_part)) == ins_part : - O.addMessage(__file__, 2, "WWRONGTYPE", "The given DelIns " \ - "is actually an inversion.") - checkInversion(start_g + lcp, end_g - lcs, MUU, - GenRecordInstance, O) - return - #if - if len(Arg2) != len(ins_part) : - O.addMessage(__file__, 2, "WNOTMINIMAL", - "Sequence \"%s\" at position %i_%i has the same prefix or " \ - "suffix as the inserted sequence \"%s\". The HGVS notation " \ - "prescribes that it should be \"%s\" at position %i_%i." 
% ( - MUU.visualiseLargeString(str(MUU.orig[start_g - 1:end_g])), - start_g, end_g, Arg2, ins_part, start_g + lcp, end_g - lcs)) - - MUU.delinsM(start_g + lcp, end_g - lcs, ins_part) - - GenRecordInstance.name(start_g + lcp, end_g - lcs, "delins", ins_part, - "", None) - #if -#__rv - -def __ppp(MUU, parts, GenRecordInstance, O) : - """ - @todo: documentation - """ - if parts.RawVar or parts.SingleAlleleVarSet : - if parts.RefType == 'r' : - O.addMessage(__file__, 4, "ERNA", "Descriptions on RNA level " \ - "are not supported.") - if parts.RefType in ['c', 'n'] : - GS, W = None, None - goi, toi = O.getOutput("geneSymbol")[-1] - if parts.LrgAcc: # LRG - GS = GenRecordInstance.record.geneList[0] #LRG pick top gene - if toi: - W = GS.findLocus(toi) - if not W: - O.addMessage(__file__, 4, "ENOTRANSCRIPT", - "Multiple transcripts found for gene %s. Please " \ - "choose from: %s" %(GS.name, - ", ".join(GS.listLoci()))) - else: # No transcript id given - if len(GS.transcriptList) == 1: - #No transcript given, only 1 found - W = GS.transcriptList[0] - else: - O.addMessage(__file__, 4, "ENOTRANSCRIPT", - "No transcript given for gene %s. Please " \ - "choose from: %s" %(GS.name, - ", ".join(GS.listLoci()))) - - #if - else: - # gene of interest - genes = GenRecordInstance.record.listGenes() - toi = toi and "%.3i" % int(toi) - - if goi in genes: #we found our gene - GS = GenRecordInstance.record.findGene(goi) - elif (len(genes) == 1) and not(goi): - #There is only one gene in the Record, message? - GS = GenRecordInstance.record.geneList[0] - else: - O.addMessage(__file__, 4, "EINVALIDGENE", - "Gene %s not found. Please choose from: %s" % ( - goi, ", ".join(genes))) - - if GS: - #Find Transcript - transcripts = GS.listLoci() - if toi in transcripts: - W = GS.findLocus(toi) - elif (len(transcripts) == 1) and not(toi): - W = GS.transcriptList[0] - else: - O.addMessage(__file__, 4, "ENOTRANSCRIPT", - "Multiple transcripts found for gene %s. 
Please " \ - "choose from: %s" %(GS.name, - ", ".join(GS.listLoci()))) - #else - - # Add seletcted geneSymbol to output - O.addOutput("geneSymbol", (GS and GS.name or "", W and W.name or "")) - - # Return if no transcript is selected - if not W: - #Skip all BatchJobs with the same preColon data - O.addOutput("BatchFlags", ("S2", - O.getOutput("preColon")[-1])) - return None #Explicit return in case of an error - #if - else : - W = None - #if W and not W.location : - # W = None - if W and not W.transcribe : - return - - if parts.SingleAlleleVarSet: - for i in parts.SingleAlleleVarSet : - __rv(MUU, i.RawVar, GenRecordInstance, parts, O, W) - else : - __rv(MUU, parts.RawVar, GenRecordInstance, parts, O, W) - - - if not W : # Genomic given or error with transcript - return - if not GenRecordInstance.record.geneList : # EST - return - - for i in range(0, W.CM.numberOfExons() * 2, 2) : - exonStart = W.CM.getSpliceSite(i) - exonStop = W.CM.getSpliceSite(i + 1) - O.addOutput("exonInfo", [exonStart, exonStop, - W.CM.g2c(exonStart), W.CM.g2c(exonStop)]) - - O.addOutput("cdsStart_g", W.CM.x2g(1, 0)) - O.addOutput("cdsStart_c", 1) - cdsStop = W.CM.info()[2] - O.addOutput("cdsStop_g", W.CM.x2g(cdsStop, 0)) - O.addOutput("cdsStop_c", cdsStop) - - if W.transcribe : - O.addOutput("myTranscriptDescription", W.description) - - O.addOutput("origMRNA", - str(__splice(MUU.orig, W.mRNA.positionList))) - O.addOutput("mutatedMRNA", - str(__splice(MUU.mutated, MUU.newSplice(W.mRNA.positionList)))) - #if - - - if W.translate : - cds = Seq(str(__splice(MUU.orig, W.CDS.positionList)), - IUPAC.unambiguous_dna) - cdsm = Seq(str(__nsplice(MUU.mutated, - MUU.newSplice(W.mRNA.positionList), - MUU.newSplice(W.CDS.location), - W.CM.orientation)), - IUPAC.unambiguous_dna) - O.addOutput("origCDS", cds) - - if W.CM.orientation == -1 : - cds = Bio.Seq.reverse_complement(cds) - cdsm = Bio.Seq.reverse_complement(cdsm) - #if - - if not __checkDNA(cds) : - O.addMessage(__file__, 4, "ENODNA", "Invalid 
letters in " - "reference sequence.") - return - #if - if '*' in cds.translate(table = W.txTable)[:-1] : - O.addMessage(__file__, 3, "ESTOP", "In frame stop codon found.") - return - #if - orig = cds.translate(table = W.txTable, to_stop = True) - O.addOutput("oldprotein", orig + '*') - trans = cdsm.translate(table = W.txTable, to_stop = True) - O.addOutput("newCDS", cdsm[:(len(str(trans)) + 1) * 3]) - - if not trans or trans[0] != 'M' : - __bprint(orig + '*', O, "oldProteinFancy") - if str(cdsm[0:3]) in \ - Bio.Data.CodonTable.unambiguous_dna_by_id[ - W.txTable].start_codons : - O.addOutput("newprotein", '?') - __bprint('?', O, "newProteinFancy") - O.addOutput("altStart", str(cdsm[0:3])) - if str(orig[1:]) != str(trans[1:]) : - O.addOutput("altProtein", 'M' + trans[1:] + '*') - __bprint('M' + trans[1:] + '*', O, "altProteinFancy") - #if - else : - O.addOutput("newprotein", '?') - __bprint('?', O, "newProteinFancy") - #else - else : - cdsLen = __cdsLen(MUU.newSplice(W.CDS.positionList)) - descr = __toProtDescr(cdsLen, orig, trans) - O.addOutput("myProteinDescription", descr[0]) - - __bprint2(orig + '*', descr[1], descr[2], O, - "oldProteinFancy") - if str(orig) != str(trans) : - O.addOutput("newprotein", trans + '*') - __bprint2(trans + '*', descr[1], descr[3], O, - "newProteinFancy") - #else - #if - #if -#__ppp - -def process(cmd, C, O) : - """ - @todo: documentation - """ - parser = Parser.Nomenclatureparser(O) - O.addOutput("inputvariant", cmd) - ParseObj = parser.parse(cmd) - del parser - if not ParseObj : - #Parsing went wrong - return None #Excplicit return of None in case of an error - - if ParseObj.Version : - RetrieveRecord = ParseObj.RefSeqAcc + '.' 
+ ParseObj.Version - else : - RetrieveRecord = ParseObj.RefSeqAcc - - D = Db.Cache(C.Db) - if ParseObj.LrgAcc : - filetype = "LRG" - RetrieveRecord = ParseObj.LrgAcc - geneSymbol = ("", ParseObj.LRGTranscriptID) - retriever = Retriever.LRGRetriever(C.Retriever, O, D) - else : - if ParseObj.Gene: - geneSymbol = (ParseObj.Gene.GeneSymbol or "", - ParseObj.Gene.TransVar or "") - if ParseObj.Gene.ProtIso : - O.addMessage(__file__, 4, "EPROT", "Indexing by protein " \ - "isoform is not supported.") - else: - geneSymbol = ("", "") - retriever = Retriever.GenBankRetriever(C.Retriever, O, D) - filetype = "GB" - - # Store the recordType for output formatting - O.addOutput("recordType", filetype) - - # Note concerning objects in outputObject, example: - # O.getOutput('reference')[-1] countains the last added value - # O.getOutput('reference')[0] countains the first added value - # These can refer to the same element - O.addOutput("reference", RetrieveRecord) - - # The geneSymbol[0] is used as a filter for batch runs - O.addOutput("geneSymbol", geneSymbol) #tuple(Gene, TransV) - - # preColon is used to filter out Batch entries - # that will result in identical errors - O.addOutput("preColon", cmd.split(":")[0]) - O.addOutput("variant", cmd.split(":")[-1]) - - record = retriever.loadrecord(RetrieveRecord) - #if record and record.version and not '.' in RetrieveRecord : #FIXME - # O.addOutput("reference", RetrieveRecord + '.' 
+ record.version) - #else : - - if not record : - return - del retriever - del D - - GenRecordInstance = GenRecord.GenRecord(O, C.GenRecord) - GenRecordInstance.record = record - GenRecordInstance.checkRecord() - #NOTE: GenRecordInstance is carrying the sequence in .record.seq - # so is the Mutator.Mutator instance MUU .orig - - MUU = Mutator.Mutator(GenRecordInstance.record.seq, C.Mutator, O) - __ppp(MUU, ParseObj, GenRecordInstance, O) - - # PROTEIN - for i in GenRecordInstance.record.geneList : - #if i.location : - for j in i.transcriptList : - if not ';' in j.description and j.CDS and j.translate : - cds = Seq(str(__splice(MUU.orig, j.CDS.positionList)), - IUPAC.unambiguous_dna) - cdsm = Seq(str(__nsplice(MUU.mutated, - MUU.newSplice(j.mRNA.positionList), - MUU.newSplice(j.CDS.location), - j.CM.orientation)), - IUPAC.unambiguous_dna) - if j.CM.orientation == -1 : - cds = Bio.Seq.reverse_complement(cds) - cdsm = Bio.Seq.reverse_complement(cdsm) - #if - - #if '*' in cds.translate()[:-1] : - # O.addMessage(__file__, 3, "ESTOP", - # "In frame stop codon found.") - # return - ##if - - if not len(cds) % 3 : - try : # FIXME this is a bit of a rancid fix. - orig = cds.translate(table = j.txTable, cds = True, - to_stop = True) - except Bio.Data.CodonTable.TranslationError : - O.addMessage(__file__, 4, "ETRANS", "Original " \ - "CDS could not be translated.") - return GenRecordInstance - trans = cdsm.translate(table = j.txTable, - to_stop = True) - - cdsLen = __cdsLen(MUU.newSplice(j.CDS.positionList)) - j.proteinDescription = __toProtDescr(cdsLen, orig, - trans)[0] - #if - else : - O.addMessage(__file__, 2, "ECDS", "CDS length is " \ - "not a multiple of three in gene %s, transcript " \ - "variant %s." % (i.name, j.name)) - j.proteinDescription = '?' 
- # /PROTEIN - - reference = O.getOutput("reference")[-1] - if ';' in GenRecordInstance.record.description : - descr = '['+GenRecordInstance.record.description+']' - else: - descr = GenRecordInstance.record.description - - O.addOutput("genomicDescription", "%s:%c.%s" % (reference, - GenRecordInstance.record.molType, descr)) - O.addOutput("gDescription", "%c.%s" % ( - GenRecordInstance.record.molType, descr)) - O.addOutput("molType", GenRecordInstance.record.molType) - - if GenRecordInstance.record.chromOffset : - if ';' in GenRecordInstance.record.chromDescription : - chromDescr = '['+GenRecordInstance.record.chromDescription+']' - else: - chromDescr = GenRecordInstance.record.chromDescription - - O.addOutput("genomicChromDescription", "%s:%c.%s" % ( - GenRecordInstance.record.recordId, - GenRecordInstance.record.molType, chromDescr)) - #if - - if GenRecordInstance.record._sourcetype == "LRG": #LRG record - for i in GenRecordInstance.record.geneList: - for j in sorted(i.transcriptList, key = attrgetter("name")) : - (iName, jName, mType, cDescr, pDescr, - gAcc, cAcc, pAcc, fullDescr, fullpDescr) =\ - (i.name, j.name, j.molType, "", "", "", "", "", "", "") - - if ';' in j.description: - descr = '['+j.description+']' - else: - descr = j.description - - if j.name: - fullDescr =\ - "%st%s:%c.%s" % (reference, j.name, j.molType, descr) - O.addOutput("descriptions", fullDescr) - #if - else: - O.addOutput("descriptions", (i.name)) - - if j.molType == 'c': - cDescr = "c.%s" % descr - pDescr = j.proteinDescription - fullpDescr = "%sp%s:%s" % (reference, j.name, pDescr) - O.addOutput("protDescriptions", fullpDescr) - cAcc, pAcc = j.transcriptID, j.proteinID - #if - - O.addOutput("NewDescriptions", ( - iName, jName, mType, cDescr, pDescr, gAcc, - cAcc, pAcc, fullDescr, fullpDescr)) - #for - #for - #if - else : - for i in GenRecordInstance.record.geneList : - for j in sorted(i.transcriptList, key = attrgetter("name")) : - (iName, jName, mType, cDescr, pDescr, - gAcc, cAcc, 
pAcc, fullDescr, fullpDescr) =\ - (i.name, j.name, j.molType, "", "", "", "", "", "", "") - - if ';' in j.description : - descr = '['+j.description+']' - else: - descr = j.description - - fullDescr = "%s(%s_v%s):%c.%s" % (reference,\ - iName, jName, mType, descr) - O.addOutput("descriptions", fullDescr) - - if (j.molType == 'c') : - cDescr = "c.%s" % descr - pDescr = j.proteinDescription - fullpDescr = "%s(%s_i%s):%s" % ( - reference, iName, jName, pDescr) - O.addOutput("protDescriptions", fullpDescr) - cAcc, pAcc = j.transcriptID, j.proteinID - #if - - O.addOutput("NewDescriptions", ( - iName, jName, mType, cDescr, pDescr, gAcc, - cAcc, pAcc, fullDescr, fullpDescr)) - #for - #for - #else - - - # LEGEND - for i in GenRecordInstance.record.geneList : - for j in sorted(i.transcriptList, key = attrgetter("name")) : - - if not j.name: continue #Exclude nameless transcripts - - O.addOutput("legends", ["%s_v%s" % (i.name, j.name), - j.transcriptID, j.locusTag, - j.transcriptProduct, j.linkMethod]) - if j.translate : - O.addOutput("legends", ["%s_i%s" % (i.name, j.name), - j.proteinID, j.locusTag, - j.proteinProduct, j.linkMethod]) - #for - - #Add GeneSymbol and Transcript Var to the Output object for batch - if ParseObj.Gene: - O.addOutput("geneOfInterest", dict(ParseObj.Gene.items())) - else: - O.addOutput("geneOfInterest", dict()) - - _createBatchOutput(O) - - O.addOutput("original", str(MUU.orig)) - O.addOutput("mutated", str(MUU.mutated)) - del MUU - - return GenRecordInstance - #if -#process - -def main(cmd) : - """ - @todo: documentation - """ - C = Config.Config() - O = Output.Output(__file__, C.Output) - - O.addMessage(__file__, -1, "INFO", "Received variant " + cmd) - - RD = process(cmd, C, O) - - O.addMessage(__file__, -1, "INFO", "Finished processing variant " + cmd) - - ### OUTPUT BLOCK ### - gn = O.getOutput("genename") - if gn : - print "Gene Name: " + gn[0] - tv = O.getOutput("transcriptvariant") - if tv : - print "Transcript variant: " + tv[0] - print - 
#if - - for i in O.getMessages() : - print i - errors, warnings, summary = O.Summary() - print summary - print - - #if not errors : - if not errors or DEBUG: - visualisation = O.getOutput("visualisation") - if visualisation : - for i in range(len(visualisation)) : - if i and not i % 3 : - print - print visualisation[i] - #for - print - #if - - reference = O.getOutput("reference")[-1] - for i in O.getOutput("descriptions") : - print i - print - for i in O.getOutput("protDescriptions") : - print i - print - - if RD.record and RD.record._sourcetype == "LRG": #LRG record - from collections import defaultdict - toutput = defaultdict(list) - poutput = defaultdict(list) - for i in RD.record.geneList: - for j in i.transcriptList: - d = j.description - d = ';' in d and '['+d+']' or d - if j.name: - toutput[i.name].append( - "%st%s:%c.%s" % (reference, j.name, j.molType, d)) - else: - pass - if j.molType == 'c': - poutput[i.name].append( - "%sp%s:%s" % (reference, j.name, - j.proteinDescription)) - poutput[i.name].sort() - toutput[i.name].sort() - - #Transcript Notation - print "Following transcripts were affected:" - for key, values in toutput.items(): - print key - for value in values: - print "\t"+value - - #Protein Notation - print "\nFollowing proteins were affected:" - for key, values in poutput.items(): - print key - for value in values: - print "\t"+value - #for - #if - else : - for i in RD.record.geneList : - for j in i.transcriptList : - if ';' in j.description : - print "%s(%s_v%s):%c.[%s]" % (reference, i.name, j.name, - j.molType, j.description) - else : - print "%s(%s_v%s):%c.%s" % (reference, i.name, j.name, - j.molType, j.description) - if (j.molType == 'c') : - print "%s(%s_i%s):%s" % (reference, i.name, j.name, - j.proteinDescription) - #else - #for - #for - #else - - #Genomic Notation - rdrd = RD.record.description - gdescr = ';' in rdrd and '['+rdrd+']' or rdrd - print "\nGenomic notation:\n\t%s:g.%s" % (reference, gdescr) - print 
O.getOutput("genomicChromDescription") - - op = O.getOutput("oldprotein") - if op : - print "\nOld protein:" - #__bprint(op[0], O) - for i in O.getOutput("oldProteinFancy") : - print i - print - #if - np = O.getOutput("newprotein") - if np : - print "\nNew protein:" - #__bprint(np[0], O) - for i in O.getOutput("newProteinFancy") : - print i - print - #if - ap = O.getOutput("altProtein") - if ap : - print "\nAlternative protein using start codon %s:" % \ - O.getOutput("altstart")[0] - #__bprint(ap[0], O) - for i in O.getOutput("altProteinFancy") : - print i - print - #if - - for i in O.getOutput("exonInfo") : - print i - print - print O.getOutput("cdsStart") - print O.getOutput("cdsStop") - print - - for i in O.getOutput("legends") : - print i - - print - print "Restriction sites:" - for i in O.getOutput("restrictionSites") : - print i - - print "+++ %s" % O.getOutput("myTranscriptDescription") - - #if - ### OUTPUT BLOCK ### - del O -#main - -if __name__ == "__main__" : - if len(sys.argv) > 1: - main(sys.argv[1]) -#if diff --git a/src/UCSC_update.py b/src/UCSC_update.py deleted file mode 100644 index 82e1c07394b14a61ce178858141bb045b8297ec5..0000000000000000000000000000000000000000 --- a/src/UCSC_update.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/python - -""" -Get updates on mapping information from the UCSC. - -This program is intended to be run daily from cron. 
- -@requires: sys -@requires: os - -@requires: Modules.Config -@requires: Modules.Output -@requires: Modules.Remote -@requires: Modules.Update -""" - -import sys # sys.argv -import os # os.chdir() - -from Modules import Config -from Modules import Output -from Modules.Db import Remote -from Modules.Db import Update - -os.chdir(sys.argv[0].rsplit('/', 2)[0]) - -C = Config.Config() -O = Output.Output(__file__, C.Output) -O.addMessage(__file__, -1, "INFO", "Starting UCSC mapping data update") - -for i in C.Db.dbNames : - RemoteDb = Remote(i, C.Db) - RemoteDb.get_Update() - del RemoteDb - - LocalDb = Update(i, C.Db) - LocalDb.load_Update() - - count_Updates = LocalDb.count_Updates() - if count_Updates : - O.addMessage(__file__, -1, "INFO", "%i updates found" % count_Updates) - LocalDb.backup_cdsUpdates() - cds_Updates = LocalDb.count_cdsUpdates() - if cds_Updates : - O.addMessage(__file__, -1, "INFO", - "%i CDS updates found, backing up" % cds_Updates) - LocalDb.merge_cdsUpdates() - #if - LocalDb.merge_Update() - - del LocalDb -#for - -O.addMessage(__file__, -1, "INFO", "UCSC mapping data update end") - -del O, C diff --git a/src/VarInfo.py b/src/VarInfo.py deleted file mode 100644 index 8ff5bf20b9e86a60248b070d2f974df0366a59fb..0000000000000000000000000000000000000000 --- a/src/VarInfo.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/python - -""" -Search for an NM number in the MySQL database, if the version number -matches, get the start and end positions in a variant and translate these -positions to I{g.} notation if the variant is in I{c.} notation and vice versa. - - If no end position is present, the start position is assumed to be the - end position. - - If the version number is not found in the database, an error message is - generated and a suggestion for an other version is given. - - If the reference sequence is not found at all, an error is returned. - - If no variant is present, the transcription start and end and CDS end - in I{c.} notation is returned. 
- - If the variant is not accepted by the nomenclature parser, a parse error - will be printed. - -@requires: sys -@requires: Modules.Db -@requires: Modules.Crossmap -@requires: Modules.Parser -@requires: Modules.Output -@requires: Modules.Config -@requires: Modules.Mapper - -@todo: documentation -""" - -import sys # argv -from Modules import Db # Db(), get_NM_version(), get_NM_info() -from Modules import Crossmap # Crossmap(), g2x(), x2g(), main2int(), - # offset2int(), info() -from Modules import Parser # Nomenclatureparser(), parse() -from Modules import Output # Output(), LogMsg() -from Modules import Config -from Modules import Mapper - -def __sl2il(l) : - """ - Convert a list of strings to a list of integers. - - @arg l: A list of strings - @type l: list - - @return: A list of integers - @rtype: list - """ - - for i in range(len(l)) : - l[i] = int(l[i]) - return l -#__sl2il - -def __getcoords(C, Loc, Type) : - """ - Return main, offset and g positions given either a position in - I{c.} or in I{g.} notation. - - @arg C: A crossmapper - @type C: object - @arg Loc: Either a location in I{g.} or I{c.} notation - @type Loc: object - @arg Type: The reference type - @type Type: character - - @return: triple: - - 0 ; Main coordinate in I{c.} notation - - 1 ; Offset coordinate in I{c.} notation - - 2 ; Position in I{g.} notation - @rtype: triple (integer, integer, integer) - """ - - if Type == 'c' : - main = C.main2int(Loc.MainSgn + Loc.Main) - offset = C.offset2int(Loc.OffSgn + Loc.Offset) - g = C.x2g(main, offset) - main, offset = C.g2x(g) - #if - else : - g = int(Loc.Main) - main, offset = C.g2x(g) - #else - return (main, offset, g) -#__getcoords - -def main(LOVD_ver, build, acc, var) : - """ - The entry point (called by the HTML publisher). - - Returns: - - start_main ; The main coordinate of the start position in I{c.} - (non-star) notation. - - start_offset ; The offset coordinate of the start position in I{c.} - notation (intronic position). 
- - end_main ; The main coordinate of the end position in I{c.} - (non-star) notation. - - end_offset ; The offset coordinate of the end position in I{c.} - notation (intronic position). - - start_g ; The I{g.} notation of the start position. - - end_g ; The I{g.} notation of the end position. - - type ; The mutation type. - - Returns (alternative): - - trans_start ; Transcription start in I{c.} notation. - - trans_stop ; Transcription stop in I{c.} notation. - - CDS_stop ; CDS stop in I{c.} notation. - - @arg LOVD_ver: The LOVD version (ignored for now) - @type LOVD_ver: string - @arg build: The human genome build - @type build: string - @arg acc: The NM accession number and version - @type acc: string - @arg var: The variant, or empty - @type var: string - - @return: - @rtype: - """ - - C = Config.Config() - O = Output.Output(__file__, C.Output) - - O.addMessage(__file__, -1, "INFO", - "Received %s:%s (LOVD_ver %s, build %s)" % (acc, var, - LOVD_ver, build)) - - Converter = Mapper.Converter(build, C, O) - - #V = Mapper.makeParsetree(O, Cross, var) - - # If no variant is given, return transcription start, transcription end and - # CDS stop in c. notation. - if var : - ret = Converter.mainMapping(acc, var) - #for i in Converter.crossmap.RNA : - # print i, Converter.crossmap.g2c(i) - else : - ret = Converter.giveInfo(acc) - if ret: - return "%i\n%i\n%i" % ret - - if not getattr(ret, "startmain", None) : - output = O.getOutput("LOVDERR") - if output: - return output[0] - else: - #print "\n".join(O.getMessages()) - return "Unknown error occured" - - O.addMessage(__file__, -1, "INFO", - "Finished processing %s:%s (LOVD_ver %s, build %s)" % (acc, - var, LOVD_ver, build)) - del O, C - # And return the output. 
- return "%i\n%i\n%i\n%i\n%i\n%i\n%s" % (ret.startmain, ret.startoffset, - ret.endmain, ret.endoffset, ret.start_g, ret.end_g, ret.mutationType) - -#main - -if __name__ == "__main__" : - print main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/src/fishBugs.py b/src/fishBugs.py deleted file mode 100644 index e7d6cfe068382f5448f9f79d71db40000c1db57a..0000000000000000000000000000000000000000 --- a/src/fishBugs.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/python - -from Modules import Config - -myConfig = Config.Config() -handle = open(myConfig.Output.log, "r") - -scanning = False -line = handle.readline() -while line : - if not scanning : - if " Received " in line : - message = line - scanning = True - #if - #if - else : - if " Received " in line : - print message, - scanning = False - #if - if " Finished " in line : - scanning = False - #else - line = handle.readline() -#while -handle.close() diff --git a/src/tests/test_mutalyzer.py b/src/tests/test_mutalyzer.py deleted file mode 100755 index d6420ec8fbf05f1ca31d820d0855df401055a330..0000000000000000000000000000000000000000 --- a/src/tests/test_mutalyzer.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python - -""" -Tests for the Mutalyzer module. -""" - -#import logging; logging.basicConfig() -import re -import os -import random -import unittest -import site -from Bio.Seq import Seq - -# Todo: Can this be done in a more elegant way? -os.chdir('../..') -site.addsitedir('src') - -from Modules import Config -from Modules import Output -import Mutalyzer - - -class TestMutalyzer(unittest.TestCase): - """ - Test the Mutalyzer module. - """ - - def setUp(self): - """ - Initialize test Mutalyzer module. - """ - self.config = Config.Config() - self.output = Output.Output(__file__, self.config.Output) - - def test_roll(self): - """ - Just a variant where we should roll. 
- """ - Mutalyzer.process('NM_003002.2:c.273del', self.config, self.output) - wroll = self.output.getMessagesWithErrorCode('WROLL') - self.assertTrue(len(wroll) > 0) - - def test_no_roll(self): - """ - Just a variant where we cannot roll. - """ - Mutalyzer.process('NM_003002.2:c.274del', self.config, self.output) - wroll = self.output.getMessagesWithErrorCode('WROLL') - self.assertTrue(len(wroll) == 0) - - def test_no_roll_splice(self): - """ - Here we can roll but should not, because it is over a splice site. - """ - Mutalyzer.process('NM_000088.3:g.459del', self.config, self.output) - wrollback = self.output.getMessagesWithErrorCode('IROLLBACK') - self.assertTrue(len(wrollback) > 0) - wroll = self.output.getMessagesWithErrorCode('WROLL') - self.assertTrue(len(wroll) == 0) - - def test_partial_roll_splice(self): - """ - Here we can roll two positions, but should roll only one because - otherwise it is over a splice site. - """ - Mutalyzer.process('NM_000088.3:g.494del', self.config, self.output) - wrollback = self.output.getMessagesWithErrorCode('IROLLBACK') - self.assertTrue(len(wrollback) > 0) - wroll = self.output.getMessagesWithErrorCode('WROLL') - self.assertTrue(len(wroll) > 0) - - def test_roll_after_splice(self): - """ - Here we can roll and should, we stay in the same exon. - """ - Mutalyzer.process('NM_000088.3:g.460del', self.config, self.output) - wroll = self.output.getMessagesWithErrorCode('WROLL') - self.assertTrue(len(wroll) > 0) - - def test_ins_cds_start(self): - """ - Insertion on CDS start boundary should not be included in CDS. - """ - Mutalyzer.process('NM_000143.3:c.-1_1insCAT', self.config, self.output) - self.assertEqual(self.output.getIndexedOutput("newprotein", 0), None) - - def test_ins_cds_start_after(self): - """ - Insertion after CDS start boundary should be included in CDS. 
- """ - Mutalyzer.process('NM_000143.3:c.1_2insCAT', self.config, self.output) - self.assertEqual(self.output.getIndexedOutput("newprotein", 0), '?') - - -if __name__ == '__main__': - # Usage: - # ./test_mutalyzer.py -v - # Or, selecting a specific test: - # ./test_mutalyzer.py -v TestMutalyzer.test_ins_cds_start - unittest.main() diff --git a/src/tests/test_webservice.py b/src/tests/test_webservice.py deleted file mode 100755 index 801cf25f5ccebd745ea5262164d065a9e689500b..0000000000000000000000000000000000000000 --- a/src/tests/test_webservice.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python - -""" -Tests for the SOAP interface to Mutalyzer. -""" - -import logging; logging.raiseExceptions = 0 -import urllib2 -from suds.client import Client -from suds import WebFault -import unittest - -WSDL_URL = 'http://mutalyzer.martijn/services/?wsdl' - -class TestWSDL(unittest.TestCase): - """ - Test the Mutalyzer SOAP interface WSDL description. - """ - def test_wsdl(self): - """ - Test if the WSDL is available and looks somewhat sensible. - """ - wsdl = urllib2.urlopen(WSDL_URL).read() - self.assertTrue(wsdl.startswith("<?xml version='1.0' encoding='UTF-8'?>")) - self.assertTrue('name="Mutalyzer"' in wsdl) - -class TestWebservice(unittest.TestCase): - """ - Test the Mutalyzer SOAP interface. - """ - - def setUp(self): - """ - Initialize webservice entrypoint. - - @todo: Start the standalone server and stop it in self.tearDown - instead of depending on some running instance at a fixed address. - """ - self.client = Client(WSDL_URL, cache=None) - - def test_checksyntax_valid(self): - """ - Running checkSyntax with a valid variant name should return True. - """ - r = self.client.service.checkSyntax('AB026906.1:c.274G>T') - self.assertEqual(r.valid, True) - - def test_checksyntax_invalid(self): - """ - Running checkSyntax with an invalid variant name should return False - and give at least one error message. 
- """ - r = self.client.service.checkSyntax('0:abcd') - self.assertEqual(r.valid, False) - self.assertTrue(len(r.messages.SoapMessage) >= 1) - - def test_checksyntax_empty(self): - """ - Running checkSyntax with no variant name should raise exception. - """ - try: - self.client.service.checkSyntax() - self.fail('Expected WebFault exception') - except WebFault, e: - self.assertEqual(e.fault.faultstring, - 'The variant argument is not provided.') - - def test_transcriptinfo_valid(self): - """ - Running transcriptInfo with valid arguments should get us a Transcript - object. - """ - r = self.client.service.transcriptInfo(LOVD_ver='123', build='hg19', - accNo='NM_002001.2') - self.assertEqual(r.trans_start, -99) - self.assertEqual(r.trans_stop, 1066) - self.assertEqual(r.CDS_stop, 774) - - def test_numberconversion_gtoc_valid(self): - """ - Running numberConversion with valid g variant should give a list of - c variant names. - """ - r = self.client.service.numberConversion(build='hg19', - variant='NC_000001.10:g.159272155del') - self.assertEqual(type(r.string), list) - self.assertTrue('NM_002001.2:c.1del' in r.string) - - def test_numberconversion_ctog_valid(self): - """ - Running numberConversion with valid c variant should give a list of - g variant names. - """ - r = self.client.service.numberConversion(build='hg19', - variant='NM_002001.2:c.1del') - self.assertEqual(type(r.string), list) - self.assertTrue('NC_000001.10:g.159272155del' in r.string) - - def test_gettranscriptsbygenename_valid(self): - """ - Running getTranscriptsByGeneName with valid gene name should give a - list of transcripts. 
- """ - r = self.client.service.getTranscriptsByGeneName(build='hg19', - name='DMD') - self.assertEqual(type(r.string), list) - for t in ['NM_004006.2', - 'NM_000109.3', - 'NM_004021.2', - 'NM_004009.3', - 'NM_004007.2', - 'NM_004018.2', - 'NM_004022.2']: - self.assertTrue(t in r.string) - - def test_gettranscriptsandinfo_valid(self): - """ - Running getTranscriptsAndInfo with a valid genomic reference should - give a list of TranscriptInfo objects. - """ - r = self.client.service.getTranscriptsAndInfo('AL449423.14') - self.assertEqual(type(r.TranscriptInfo), list) - names = [t.name for t in r.TranscriptInfo] - for t in ['CDKN2B_v002', - 'CDKN2B_v001', - 'MTAP_v005', - 'CDKN2A_v008', - 'CDKN2A_v007', - 'C9orf53_v001', - 'CDKN2A_v001']: - self.assertTrue(t in names) - - def test_gettranscriptsandinfo_restricted_valid(self): - """ - Running getTranscriptsAndInfo with a valid genomic reference and a - gene name should give a list of TranscriptInfo objects restricted - to the gene. - """ - r = self.client.service.getTranscriptsAndInfo('AL449423.14', 'CDKN2A') - self.assertEqual(type(r.TranscriptInfo), list) - names = [t.name for t in r.TranscriptInfo] - for t in ['CDKN2A_v008', - 'CDKN2A_v007']: - self.assertTrue(t in names) - for t in ['CDKN2B_v002', - 'CDKN2B_v001', - 'MTAP_v005', - 'C9orf53_v001']: - self.assertFalse(t in names) - -if __name__ == '__main__': - # Usage: - # ./test_webservice.py -v - # Or, selecting a specific test: - # ./test_webservice.py -v TestWebservice.test_checksyntax_empty - unittest.main() diff --git a/templates/check.html b/templates/check.html deleted file mode 100644 index ced4ee04ed1687eb41b27d1199a70cff0a4f1cc4..0000000000000000000000000000000000000000 --- a/templates/check.html +++ /dev/null @@ -1,260 +0,0 @@ -<html> - <head> - <link rel="stylesheet" - type="text/css" - href="base/css/style.css"> - <title></title> - </head> - <body> - <div metal:define-macro="content"> - <center> - <h3>Name checker</h3> - </center> - <div id = "output" 
tal:condition = "interactive"> - <div> - Please insert the mutation name using the - <span class = "helper" - title = "Human Genome Variation Society standard variant nomenclature"> - <a href = "http://www.hgvs.org/mutnomen">HGVS</a> format</span>:<br> - <Accession Number>.<version - number>(<Gene symbol>):<sequence - type>.<mutation> - </div><br> - Example: AB026906.1:c.274G>T<br> - <br> - <form action = "" method = "post"> - <input - type = "text" - name = "mutationName" - tal:attributes = "value lastpost" - style = "width:100%" - ><br> - <input type="submit" value="Submit"> - <input type="button" value="Clear field" - onClick = "clearForm(this.form, 'mutationName');"> - </form> - <br> - </div> - <div tal:condition = "lastpost"> - <h3>Mutalyzer output:</h3> - <br> - <div tal:repeat = "i messages" - tal:replace = "structure string:${i}<br>"> - </div> - <div tal:replace = "summary"></div><br> - <br> - <div tal:condition = "parseError"> - <h4>Details of the parse error:</h4> - <pre tal:content = - "structure string:${parseError/0}<br>${parseError/1}"> - </pre> - The "^" indicates the position where the error occurred. - </div> - <br> - <div tal:condition = "not:errors"> - <div tal:condition = "visualisation"> - <b>Overview of the raw variants:</b><br> - <br> - <div tal:repeat = "i visualisation"> - <div tal:repeat = "j i"> - <div tal:condition = "repeat/j/start" - tal:content = "structure string:Raw variant - ${repeat/i/number}: ${j}"></div> - <tt tal:condition = "not: repeat/j/start" tal:content = "j"> - </tt> - </div> - <br> - </div> - <br> - <div tal:condition = "genomicDNA"> - <b>Genomic description:</b> - </div> - <div tal:condition = "not:genomicDNA"> - <b>Description relative to transcription start:</b><br> - (Not for use in LSDBs in case of protein-coding transcripts). 
- </div> - <br> - <tt> - <a tal:content = "genomicDescription/0" - tal:attributes = "href - string:checkForward?mutationName=${genomicDescription/1}"> - </a> - <br> - </tt> - <div tal:condition = "chromDescription"> - <br> - Alternative chromosomal position:<br> - <tt tal:content = "chromDescription"></tt> - </div> - <br> - <br> - <b>Affected transcripts:</b><br> - <br> - <tt tal:repeat = "i descriptions"> - <a tal:content = "i/0" - tal:attributes = - "href string:checkForward?mutationName=${i/1}"></a><br> - </tt> - <br> - <br> - <b>Affected proteins:</b><br> - <br> - <tt> - <div tal:repeat = "i protDescriptions" - tal:replace = "structure string:${i}<br>"> - </div> - </tt> - <br> - <br> - <div tal:condition = "oldProtein"> - <b>Detailed information about the selected transcript and - predicted protein:</b><br> - <br> - <div style = "background-color : #ccffff;"> - <b>Reference protein:</b><br> - <pre><div tal:repeat = "i oldProtein" - tal:replace = "structure string:${i}<br>"> - </div></pre> - <br> - <b>Protein predicted from variant coding sequence:</b><br> - <div tal:condition = "not:newProtein"> - <br> - No change: Predicted protein (not shown) equals reference - protein. <br> - <br> - </div> - <div tal:condition = "newProtein"> - <pre><div tal:repeat = "i newProtein" - tal:replace = "structure string:${i}<br>"> - </div></pre> - </div> - <br> - <div tal:condition = "altStart"> - <b tal:content = "structure string:Alternative protein - using start codon ${altStart}:"></b><br> - <div tal:condition = "altProtein"> - <pre><div tal:repeat = "i altProtein" - tal:replace = "structure string:${i}<br>"> - </div></pre> - </div> - <div tal:condition = "not:altProtein"> - <br> - No change: Predicted protein (not shown) equals reference - protein. 
<br> - <br> - </div> - <br> - </div> - </div> - </div> - </div> - <div tal:condition = "oldProtein"> - <div style = "background-color : #ccffff;"> - <b>Additional information about the transcript:</b><br> - <br> - Exon information:<br> - <table class = "raTable"> - <tr> - <td>Number</td> - <td>Start (g.)</td> - <td>Stop (g.)</td> - <td>Start (c.)</td> - <td>Stop (c.)</td> - </tr> - <tr tal:repeat = "i exonInfo"> - <td tal:content = "repeat/i/number"></td> - <td tal:repeat = "j i" tal:content = "j"></td> - </tr> - </table> - <br> - <span class = "helper" title = "Coding Sequence">CDS</span> - information:<br> - <table class = "raTable"> - <tr> - <td></td> - <td>g.</td> - <td>c.</td> - </tr> - <tr> - <td>Start</td> - <td tal:content = "cdsStart_g"></td> - <td tal:content = "cdsStart_c"></td> - </tr> - <tr> - <td>Stop</td> - <td tal:content = "cdsStop_g"></td> - <td tal:content = "cdsStop_c"></td> - </tr> - <tr> - </tr> - </table> - <br> - </div> - <br> - </div> - <div tal:condition = "visualisation"> - <b>Effects on Restriction sites:</b><br> - <br> - <table class = "laTable"> - <tr> - <td>Raw variant</td> - <td>Created</td> - <td>Deleted</td> - </tr> - <tr tal:repeat = "i restrictionSites"> - <td tal:content = "repeat/i/number"></td> - <td> - <span tal:repeat = "j i/0"> - <span tal:condition = "not:repeat/j/end" - tal:content = "structure string:${j},"> - </span> - <span tal:condition = "repeat/j/end" - tal:content = "structure string:${j}"> - </span> - </span> - </td> - <td> - <span tal:repeat = "j i/1"> - <span tal:condition = "not:repeat/j/end" - tal:content = "structure string:${j},"> - </span> - <span tal:condition = "repeat/j/end" - tal:content = "structure string:${j}"> - </span> - </span> - </td> - </tr> - </table> - <br> - <br> - </div> - </div> - <div tal:condition = "legends"> - <b>Legend:</b><br> - <br> - <table class = "laTable"> - <tr> - <td>Name</td> - <td>ID</td> - <td>Locus tag</td> - <td>Product</td> - <td>Link method</td> - </tr> - <tr tal:repeat 
= "i legends"> - <td tal:repeat = "j i" tal:content = "j"></td> - </tr> - </table> - <div tal:condition = "interactive"> - <br> - <br> - <b>Links:</b><br> - <br> - Download this reference sequence file: - <a tal:content = "reference" - tal:attributes = "href string:Reference/${reference}"></a> - </div> - </div> - </div> - </div> - </body> -</html> diff --git a/src/tests/data/AB026906.1.gb b/tests/data/AB026906.1.gb similarity index 100% rename from src/tests/data/AB026906.1.gb rename to tests/data/AB026906.1.gb diff --git a/src/Modules/tests/lrgtest.py b/tests/old/lrgtest.py similarity index 100% rename from src/Modules/tests/lrgtest.py rename to tests/old/lrgtest.py diff --git a/src/Modules/tests/lrgtest_files/LRG_1.xml b/tests/old/lrgtest_files/LRG_1.xml similarity index 100% rename from src/Modules/tests/lrgtest_files/LRG_1.xml rename to tests/old/lrgtest_files/LRG_1.xml diff --git a/src/Modules/tests/lrgtest_files/LRG_11.xml b/tests/old/lrgtest_files/LRG_11.xml similarity index 100% rename from src/Modules/tests/lrgtest_files/LRG_11.xml rename to tests/old/lrgtest_files/LRG_11.xml diff --git a/src/Modules/tests/lrgtest_files/LRG_130.xml b/tests/old/lrgtest_files/LRG_130.xml similarity index 100% rename from src/Modules/tests/lrgtest_files/LRG_130.xml rename to tests/old/lrgtest_files/LRG_130.xml diff --git a/src/Modules/tests/maptest.py b/tests/old/maptest.py similarity index 100% rename from src/Modules/tests/maptest.py rename to tests/old/maptest.py diff --git a/src/Modules/tests/recordtest.py b/tests/old/recordtest.py similarity index 100% rename from src/Modules/tests/recordtest.py rename to tests/old/recordtest.py diff --git a/tests/test_grammar.py b/tests/test_grammar.py new file mode 100644 index 0000000000000000000000000000000000000000..8aadc2a0c3a296736684d4550c01e3f190fdb9a8 --- /dev/null +++ b/tests/test_grammar.py @@ -0,0 +1,38 @@ +""" +Tests for the mutalyzer.grammar module. 
+""" + + +#import logging; logging.basicConfig() +import os +from nose.tools import * + +import mutalyzer +from mutalyzer.config import Config +from mutalyzer.grammar import Grammar +from mutalyzer.output import Output + + +class TestGrammar(): + """ + Test the mutalyzer.grammar module. + """ + + def setUp(self): + """ + Initialize test Grammar instance. + """ + self.config = Config() + self.output = Output(__file__, self.config.Output) + self.grammar = Grammar(self.output) + + def test_some_variants(self): + """ + Some example variants. + """ + self.grammar.parse('NM_002001.2:c.[12del]') + self.grammar.parse('NM_002001.2:c.[(12del)]') + self.grammar.parse('NM_002001.2:c.[(12del)?]') + self.grammar.parse('NM_002001.2:c.[(12del);(12del)]') + self.grammar.parse('NM_002001.2:c.[(12del;12del)]') + self.grammar.parse('NM_002001.2:c.[((12del)?;12del)?]') diff --git a/tests/test_mapping.py b/tests/test_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..036b46f6c7aadc2ca26629cc123fa7b01c9a4869 --- /dev/null +++ b/tests/test_mapping.py @@ -0,0 +1,54 @@ +""" +Tests for the mapping module. +""" + + +#import logging; logging.basicConfig() +from nose.tools import * + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.mapping import Converter + + +class TestConverter(): + """ + Test the Converter class. + """ + def setUp(self): + """ + Initialize test converter module. + """ + self.config = Config() + self.output = Output(__file__, self.config.Output) + + def _converter(self, build): + """ + Create a Converter instance for a given build. + """ + return Converter(build, self.config, self.output) + + def test_converter(self): + """ + Simple test.
+ """ + converter = self._converter('hg19') + genomic = converter.c2chrom('NM_003002.2:c.274G>T') + assert_equal(genomic, 'NC_000011.9:g.111959695G>T') + coding = converter.chrom2c(genomic, 'list') + assert 'NM_003002.2:c.274G>T' in coding + + def test_hla_cluster(self): + """ + Convert to primary assembly. + + Transcript NM_000500.5 is mapped to different chromosome locations, + but we like to just see the primary assembly mapping to chromosome 6. + + See also bug #58. + """ + converter = self._converter('hg19') + genomic = converter.c2chrom('NM_000500.5:c.92C>T') + assert_equal(genomic, 'NC_000006.11:g.32006291C>T') + coding = converter.chrom2c(genomic, 'list') + assert 'NM_000500.5:c.92C>T' in coding diff --git a/src/tests/test_mutator.py b/tests/test_mutator.py old mode 100755 new mode 100644 similarity index 77% rename from src/tests/test_mutator.py rename to tests/test_mutator.py index fd6af9fa2430998e98b310142d36f8d7aed2634e..ee01cf2a1f0a0ea04af396d5d458511159cfd932 --- a/src/tests/test_mutator.py +++ b/tests/test_mutator.py @@ -1,24 +1,20 @@ -#!/usr/bin/env python - """ -Tests for the Mutator module. +Tests for the mutalyzer.mutator module. """ + #import logging; logging.basicConfig() import re import os import random -import unittest -import site +from nose.tools import * from Bio.Seq import Seq -# Todo: Can this be done in a more elegant way? -os.chdir('../..') -site.addsitedir('src') - -from Modules import Config -from Modules import Output -from Modules import Mutator +import mutalyzer +from mutalyzer.util import skip +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer import mutator def _seq(length): @@ -31,23 +27,23 @@ def _seq(length): return Seq(sequence) -class TestMutator(unittest.TestCase): +class TestMutator(): """ - Test the Mutator module. + Test the mutator module. """ def setUp(self): """ - Initialize test Mutator module. + Initialize test mutator module. 
""" - self.config = Config.Config() - self.output = Output.Output(__file__, self.config.Output) + self.config = Config() + self.output = Output(__file__, self.config.Output) def _mutator(self, sequence): """ - Create a Mutator object for a given sequence. + Create a Mutator instance for a given sequence. """ - return Mutator.Mutator(sequence, + return mutator.Mutator(sequence, self.config.Mutator, self.output) @@ -59,7 +55,7 @@ class TestMutator(unittest.TestCase): m = self._mutator(_seq(l)) # Numbering is 1-based for i in range(1, l + 1): - self.assertEqual(m.shiftpos(i), i) + assert_equal(m.shiftpos(i), i) def test_shiftpos_del_example(self): """ @@ -67,9 +63,9 @@ class TestMutator(unittest.TestCase): """ m = self._mutator(Seq('ATCGATCG')) m.delM(2, 2) - self.assertEqual(m.shiftpos(1), 1) - self.assertEqual(m.shiftpos(2), 2) - self.assertEqual(m.shiftpos(3), 2) + assert_equal(m.shiftpos(1), 1) + assert_equal(m.shiftpos(2), 2) + assert_equal(m.shiftpos(3), 2) def test_shiftpos_del(self): """ @@ -80,9 +76,9 @@ class TestMutator(unittest.TestCase): m = self._mutator(_seq(l)) m.delM(d, d) for p in range(1, d + 1): - self.assertEqual(m.shiftpos(p), p) + assert_equal(m.shiftpos(p), p) for p in range(d + 1, l + 1): - self.assertEqual(m.shiftpos(p), p - 1) + assert_equal(m.shiftpos(p), p - 1) def test_shiftpos_del2(self): """ @@ -93,9 +89,9 @@ class TestMutator(unittest.TestCase): m = self._mutator(_seq(l)) m.delM(d, d + 1) for p in range(1, d + 2): - self.assertEqual(m.shiftpos(p), p) + assert_equal(m.shiftpos(p), p) for p in range(d + 2, l + 1): - self.assertEqual(m.shiftpos(p), p - 2) + assert_equal(m.shiftpos(p), p - 2) def test_shiftpos_ins_example(self): """ @@ -103,9 +99,9 @@ class TestMutator(unittest.TestCase): """ m = self._mutator(Seq('ATCGATCG')) m.insM(2, 'A') - self.assertEqual(m.shiftpos(1), 1) - self.assertEqual(m.shiftpos(2), 2) - self.assertEqual(m.shiftpos(3), 4) + assert_equal(m.shiftpos(1), 1) + assert_equal(m.shiftpos(2), 2) + 
assert_equal(m.shiftpos(3), 4) def test_shiftpos_ins(self): """ @@ -116,9 +112,9 @@ class TestMutator(unittest.TestCase): m = self._mutator(_seq(l)) m.insM(i, 'T') for p in range(1, i + 1): - self.assertEqual(m.shiftpos(p), p) + assert_equal(m.shiftpos(p), p) for p in range(i + 1, l + 1): - self.assertEqual(m.shiftpos(p), p + 1) + assert_equal(m.shiftpos(p), p + 1) def test_shiftpos_ins2(self): """ @@ -129,9 +125,9 @@ class TestMutator(unittest.TestCase): m = self._mutator(_seq(l)) m.insM(i, 'TT') for p in range(1, i + 1): - self.assertEqual(m.shiftpos(p), p) + assert_equal(m.shiftpos(p), p) for p in range(i + 1, l + 1): - self.assertEqual(m.shiftpos(p), p + 2) + assert_equal(m.shiftpos(p), p + 2) def test_newSplice_no_change(self): """ @@ -147,7 +143,7 @@ class TestMutator(unittest.TestCase): l = 30 sites = [4, 9, 14, 19, 25, 27] m = self._mutator(_seq(l)) - self.assertEqual(m.newSplice(sites), sites) + assert_equal(m.newSplice(sites), sites) def test_newSplice_acc_del_before(self): """ @@ -159,7 +155,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(13, 13) # g.13del - self.assertEqual(m.newSplice(sites), [4, 9, 13, 16, 24, 26]) + assert_equal(m.newSplice(sites), [4, 9, 13, 16, 24, 26]) def test_newSplice_acc_del_after(self): """ @@ -169,7 +165,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(14, 14) # g.14del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 16, 24, 26]) + assert_equal(m.newSplice(sites), [4, 9, 14, 16, 24, 26]) def test_newSplice_don_del_before(self): """ @@ -179,7 +175,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(17, 17) # g.17del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 16, 24, 26]) + assert_equal(m.newSplice(sites), [4, 9, 14, 16, 24, 26]) def test_newSplice_don_del_after(self): """ @@ -191,7 +187,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 
14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(18, 18) # g.18del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 24, 26]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 24, 26]) def test_newSplice_acc_del2_before(self): """ @@ -203,20 +199,20 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(12, 13) # g.12_13del - self.assertEqual(m.newSplice(sites), [4, 9, 12, 15, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 12, 15, 23, 25]) + @skip def test_newSplice_acc_del2_on(self): """ Deletion of 2 in intron/exon. @note: This hits a splice site, so we don't really support it. """ - return # Disabled (see docstring) l = 30 sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(13, 14) # g.13_14del - self.assertEqual(m.newSplice(sites), [4, 9, 13, 15, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 13, 15, 23, 25]) def test_newSplice_acc_del2_after(self): """ @@ -226,7 +222,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(14, 15) # g.14_15del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 15, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 14, 15, 23, 25]) def test_newSplice_don_del2_before(self): """ @@ -236,20 +232,20 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(16, 17) # g.16_17del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 15, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 14, 15, 23, 25]) + @skip def test_newSplice_don_del2_on(self): """ Deletion of 2 in exon/intron. @note: This hits a splice site, so we don't really support it. 
""" - return # Disabled (see docstring) l = 30 sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(17, 18) # g.17_18del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 16, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 14, 16, 23, 25]) def test_newSplice_don_del2_after(self): """ @@ -261,7 +257,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.delM(18, 19) # g.18_19del - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 23, 25]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 23, 25]) def test_newSplice_acc_ins_before(self): """ @@ -273,7 +269,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(12, 'A') # g.12_13insA - self.assertEqual(m.newSplice(sites), [4, 9, 15, 18, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 15, 18, 26, 28]) def test_newSplice_acc_ins_on(self): """ @@ -283,7 +279,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(13, 'A') # g.13_14insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) def test_newSplice_first_acc_ins_on(self): """ @@ -293,7 +289,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(3, 'A') # g.3_4insA - self.assertEqual(m.newSplice(sites), [5, 10, 15, 18, 26, 28]) + assert_equal(m.newSplice(sites), [5, 10, 15, 18, 26, 28]) def test_newSplice_acc_ins_after(self): """ @@ -303,7 +299,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(14, 'A') # g.14_15insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) def test_newSplice_don_ins_before(self): """ @@ -313,7 +309,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(16, 'A') # 
g.16_17insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) def test_newSplice_don_ins_on(self): """ @@ -323,7 +319,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(17, 'A') # g.17_18insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 14, 18, 26, 28]) def test_newSplice_last_don_ins_on(self): """ @@ -333,7 +329,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(27, 'A') # g.27_28insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) def test_newSplice_don_ins_after(self): """ @@ -345,7 +341,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(18, 'A') # g.18_19insA - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 26, 28]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 26, 28]) def test_newSplice_acc_ins2_before(self): """ @@ -357,7 +353,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(12, 'AT') # g.12_13insAT - self.assertEqual(m.newSplice(sites), [4, 9, 16, 19, 27, 29]) + assert_equal(m.newSplice(sites), [4, 9, 16, 19, 27, 29]) def test_newSplice_first_acc_ins2_on(self): """ @@ -367,7 +363,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(3, 'AT') # g.3_4insAT - self.assertEqual(m.newSplice(sites), [6, 11, 16, 19, 27, 29]) + assert_equal(m.newSplice(sites), [6, 11, 16, 19, 27, 29]) def test_newSplice_acc_ins2_after(self): """ @@ -377,7 +373,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(14, 'AT') # g.14_15insAT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 19, 27, 29]) + assert_equal(m.newSplice(sites), [4, 9, 
14, 19, 27, 29]) def test_newSplice_don_ins2_before(self): """ @@ -387,7 +383,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(16, 'AT') # g.16_17insAT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 19, 27, 29]) + assert_equal(m.newSplice(sites), [4, 9, 14, 19, 27, 29]) def test_newSplice_last_don_ins2_on(self): """ @@ -397,7 +393,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(27, 'AT') # g.27_28insAT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) def test_newSplice_don_ins2_after(self): """ @@ -409,7 +405,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(18, 'AT') # g.18_19insAT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 27, 29]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 27, 29]) def test_newSplice_acc_ins3_before(self): """ @@ -421,7 +417,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(12, 'ATT') # g.12_13insATT - self.assertEqual(m.newSplice(sites), [4, 9, 17, 20, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 17, 20, 28, 30]) def test_newSplice_acc_ins3_on(self): """ @@ -431,7 +427,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(13, 'ATT') # g.13_14insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) def test_newSplice_first_acc_ins3_on(self): """ @@ -441,7 +437,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(3, 'ATT') # g.3_4insATT - self.assertEqual(m.newSplice(sites), [7, 12, 17, 20, 28, 30]) + assert_equal(m.newSplice(sites), [7, 12, 17, 20, 28, 30]) def test_newSplice_acc_ins3_after(self): """ @@ -451,7 +447,7 @@ class 
TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(14, 'ATT') # g.14_15insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) def test_newSplice_don_ins3_before(self): """ @@ -461,7 +457,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(16, 'ATT') # g.16_17insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) def test_newSplice_don_ins3_on(self): """ @@ -471,7 +467,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(17, 'ATT') # g.17_18insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 14, 20, 28, 30]) def test_newSplice_last_don_ins3_on(self): """ @@ -481,7 +477,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(27, 'ATT') # g.27_28insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 25, 27]) def test_newSplice_don_ins3_after(self): """ @@ -493,7 +489,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 14, 17, 25, 27] m = self._mutator(_seq(l)) m.insM(18, 'ATT') # g.18_19insATT - self.assertEqual(m.newSplice(sites), [4, 9, 14, 17, 28, 30]) + assert_equal(m.newSplice(sites), [4, 9, 14, 17, 28, 30]) def test_newSplice_adj_del_before1(self): """ @@ -510,7 +506,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(16, 16) # g.16del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 16, 17, 26]) + assert_equal(m.newSplice(sites), [4, 9, 10, 16, 17, 26]) def test_newSplice_adj_del_before(self): """ @@ -520,7 +516,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(17, 17) 
# g.17del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 16, 17, 26]) + assert_equal(m.newSplice(sites), [4, 9, 10, 16, 17, 26]) def test_newSplice_adj_del_after(self): """ @@ -530,7 +526,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(18, 18) # g.18del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 26]) + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 26]) def test_newSplice_adj_del_after1(self): """ @@ -540,7 +536,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(19, 19) # g.19del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 26]) + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 26]) def test_newSplice_adj_ins_before(self): """ @@ -550,7 +546,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(16, 'A') # g.16_17insA - self.assertEqual(m.newSplice(sites), [4, 9, 10, 18, 19, 28]) + assert_equal(m.newSplice(sites), [4, 9, 10, 18, 19, 28]) def test_newSplice_adj_ins_on(self): """ @@ -566,7 +562,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(17, 'A') # g.17_18insA - self.assertEqual(m.newSplice(sites), [4, 9, 10, 18, 19, 28]) + assert_equal(m.newSplice(sites), [4, 9, 10, 18, 19, 28]) def test_newSplice_adj_ins_after(self): """ @@ -576,7 +572,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(18, 'A') # g.18_19insA - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 28]) + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 28]) def test_newSplice_adj_del2_before1(self): """ @@ -586,7 +582,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(15, 16) # g.15_16del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 15, 16, 25]) + assert_equal(m.newSplice(sites), [4, 9, 10, 15, 16, 25]) def 
test_newSplice_adj_del2_before(self): """ @@ -596,8 +592,9 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(16, 17) # g.16_17del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 15, 16, 25]) + assert_equal(m.newSplice(sites), [4, 9, 10, 15, 16, 25]) + @skip def test_newSplice_adj_del2_on(self): """ Adjacent exons: deletion of 2 at exon/exon boundary. @@ -605,12 +602,11 @@ class TestMutator(unittest.TestCase): @todo: This is a special case of bug #????. Once fixed, the two exons will be joined to one new exon. """ - return # Disabled (see docstring) l = 30 sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(17, 18) # g.17_18del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 16, 17, 25]) + assert_equal(m.newSplice(sites), [4, 9, 10, 16, 17, 25]) def test_newSplice_adj_del2_after(self): """ @@ -620,7 +616,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(18, 19) # g.18_19del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 25]) + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 25]) def test_newSplice_adj_del2_after1(self): """ @@ -630,7 +626,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.delM(19, 20) # g.19_20del - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 25]) + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 25]) def test_newSplice_adj_ins2_before(self): """ @@ -640,7 +636,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(16, 'AT') # g.16_17insAT - self.assertEqual(m.newSplice(sites), [4, 9, 10, 19, 20, 29]) + assert_equal(m.newSplice(sites), [4, 9, 10, 19, 20, 29]) def test_newSplice_adj_ins2_on(self): """ @@ -656,7 +652,7 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(17, 'AT') # g.17_18insAT - self.assertEqual(m.newSplice(sites), [4, 
9, 10, 19, 20, 29]) + assert_equal(m.newSplice(sites), [4, 9, 10, 19, 20, 29]) def test_newSplice_adj_ins2_after(self): """ @@ -666,12 +662,4 @@ class TestMutator(unittest.TestCase): sites = [4, 9, 10, 17, 18, 27] m = self._mutator(_seq(l)) m.insM(18, 'AT') # g.18_19insAT - self.assertEqual(m.newSplice(sites), [4, 9, 10, 17, 18, 29]) - - -if __name__ == '__main__': - # Usage: - # ./test_mutator.py -v - # Or, selecting a specific test: - # ./test_mutator.py -v TestMutator.test_mutated - unittest.main() + assert_equal(m.newSplice(sites), [4, 9, 10, 17, 18, 29]) diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py new file mode 100644 index 0000000000000000000000000000000000000000..74bbae733b93b4caadcb097559de70ffa3b211fc --- /dev/null +++ b/tests/test_variantchecker.py @@ -0,0 +1,427 @@ +""" +Tests for the variantchecker module. +""" + + +#import logging; logging.basicConfig() +from nose.tools import * + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.variantchecker import check_variant + + +class TestVariantchecker(): + """ + Test the variantchecker module. + """ + def setUp(self): + """ + Initialize test variantchecker module. + """ + self.config = Config() + self.output = Output(__file__, self.config.Output) + + def test_deletion_in_frame(self): + """ + Simple in-frame deletion should give a simple description on protein + level. + """ + check_variant('AL449423.14(CDKN2A_v001):c.161_163del', + self.config, self.output) + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'AL449423.14:g.61937_61939del') + assert 'AL449423.14(CDKN2A_v001):c.161_163del' \ + in self.output.getOutput('descriptions') + assert 'AL449423.14(CDKN2A_i001):p.(Met54_Gly55delinsSer)' \ + in self.output.getOutput('protDescriptions') + assert self.output.getOutput('newprotein') + + def test_insertion_in_frame(self): + """ + Simple in-frame insertion should give a simple description on protein + level. 
+ """ + check_variant('AL449423.14(CDKN2A_v001):c.161_162insATC', + self.config, self.output) + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'AL449423.14:g.61938_61939insGAT') + assert 'AL449423.14(CDKN2A_v001):c.161_162insATC' \ + in self.output.getOutput('descriptions') + assert 'AL449423.14(CDKN2A_i001):p.(Met54delinsIleSer)' \ + in self.output.getOutput('protDescriptions') + assert self.output.getOutput('newprotein') + + def test_deletion_insertion_in_frame(self): + """ + Simple in-frame deletion/insertion should give a simple description on + protein level. + """ + check_variant('AL449423.14(CDKN2A_v001):c.161_162delinsATCCC', + self.config, self.output) + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'AL449423.14:g.61938_61939delinsGGGAT') + assert 'AL449423.14(CDKN2A_v001):c.161_162delinsATCCC' \ + in self.output.getOutput('descriptions') + assert 'AL449423.14(CDKN2A_i001):p.(Met54delinsAsnPro)' \ + in self.output.getOutput('protDescriptions') + assert self.output.getOutput('newprotein') + + def test_deletion_insertion_in_frame_complete(self): + """ + Simple in-frame deletion/insertion should give a simple description on + protein level, also with the optional deleted sequence argument. + """ + check_variant('AL449423.14(CDKN2A_v001):c.161_162delTGinsATCCC', + self.config, self.output) + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'AL449423.14:g.61938_61939delinsGGGAT') + assert 'AL449423.14(CDKN2A_v001):c.161_162delinsATCCC' \ + in self.output.getOutput('descriptions') + assert 'AL449423.14(CDKN2A_i001):p.(Met54delinsAsnPro)' \ + in self.output.getOutput('protDescriptions') + assert self.output.getOutput('newprotein') + + def test_roll(self): + """ + Just a variant where we should roll. 
+ """ + check_variant('NM_003002.2:c.273del', self.config, self.output) + wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD') + assert len(wroll) > 0 + + def test_no_roll(self): + """ + Just a variant where we cannot roll. + """ + check_variant('NM_003002.2:c.274del', self.config, self.output) + wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD') + assert_equal(len(wroll), 0) + + def test_no_roll_splice(self): + """ + Here we can roll but should not, because it is over a splice site. + """ + check_variant('NM_000088.3:g.459del', self.config, self.output) + wrollback = self.output.getMessagesWithErrorCode('IROLLBACK') + assert len(wrollback) > 0 + wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD') + assert_equal(len(wroll), 0) + + def test_partial_roll_splice(self): + """ + Here we can roll two positions, but should roll only one because + otherwise it is over a splice site. + """ + check_variant('NM_000088.3:g.494del', self.config, self.output) + wrollback = self.output.getMessagesWithErrorCode('IROLLBACK') + assert len(wrollback) > 0 + wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD') + assert len(wroll) > 0 + + def test_roll_after_splice(self): + """ + Here we can roll and should, we stay in the same exon. + """ + check_variant('NM_000088.3:g.460del', self.config, self.output) + wroll = self.output.getMessagesWithErrorCode('WROLLFORWARD') + assert len(wroll) > 0 + + def test_roll_both_ins(self): + """ + Insertion that rolls should not use the same inserted sequence in + descriptions on forward and reverse strands. 
+ + Here we have the following situation on the forward strand: + + 65470 (genomic) + | + CGGTGCGTTGGGCAGCGCCCCCGCCTCCAGCAGCGCCCGCACCTCCTCTA + + Now, an insertion of TAC after 65470 should be rolled to an insertion + of ACT after 65471: + + CGGTGCGTTGGGCAGCGCCCCCGCC --- TCCAGCAGCGCCCGCACCTCCTCTA + CGGTGCGTTGGGCAGCGCCCCCGCC TAC TCCAGCAGCGCCCGCACCTCCTCTA => + + CGGTGCGTTGGGCAGCGCCCCCGCCT --- CCAGCAGCGCCCGCACCTCCTCTA + CGGTGCGTTGGGCAGCGCCCCCGCCT ACT CCAGCAGCGCCCGCACCTCCTCTA + + However, in CDKN2A_v001 (on the reverse strand), this insertion should + roll the other direction and the inserted sequence should be the reverse + complement of CTA, which is TAG, and not that of ACT, which is AGT. + + The next test (test_roll_reverse_ins) tests the situation for an input + of AL449423.14:g.65471_65472insACT, where only the reverse roll should + be done. + """ + check_variant('AL449423.14:g.65470_65471insTAC', self.config, self.output) + assert 'AL449423.14(CDKN2A_v001):c.99_100insTAG' in self.output.getOutput('descriptions') + assert_equal ('AL449423.14:g.65471_65472insACT', self.output.getIndexedOutput('genomicDescription', 0, '')) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 1) + + def test_roll_reverse_ins(self): + """ + Insertion that rolls on the reverse strand should not use the same + inserted sequence in descriptions on forward and reverse strands. + """ + check_variant('AL449423.14:g.65471_65472insACT', self.config, self.output) + assert 'AL449423.14(CDKN2A_v001):c.99_100insTAG' in self.output.getOutput('descriptions') + assert_equal ('AL449423.14:g.65471_65472insACT', self.output.getIndexedOutput('genomicDescription', 0, '')) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 0) + + def test_roll_message_forward(self): + """ + Roll warning message should only be shown for currently selected + strand (forward). 
+ """ + check_variant('AL449423.14:g.65470_65471insTAC', self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 1) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLREVERSE')), 0) + + def test_roll_message_reverse(self): + """ + Roll warning message should only be shown for currently selected + strand (reverse). + """ + check_variant('AL449423.14(CDKN2A_v001):c.98_99insGTA', self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLFORWARD')), 0) + assert_equal(len(self.output.getMessagesWithErrorCode('WROLLREVERSE')), 1) + + def test_ins_cds_start(self): + """ + Insertion on CDS start boundary should not be included in CDS. + """ + check_variant('NM_000143.3:c.-1_1insCAT', self.config, self.output) + assert_equal(self.output.getIndexedOutput("newprotein", 0), None) + # Todo: Is this a good test? + + def test_ins_cds_start_after(self): + """ + Insertion after CDS start boundary should be included in CDS. + """ + check_variant('NM_000143.3:c.1_2insCAT', self.config, self.output) + assert_equal(self.output.getIndexedOutput("newprotein", 0), '?') + # Todo: Is this a good test? + + def test_del_splice_site(self): + """ + Deletion hitting one splice site should not do a protein prediction. + """ + check_variant('NG_012772.1(BRCA2_v001):c.632-5_670del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), []) + # Todo: For now, the following is how to check if no protein + # prediction is done. + assert not self.output.getOutput('newprotein') + + def test_del_exon(self): + """ + Deletion of an entire exon should be possible. 
+ """ + check_variant('NG_012772.1(BRCA2_v001):c.632-5_681+7del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + + def test_del_exon_exact(self): + """ + Deletion of exactly an exon should be possible. + """ + check_variant('NG_012772.1(BRCA2_v001):c.632_681del', + self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('WOVERSPLICE')), 0) + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + + def test_del_exon_in_frame(self): + """ + Deletion of an entire exon with length a triplicate should give a + proteine product with just this deletion (and possibly substitutions + directly before and after). + + NG_012772.1(BRCA2_v001):c.68-7_316+7del is such a variant, since + positions 68 through 316 are exactly one exon and (316-68+1)/3 = 83. + """ + check_variant('NG_012772.1(BRCA2_v001):c.68-7_316+7del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Todo: assert that protein products indeed have only this difference. + + def test_del_exons(self): + """ + Deletion of two entire exons should be possible. + """ + check_variant('NG_012772.1(BRCA2_v001):c.632-5_793+7del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), [4]) + # Todo: For now, the following is how to check if protein + # prediction is done. 
+ assert self.output.getOutput('newprotein') + + def test_del_intron(self): + """ + Deletion of an entire intron should be possible (fusion of remaining + exonic parts). + """ + check_variant('NG_012772.1(BRCA2_v001):c.622_674del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + + def test_del_intron_exact(self): + """ + Deletion of exactly an intron should be possible (fusion of flanking + exons). + """ + check_variant('NG_012772.1(BRCA2_v001):c.681+1_682-1del', + self.config, self.output) + assert_equal(self.output.getMessagesWithErrorCode('WOVERSPLICE'), []) + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Note: The protein prediction is done, but 'newprotein' is not set + # because we have no change. So to check if the prediction is done, we + # check if 'oldprotein' is set and to check if the prediction is + # correct, we check if 'newprotein' is not set. + assert self.output.getOutput('oldprotein') + assert not self.output.getOutput('newprotein') + + def test_del_intron_in_frame(self): + """ + Deletion of an entire intron should be possible (fusion of remaining + exonic parts). + """ + check_variant('NG_012772.1(BRCA2_v001):c.622_672del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Todo: assert that protein products indeed have only this difference. + + def test_del_exon_unknown_offsets(self): + """ + Deletion of an entire exon with unknown offsets should be possible. 
+ """ + check_variant('NG_012772.1(BRCA2_v001):c.632-?_681+?del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0 + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Genomic positions should be centered in flanking introns and unsure. + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'NG_012772.1:g.(17550_19725)del') + assert 'NG_012772.1(BRCA2_v001):c.632-?_681+?del' \ + in self.output.getOutput('descriptions') + assert 'NG_012772.1(BRCA2_i001):p.(Val211Glufs*10)' \ + in self.output.getOutput('protDescriptions') + # Todo: .c notation should still be c.632-?_681+?del, but what about + # other transcripts? + + def test_del_exon_unknown_offsets_in_frame(self): + """ + Deletion of an entire exon with unknown offsets and length a + triplicate should give a proteine product with just this deletion + (and possibly substitutions directly before and after). + + NG_012772.1(BRCA2_v001):c.68-?_316+?del is such a variant, since + positions 68 through 316 are exactly one exon and (316-68+1)/3 = 83. + """ + check_variant('NG_012772.1(BRCA2_v001):c.68-?_316+?del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0 + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Genomic positions should be centered in flanking introns and unsure. + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'NG_012772.1:g.(7324_11720)del') + assert 'NG_012772.1(BRCA2_v001):c.68-?_316+?del' \ + in self.output.getOutput('descriptions') + # Todo: .c notation should still be c.632-?_681+?del, but what about + # other transcripts? 
+ + def test_del_exon_unknown_offsets_composed(self): + """ + Deletion of an entire exon with unknown offsets and another composed + variant with exact positioning should be possible. + """ + check_variant('NG_012772.1(BRCA2_v001):c.[632-?_681+?del;681+4del]', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0 + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Genomic positions should be centered in flanking introns and unsure. + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'NG_012772.1:g.[(17550_19725)del;19017del]') + assert 'NG_012772.1(BRCA2_v001):c.[632-?_681+?del;681+4del]' \ + in self.output.getOutput('descriptions') + # Todo: .c notation should still be c.632-?_681+?del, but what about + # other transcripts? + + def test_del_exon_unknown_offsets_reverse(self): + """ + Deletion of an entire exon with unknown offsets should be possible, + also on the reverse strand. + """ + check_variant('AL449423.14(CDKN2A_v001):c.151-?_457+?del', + self.config, self.output) + assert len(self.output.getMessagesWithErrorCode('WOVERSPLICE')) > 0 + assert len(self.output.getMessagesWithErrorCode('IDELSPLICE')) > 0 + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + # Genomic positions should be centered in flanking introns and unsure. + assert_equal(self.output.getIndexedOutput('genomicDescription', 0), + 'AL449423.14:g.(60314_63683)del') + assert 'AL449423.14(CDKN2A_v001):c.151-?_457+?del' \ + in self.output.getOutput('descriptions') + # Todo: .c notation should still be c.632-?_681+?del, but what about + # other transcripts? 
+ + def test_del_exon_transcript_reference(self): + """ + Deletion of entire exon on a transcript reference should remove the + expected splice sites (only that of the deleted exon), and not those + of the flanking exons (as would happen using the mechanism for genomic + references). + """ + check_variant('NM_018723.3:c.758_890del', self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('WOVERSPLICE')), 0) + assert_equal(self.output.getOutput('removedSpliceSites'), [2]) + # Todo: For now, the following is how to check if protein + # prediction is done. + assert self.output.getOutput('newprotein') + + def test_ins_range(self): + """ + Insertion of a range is not implemented yet. + """ + check_variant('AB026906.1:c.274_275ins262_268', self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')), 1) + + def test_delins_range(self): + """ + Deletion/insertion of a range is not implemented yet. + """ + check_variant('AB026906.1:c.274delins262_268', self.config, self.output) + assert_equal(len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')), 1) diff --git a/tests/test_webservice.py b/tests/test_webservice.py new file mode 100644 index 0000000000000000000000000000000000000000..d470168056906ad52ddb6e0a72165a988d3c354f --- /dev/null +++ b/tests/test_webservice.py @@ -0,0 +1,245 @@ +""" +Tests for the SOAP interface to Mutalyzer. +""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import os +from datetime import datetime, timedelta +import mutalyzer +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.sync import CacheSync +from mutalyzer import Db +import logging +import urllib2 +from suds.client import Client +from suds import WebFault +from nose.tools import * + + +# Suds logs an awful lot of things with level=DEBUG, including entire WSDL +# files and SOAP responses. 
On any error, this is all dumped to the console, +# which is very inconvenient. The following suppresses most of this. +logging.raiseExceptions = 0 +logging.basicConfig(level=logging.INFO) +for logger in ('suds.metrics', 'suds.wsdl', 'suds.xsd.schema', + 'suds.xsd.sxbasic', 'suds.xsd.sxbase', 'suds.xsd.query', + 'suds.transport.http', 'suds.xsd.deplist', 'suds.mx.core', + 'suds.mx.literal', 'suds.resolver'): + logging.getLogger(logger).setLevel(logging.ERROR) + + +WSDL_URL = 'http://localhost/mutalyzer/services/?wsdl' + + +class TestWSDL(): + """ + Test the Mutalyzer SOAP interface WSDL description. + """ + def test_wsdl(self): + """ + Test if the WSDL is available and looks somewhat sensible. + """ + wsdl = urllib2.urlopen(WSDL_URL).read() + assert wsdl.startswith("<?xml version='1.0' encoding='UTF-8'?>") + assert 'name="Mutalyzer"' in wsdl + + +class TestWebservice(): + """ + Test the Mutalyzer SOAP interface. + """ + + def setUp(self): + """ + Initialize webservice entrypoint. + + @todo: Start the standalone server and stop it in self.tearDown + instead of depending on some running instance at a fixed address. + """ + self.client = Client(WSDL_URL) #, cache=None) + self.client.options.cache.setduration(seconds=120) + + def test_checksyntax_valid(self): + """ + Running checkSyntax with a valid variant name should return True. + """ + r = self.client.service.checkSyntax('AB026906.1:c.274G>T') + assert_equal(r.valid, True) + + def test_checksyntax_invalid(self): + """ + Running checkSyntax with an invalid variant name should return False + and give at least one error message. + """ + r = self.client.service.checkSyntax('0:abcd') + assert_equal(r.valid, False) + assert len(r.messages.SoapMessage) >= 1 + + @raises(WebFault) + def test_checksyntax_empty(self): + """ + Running checkSyntax with no variant name should raise exception.
+ """ + self.client.service.checkSyntax() + + def test_transcriptinfo_valid(self): + """ + Running transcriptInfo with valid arguments should get us a Transcript + object. + """ + r = self.client.service.transcriptInfo(LOVD_ver='123', build='hg19', + accNo='NM_002001.2') + assert_equal(r.trans_start, -99) + assert_equal(r.trans_stop, 1066) + assert_equal(r.CDS_stop, 774) + + def test_numberconversion_gtoc_valid(self): + """ + Running numberConversion with valid g variant should give a list of + c variant names. + """ + r = self.client.service.numberConversion(build='hg19', + variant='NC_000001.10:g.159272155del') + assert_equal(type(r.string), list) + assert 'NM_002001.2:c.1del' in r.string + + def test_numberconversion_ctog_valid(self): + """ + Running numberConversion with valid c variant should give a list of + g variant names. + """ + r = self.client.service.numberConversion(build='hg19', + variant='NM_002001.2:c.1del') + assert_equal(type(r.string), list) + assert 'NC_000001.10:g.159272155del' in r.string + + def test_gettranscriptsbygenename_valid(self): + """ + Running getTranscriptsByGeneName with valid gene name should give a + list of transcripts. + """ + r = self.client.service.getTranscriptsByGeneName(build='hg19', + name='DMD') + assert_equal(type(r.string), list) + for t in ['NM_004006.2', + 'NM_000109.3', + 'NM_004021.2', + 'NM_004009.3', + 'NM_004007.2', + 'NM_004018.2', + 'NM_004022.2']: + assert t in r.string + + def test_gettranscriptsandinfo_valid(self): + """ + Running getTranscriptsAndInfo with a valid genomic reference should + give a list of TranscriptInfo objects. 
+ """ + r = self.client.service.getTranscriptsAndInfo('AL449423.14') + assert_equal(type(r.TranscriptInfo), list) + names = [t.name for t in r.TranscriptInfo] + for t in ['CDKN2B_v002', + 'CDKN2B_v001', + 'MTAP_v005', + 'CDKN2A_v008', + 'CDKN2A_v007', + 'C9orf53_v001', + 'CDKN2A_v001']: + assert t in names + + def test_gettranscriptsandinfo_restricted_valid(self): + """ + Running getTranscriptsAndInfo with a valid genomic reference and a + gene name should give a list of TranscriptInfo objects restricted + to the gene. + """ + r = self.client.service.getTranscriptsAndInfo('AL449423.14', 'CDKN2A') + assert_equal(type(r.TranscriptInfo), list) + names = [t.name for t in r.TranscriptInfo] + for t in ['CDKN2A_v008', + 'CDKN2A_v007']: + assert t in names + for t in ['CDKN2B_v002', + 'CDKN2B_v001', + 'MTAP_v005', + 'C9orf53_v001']: + assert_false(t in names) + + def test_info(self): + """ + Running the info method should give us some version information. + """ + r = self.client.service.info() + assert_equal(type(r.versionParts.string), list) + assert_equal(r.version, mutalyzer.__version__) + + def test_getcache(self): + """ + Running the getCache method should give us the expected number of + cache entries. + """ + created_since = datetime.today() - timedelta(days=14) + + config = Config() + database = Db.Cache(config.Db) + output = Output(__file__, config.Output) + sync = CacheSync(config.Retriever, output, database) + cache = sync.local_cache(created_since) + + r = self.client.service.getCache(created_since) + if len(cache) > 0: + assert_equal(len(r.CacheEntry), len(cache)) + + def test_getdbsnpdescriptions(self): + """ + Running getdbSNPDescriptions method should give us the expected HGVS + descriptions for the given dbSNP id. 
+ """ + r = self.client.service.getdbSNPDescriptions('rs9919552') + assert 'NC_000011.9:g.111959625C>T' in r.string + assert 'NG_012337.1:g.7055C>T' in r.string + assert 'NM_003002.2:c.204C>T' in r.string + assert 'NP_002993.1:p.Ser68=' in r.string + + def test_gettranscripts(self): + """ + Running getTranscriptsByGeneName should give a list of transcripts. + """ + r = self.client.service.getTranscripts(build='hg19', chrom='chrX', + pos=32237295) + assert_equal(type(r.string), list) + for t in ['NM_000109', + 'NM_004006', + 'NM_004007', + 'NM_004009', + 'NM_004010', + 'NM_004011', + 'NM_004012']: + assert t in r.string + + def test_gettranscripts_with_versions(self): + """ + Running getTranscriptsByGeneName with versions=True should give a list + of transcripts with version numbers. + """ + r = self.client.service.getTranscripts(build='hg19', chrom='chrX', + pos=32237295, versions=True) + assert_equal(type(r.string), list) + for t in ['NM_000109.3', + 'NM_004006.2', + 'NM_004007.2', + 'NM_004009.3', + 'NM_004010.3', + 'NM_004011.3', + 'NM_004012.3']: + assert t in r.string + + def test_ping(self): + """ + Running the ping method should return 'pong'. + """ + r = self.client.service.ping() + assert_equal(r, 'pong') diff --git a/src/tests/test_wsgi.py b/tests/test_website.py old mode 100755 new mode 100644 similarity index 66% rename from src/tests/test_wsgi.py rename to tests/test_website.py index b9648546930ce49057ba0edad929435a05c3d2ca..61a7206ba4c2565311161a0f544d720c7e79cbcc --- a/src/tests/test_wsgi.py +++ b/tests/test_website.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - """ Tests for the WSGI interface to Mutalyzer. @@ -12,18 +10,35 @@ I just installed webtest by 'easy_install webtest'. @todo: Tests for /upload. 
""" + #import logging; logging.basicConfig() +import os import re +import urllib2 import time -import unittest +import web +from nose.tools import * from webtest import TestApp +import logging + + +import mutalyzer +from mutalyzer import website +from mutalyzer.util import slow + -# Todo: Can this be done in a more elegant way? -import site -site.addsitedir('..') -from wsgi import application +# TAL logs an awful lot of things with level=DEBUG. On any error, this is all +# dumped to the console, which is very unconvenient. The following suppresses +# most of this. +logging.raiseExceptions = 0 +logging.basicConfig(level=logging.INFO) +logging.getLogger('simpleTAL.HTMLTemplateCompiler').setLevel(logging.ERROR) -class TestWSGI(unittest.TestCase): + +BATCH_RESULT_URL = 'http://localhost/mutalyzer/Results_{id}.txt' + + +class TestWSGI(): """ Test the Mutalyzer WSGI interface. """ @@ -32,19 +47,54 @@ class TestWSGI(unittest.TestCase): """ Initialize test application. """ + web.config.debug = False + application = website.app.wsgifunc() self.app = TestApp(application) + def test_root(self): + """ + Expect the index HTML page. + """ + r = self.app.get('') + assert_equal(r.status, '301 Moved Permanently') + assert r.location.endswith('/') + r = r.follow() + assert_equal(r.status, '200 OK') + # We check for <html> to make sure the menu template is included + r.mustcontain('<html>', + 'Welcome to the Mutalyzer web site', + '</html>') + def test_index(self): """ Expect the index HTML page. """ r = self.app.get('/') - self.assertEqual(r.status, '200 OK') + assert_equal(r.status, '200 OK') # We check for <html> to make sure the menu template is included r.mustcontain('<html>', 'Welcome to the Mutalyzer web site', '</html>') + def test_index_explicit(self): + """ + Expect the index HTML page. 
+ """ + r = self.app.get('/index') + assert_equal(r.status, '200 OK') + # We check for <html> to make sure the menu template is included + r.mustcontain('<html>', + 'Welcome to the Mutalyzer web site', + '</html>') + + def test_about(self): + """ + See if my name is on the About page ;) + """ + r = self.app.get('/about') + assert_equal(r.status, '200 OK') + r.mustcontain('Martijn Vermaat') + def test_non_existing(self): """ Expect a 404 response. @@ -131,8 +181,8 @@ class TestWSGI(unittest.TestCase): Should not include form and main layout HTML. """ r = self.app.get('/check?mutationName=NM_002001.2:g.1del') - self.assertFalse('<a href="#bottom" class="hornav">go to bottom</a>' in r) - self.assertFalse('<input value="NM_002001.2:g.1del" type="text" name="mutationName" style="width:100%">' in r) + assert_false('<a href="#bottom" class="hornav">go to bottom</a>' in r) + assert_false('<input value="NM_002001.2:g.1del" type="text" name="mutationName" style="width:100%">' in r) r.mustcontain('0 Errors', '0 Warnings', 'Raw variant 1: deletion of 1', @@ -145,8 +195,8 @@ class TestWSGI(unittest.TestCase): redirect to the name checker. """ r = self.app.get('/checkForward?mutationName=NM_002001.2:g.1del') - self.assertEqual(r.status, '303 See Other') - self.assertTrue(r.location.endswith('/check')) + assert_equal(r.status, '303 See Other') + assert r.location.endswith('/check') r = r.follow() r.mustcontain('0 Errors', '0 Warnings', @@ -164,9 +214,10 @@ class TestWSGI(unittest.TestCase): r = form.submit() r.mustcontain('0 Errors', '0 Warnings', + 'NC_000011.9:g.111959625C>T', 'NG_012337.1:g.7055C>T', 'NM_003002.2:c.204C>T', - 'NT_033899.8:g.15522041C>T') + 'NP_002993.1:p.Ser68=') def test_snp_converter_invalid(self): """ @@ -203,8 +254,9 @@ class TestWSGI(unittest.TestCase): r = form.submit() r.mustcontain('NM_003002.2:c.204C>T') + @slow def _batch(self, batch_type='NameChecker', arg1=None, file="", size=0, - header=''): + header='', lines=None): """ Submit a batch form. 
@@ -214,13 +266,25 @@ class TestWSGI(unittest.TestCase): @kwarg file: String with variants to use as input for the batch job. @kwarg size: Number of variants in input. @kwarg header: Message that must be found in the batch job result. + @kwarg lines: Number of result rows expected. + + @return: The batch result document. + @rtype: string + + @note: Since the batch files are processed by a running batch daemon + process, the result gets written to the directory defined by the + system-wide configuration (e.g. /var/mutalyzer/cache), thus + inaccessible for the TestApp instance under our current user. + The 'solution' for this is to download the results via a running + webserver that should be using the same configuration as the batch + daemon. Yes, this is a hack. """ r = self.app.get('/batch') form = r.forms[0] if arg1: form['arg1'] = arg1 form['batchType'] = batch_type - form['batchEmail'] = 'm.vermaat.hg@lumc.nl' + form['batchEmail'] = 'test@test.test' form.set('batchFile', ('test_%s.txt' % batch_type, file)) r = form.submit() @@ -228,16 +292,25 @@ class TestWSGI(unittest.TestCase): max_tries = 60 for i in range(max_tries): r = self.app.get('/progress?jobID=' + id + '&totalJobs=' + str(size) + '&ajax=1') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') #print '%s: %s' % (batch_type, r.body) if r.body == 'OK': break - self.assertTrue(re.match('[0-9]+', r.body)) + assert re.match('[0-9]+', r.body) time.sleep(2) - self.assertEqual(r.body, 'OK') - r = self.app.get('/Results_' + id + '.txt') - self.assertEqual(r.content_type, 'text/plain') - r.mustcontain(header) - self.assertTrue(len(r.body.strip().split('\n')) == size + 1) + assert_equal(r.body, 'OK') + # Actually, this only means the last entry was taken from the database + # queue. It might still be processing, in which case we miss some + # expected output. So let's wait a few seconds. + time.sleep(2) + # This is a hack to get to the batch results (see @note above). 
+ response = urllib2.urlopen(BATCH_RESULT_URL.format(id=id)) + assert_equal(response.info().getheader('Content-Type'), 'text/plain') + result = response.read() + assert header in result + if not lines: + lines = size + assert_equal(len(result.strip().split('\n')) - 1, lines) + return result def test_batch_namechecker(self): """ @@ -375,6 +448,7 @@ class TestWSGI(unittest.TestCase): size=len(variants)-1, header='Input\tStatus') + @slow def test_batch_syntaxchecker_toobig(self): """ Submit the batch syntax checker with a too big input file. @@ -399,14 +473,32 @@ facilisi.""" form.set('batchFile', ('test_batch_toobig.txt', file)) r = form.submit(status=413) - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') + + @slow + def test_batch_multicolumn(self): + """ + Submit the batch syntax checker with a multiple-column input file. + + This by the way also tests for the correct order of batch results. + """ + variants = [('AB026906.1(SDHD):g.7872G>T', 'NM_003002.1:c.3_4insG'), + ('NM_003002.1:c.3_4insG', 'AB026906.1(SDHD):g.7872G>T'), + ('AL449423.14(CDKN2A_v002):c.5_400del', 'AL449423.14(CDKN2A_v002):c.5_400del')] + result = self._batch('SyntaxChecker', + file='\n'.join(['\t'.join(r) for r in variants]), + size=len(variants) * 2, + header='Input\tStatus', + lines=len(variants)) + for line in result.splitlines()[1:]: + assert_equal(len(line.split('\t')), len(variants[0]) * 2) def test_download_py(self): """ Download a Python example client for the webservice. """ r = self.app.get('/download/client-suds.py') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') r.mustcontain('#!/usr/bin/env python') def test_download_rb(self): @@ -414,7 +506,7 @@ facilisi.""" Download a Ruby example client for the webservice.
""" r = self.app.get('/download/client-savon.rb') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') r.mustcontain('#!/usr/bin/env ruby') def test_download_cs(self): @@ -422,7 +514,7 @@ facilisi.""" Download a C# example client for the webservice. """ r = self.app.get('/download/client-mono.cs') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') r.mustcontain('public static void Main(String [] args) {') def test_download_php(self): @@ -430,7 +522,7 @@ facilisi.""" Download a PHP example client for the webservice. """ r = self.app.get('/download/client-php.php') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') r.mustcontain('<?php') def test_downloads_batchtest(self): @@ -438,24 +530,15 @@ facilisi.""" Download the batch test example file. """ r = self.app.get('/downloads/batchtestnew.txt') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') r.mustcontain('NM_003002.1:c.3_4insG') - def test_reference(self): - """ - Download a reference file. - """ - r = self.app.get('/Reference/AB026906.1.gb') - self.assertEqual(r.content_type, 'text/plain') - self.assertEqual(r.content_length, 26427) - r.mustcontain('ggaaaaagtc tctcaaaaaa cctgctttat') - def test_soap_documentation(self): """ Test the SOAP documentation generated from the WSDL. 
""" r = self.app.get('/documentation') - self.assertEqual(r.content_type, 'text/html') + assert_equal(r.content_type, 'text/html') r.mustcontain('Web Service: Mutalyzer') def test_getgs(self): @@ -465,27 +548,27 @@ facilisi.""" r = self.app.get('/getGS?variantRecord=NM_003002.2&forward=1&mutationName=NG_012337.1:g.7055C%3ET') r.mustcontain('0 Errors', '0 Warnings', - 'Raw variant 1: substitution at 7055', - '<a href="#bottom" class="hornav">go to bottom</a>', - '<input value="NG_012337.1(SDHD_v001):g.7055C>T" type="text" name="mutationName" style="width:100%">') + 'Raw variant 1: substitution at 7055') + assert_equal(r.body.find('go to bottom'), -1) + assert_equal(r.body.find('<input'), -1) def test_variantinfo_g2c(self): """ Test the /Variant_info interface used by LOVD2 (g to c). """ r = self.app.get('/Variant_info?LOVD_ver=2.0-29&build=hg19&acc=NM_203473.1&var=g.48374289_48374389del') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') expected = '\n'.join(['1020', '0', '1072', '48', '48374289', '48374389', 'del']) - self.assertEqual(r.body, expected) + assert_equal(r.body, expected) def test_variantinfo_c2g(self): """ Test the /Variant_info interface used by LOVD2 (c to g). """ r = self.app.get('/Variant_info?LOVD_ver=2.0-29&build=hg19&acc=NM_203473.1&var=c.1020_1072%2B48del') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') expected = '\n'.join(['1020', '0', '1072', '48', '48374289', '48374389', 'del']) - self.assertEqual(r.body, expected) + assert_equal(r.body, expected) def test_variantinfo_c2g_downstream(self): """ @@ -493,28 +576,92 @@ facilisi.""" notation to g). 
""" r = self.app.get('/Variant_info?LOVD_ver=2.0-29&build=hg19&acc=NM_203473.1&var=c.1709%2Bd187del') - self.assertEqual(r.content_type, 'text/plain') + assert_equal(r.content_type, 'text/plain') expected = '\n'.join(['1709', '187', '1709', '187', '48379389', '48379389', 'del']) - self.assertEqual(r.body, expected) + assert_equal(r.body, expected) + + def test_variantinfo_no_variant(self): + """ + Test the /Variant_info interface used by LOVD2 (without variant). + """ + r = self.app.get('/Variant_info?LOVD_ver=2.0-32&build=hg19&acc=NM_001083962.1') + assert_equal(r.content_type, 'text/plain') + expected = '\n'.join(['-612', '7720', '2016']) + assert_equal(r.body, expected) def test_upload_local_file(self): """ Test the genbank uploader. - @todo: Test if returned genomic reference can indeed be used now. + @todo: Use another genbank file to get a UD number and check that + we can then check variants using that UD number. + @todo: This genbank file location is bogus. The tests directory is not + included with the package installation. """ - test_genbank_file = 'src/tests/data/AB026906.1.gb' + test_genbank_file = os.path.join(os.path.split(mutalyzer.package_root())[0], 'tests/data/AB026906.1.gb') r = self.app.get('/upload') form = r.forms[0] form['invoermethode'] = 'file' form.set('bestandsveld', ('test_upload.gb', open(test_genbank_file, 'r').read())) r = form.submit() - r.mustcontain('Your reference sequence was uploaded successfully.') - -if __name__ == '__main__': - # Usage: - # ./test_wsgi.py -v - # Or, selecting a specific test: - # ./test_wsgi.py -v TestWSGI.test_getgs - unittest.main() + r.mustcontain('Your reference sequence was loaded successfully.') + + def test_upload_local_file_invalid(self): + """ + Test the genbank uploader with a non-genbank file. + + @note: We add the current time to the file contents to make sure it is + not recognized by its hash. 
+ """ + r = self.app.get('/upload') + form = r.forms[0] + form['invoermethode'] = 'file' + form.set('bestandsveld', ('test_upload.gb', + 'this is not a genbank file (%s)\n' % time.ctime())) + r = form.submit() + r.mustcontain('The file could not be parsed.') + print r.body + + def test_reference(self): + """ + Test if reference files are cached. + """ + r = self.app.get('/check') + form = r.forms[0] + form['mutationName'] = 'AB026906.1:c.274G>T' + r = form.submit() + r.mustcontain('0 Errors', + '1 Warning', + 'Raw variant 1: substitution at 7872', + '<a href="#bottom" class="hornav">go to bottom</a>', + '<input value="AB026906.1:c.274G>T" type="text" name="mutationName" style="width:100%">') + r = self.app.get('/Reference/AB026906.1.gb') + assert_equal(r.content_type, 'text/plain') + assert_equal(r.content_length, 26427) + r.mustcontain('ggaaaaagtc tctcaaaaaa cctgctttat') + + def test_reference_head(self): + """ + Test if reference files are cached, by issuing a HEAD request. + + Note: The WebTest module also checks that the response to a HEAD + request is empty, as it should be. + """ + r = self.app.get('/check') + form = r.forms[0] + form['mutationName'] = 'AB026906.1:c.274G>T' + r = form.submit() + r.mustcontain('0 Errors', + '1 Warning', + 'Raw variant 1: substitution at 7872', + '<a href="#bottom" class="hornav">go to bottom</a>', + '<input value="AB026906.1:c.274G>T" type="text" name="mutationName" style="width:100%">') + r = self.app.head('/Reference/AB026906.1.gb') + assert_equal(r.content_type, 'text/plain') + + def test_reference_head_none(self): + """ + Test if non-existing reference files gives a 404 on a HEAD request. + """ + r = self.app.head('/Reference/AB026906.78.gb', status=404)