From 13737ad94f287724ecc649ff5a57f2df4054fe09 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Fri, 2 Sep 2011 15:17:04 +0000 Subject: [PATCH] Convert 'map' table to 'Mapping' and use NCBI for updates. Convert the 'hg18.map' and 'hg19.map' tables are to 'hg18.Mapping' and 'hg19.Mapping'. These tables are now updated with mapping info from the NCBI instead of the UCSC. Also added database and configuration file migration scripts. extras/config.example: - Remove configuration values for UCSC updates. extras/post-install.sh: mutalyzer/Mapper.py: mutalyzer/Db.py: - Use 'Mapping' table instead of 'map' table. extras/cron.d/mutalyzer-mapping-update: mutalyzer/Mapper.py: mutalyzer/Db.py: - Get mapping updates from the NCBI. extras/migrations: extras/post-upgrade.sh: - Automated migration scripts. git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/refactor-mutalyzer-branch@342 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- INSTALL | 20 +- README | 11 +- bin/mutalyzer | 8 +- bin/mutalyzer-cache-sync | 8 +- bin/mutalyzer-mapping-update | 59 ++ bin/mutalyzer-ucsc-update | 54 -- extras/config.example | 6 - extras/cron.d/mutalyzer-mapping-update | 3 + extras/cron.d/mutalyzer-ucsc-update | 2 - .../001-db-gbinfo-add-created.migration | 69 +++ .../002-db-map-to-mapping.migration | 160 +++++ .../003-config-remove-ucsc.migration | 25 + .../004-cron-ucsc-to-ncbi.migration | 24 + extras/migrations/README | 18 + extras/post-install.sh | 167 ++--- extras/post-upgrade.sh | 6 + mutalyzer/Db.py | 574 ++++++------------ mutalyzer/Scheduler.py | 4 +- mutalyzer/config.py | 3 - mutalyzer/{Mapper.py => mapping.py} | 440 ++++++++++---- mutalyzer/util.py | 31 + mutalyzer/webservice.py | 8 +- mutalyzer/website.py | 16 +- setup.py | 2 +- tests/{test_converter.py => test_mapping.py} | 6 +- 25 files changed, 1024 insertions(+), 700 deletions(-) create mode 100755 bin/mutalyzer-mapping-update delete mode 100755 bin/mutalyzer-ucsc-update create mode 100644 
extras/cron.d/mutalyzer-mapping-update delete mode 100644 extras/cron.d/mutalyzer-ucsc-update create mode 100755 extras/migrations/001-db-gbinfo-add-created.migration create mode 100755 extras/migrations/002-db-map-to-mapping.migration create mode 100755 extras/migrations/003-config-remove-ucsc.migration create mode 100755 extras/migrations/004-cron-ucsc-to-ncbi.migration create mode 100644 extras/migrations/README rename mutalyzer/{Mapper.py => mapping.py} (55%) rename tests/{test_converter.py => test_mapping.py} (92%) diff --git a/INSTALL b/INSTALL index 72856076..93b1f1c7 100644 --- a/INSTALL +++ b/INSTALL @@ -12,14 +12,14 @@ throughout. The following is an overview of default locations used by Mutalyzer: - Package files /usr/local/lib/python2.6/dist-packages/... - Configuration /etc/mutalyzer/config - Log file /var/log/mutalyzer.log - Cache directory /var/cache/mutalyzer - Batchd init script /etc/init.d/mutalyzer-batchd - UCSC update crontab /etc/cron.d/mutalyzer-ucsc-update - Apache configuration /etc/apache2/conf.d/mutalyzer.conf - Static website files /var/www/mutalyzer/base + Package files /usr/local/lib/python2.6/dist-packages/... + Configuration /etc/mutalyzer/config + Log file /var/log/mutalyzer.log + Cache directory /var/cache/mutalyzer + Batchd init script /etc/init.d/mutalyzer-batchd + Mapping update crontab /etc/cron.d/mutalyzer-mapping-update + Apache configuration /etc/apache2/conf.d/mutalyzer.conf + Static website files /var/www/mutalyzer/base The default database user is 'mutalyzer' with no password and the database names are 'mutalyzer', 'hg18', and 'hg19'. @@ -144,7 +144,9 @@ directory: sudo bash extras/post-upgrade If you installed Mutalyzer in a development environment, you don't have to -do anything to upgrade (apart from updating the source code). 
+do anything to upgrade except for running the automated migration scripts: + + for M in extras/migrations/*.migration; do sudo $M migrate; done Additional notes diff --git a/README b/README index 00f3e6d4..f703d5ae 100644 --- a/README +++ b/README @@ -40,8 +40,8 @@ Development notes Todo list: - Improve the web interface design :) - Test all uses of mkstemp(). -- Use naming conventions for modules Crossmap, Db, File, GenRecord, Mapper, - Retriever, Scheduler. +- Use naming conventions for modules Crossmap, Db, File, GenRecord, Retriever + and Scheduler. - Use standard logging module, with rotating functionality. Race conditions on the log file are probably a problem in the current setup. Instead of that rotating, we could also use logrotate: @@ -49,6 +49,8 @@ Todo list: - Setup continuous integration. Currently, I'm most impressed with Hudson. http://hudson-ci.org/ http://www.rhonabwy.com/wp/2009/11/04/setting-up-a-python-ci-server-with-hudson/ + Or perhaps Jenkins. + http://jenkins-ci.org/ - Use monit on the production server. http://mmonit.com/monit/ - Migrate Javascript to JQuery. @@ -60,13 +62,16 @@ Todo list: for example jinja. - Develop a large test suite. - Create a web interface url to watch the progress of a batch job. +- Create webservices for the batch jobs (steal ideas from Jeroen's DVD + webservice). - Use virtualenv? - Use SQLAlchemy? - Password for MySQL user. - In deployment, remove old versions of Mutalyzer package? - Use https protocol. - Check for os.path.join vulnerabilities. -- Solution for database schema migration on version updates. +- Use a standard solution for the database migrations in extras/migrations. +- Use something like Sphinx to generate development documentation from code. Code style guide: - Follow PEP 8 (code) and PEP 257 (docstrings). diff --git a/bin/mutalyzer b/bin/mutalyzer index 912ab2c1..36af8af7 100755 --- a/bin/mutalyzer +++ b/bin/mutalyzer @@ -4,7 +4,10 @@ Command-line interface to the nomenclature checker. 
Usage: - ./check 'AB026906.1:c.274delG' + {command} variant + + variant: The variant description to check. + @todo: Refactor this file. """ @@ -16,6 +19,7 @@ import os import mutalyzer from mutalyzer.output import Output from mutalyzer.config import Config +from mutalyzer.util import format_usage def main(cmd): @@ -181,7 +185,7 @@ def main(cmd): if __name__ == '__main__': if len(sys.argv) < 2: - print 'Please provide a variant' + print format_usage() sys.exit(1) main(sys.argv[1]) diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync index b53d709c..bca5c1d7 100755 --- a/bin/mutalyzer-cache-sync +++ b/bin/mutalyzer-cache-sync @@ -4,13 +4,14 @@ Synchronize the database cache with other Mutalyzer instances. Usage: - ./mutalyzer-cache-sync remote_wsdl url_template days + {command} remote_wsdl url_template days remote_wsdl: Location of the remote WSDL description. - url_template: URL to remote downloads, where {file} is to be substituted + url_template: URL to remote downloads, where {{file}} is to be substituted by the filename. days: Number of days to go back in the remote cache. + This program is intended to be run daily from cron. Example: 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7 @@ -24,6 +25,7 @@ from mutalyzer.config import Config from mutalyzer.output import Output from mutalyzer.sync import CacheSync from mutalyzer import Db +from mutalyzer.util import format_usage def cache_sync(remote_wsdl, url_template, days): @@ -40,7 +42,7 @@ def cache_sync(remote_wsdl, url_template, days): if __name__ == '__main__': if len(sys.argv) < 4: - print __doc__.strip() + print format_usage() sys.exit(1) try: days = int(sys.argv[3]) diff --git a/bin/mutalyzer-mapping-update b/bin/mutalyzer-mapping-update new file mode 100755 index 00000000..e46a89c4 --- /dev/null +++ b/bin/mutalyzer-mapping-update @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +""" +Update the database with mapping information from the NCBI. 
+ +Usage: + {command} database mapping_file assembly + + database: Database to update (i.e. 'hg18' or 'hg19'). + mapping_file: Path to the NCBI mapping information. + assembly: Use only entries from this assembly (this is the 'group_name' + column in the NCBI mapping file). + + +This program is intended to be run daily from cron. Example: + + 25 6 * * * mutalyzer-mapping-update hg19 /tmp/seq_gene.md reference +""" + + +import sys + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.mapping import NCBIUpdater +from mutalyzer.util import format_usage + + +def main(database, mapping_file, assembly): + """ + Update the database with information from the NCBI. + + @arg database: Database to update (i.e. 'hg18' or 'hg19'). + @type database: string + @arg mapping_file: Path to NCBI mapping information. + @type mapping_file: string + @arg assembly: Use only entries from this assembly (this is the + 'group_name' column in the NCBI mapping file). + @type assembly: string + + @todo: Also report how much was added/updated. + """ + config = Config() + output = Output(__file__, config.Output) + output.addMessage(__file__, -1, 'INFO', + 'Starting NCBI mapping data update') + + updater = NCBIUpdater(database, config) + updater.load(mapping_file, assembly) + updater.merge() + + output.addMessage(__file__, -1, 'INFO', 'NCBI mapping data update end') + + +if __name__ == '__main__': + if len(sys.argv) != 4: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/bin/mutalyzer-ucsc-update b/bin/mutalyzer-ucsc-update deleted file mode 100755 index 6fdff9cb..00000000 --- a/bin/mutalyzer-ucsc-update +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python - -""" -Get updates on mapping information from the UCSC. - -This program is intended to be run daily from cron. 
Example: - - 25 6 * * * mutalyzer-ucsc-update -""" - - -import sys -import os - -from mutalyzer.config import Config -from mutalyzer.output import Output -from mutalyzer.Db import Remote -from mutalyzer.Db import Update - - -def main(): - """ - Update the database with information from the UCSC. - """ - config = Config() - output = Output(__file__, config.Output) - output.addMessage(__file__, -1, 'INFO', - 'Starting UCSC mapping data update') - - for database in config.Db.dbNames: - remote = Remote(database, config.Db) - tempfile = remote.get_Update() - - update = Update(database, config.Db) - update.load_Update(tempfile) - - count_updates = update.count_Updates() - if count_updates: - output.addMessage(__file__, -1, 'INFO', - '%i updates found' % count_updates) - update.backup_cdsUpdates() - count_cds_updates = update.count_cdsUpdates() - if count_cds_updates: - output.addMessage(__file__, -1, 'INFO', '%i CDS updates ' \ - 'found, backing up' % count_cds_updates) - update.merge_cdsUpdates() - - update.merge_Update() - - output.addMessage(__file__, -1, 'INFO', 'UCSC mapping data update end') - - -if __name__ == '__main__': - main() diff --git a/extras/config.example b/extras/config.example index bce2c1e4..4bbecc30 100644 --- a/extras/config.example +++ b/extras/config.example @@ -42,14 +42,8 @@ LocalMySQLuser = "mutalyzer" # Host name for the local databases. LocalMySQLhost = "localhost" -# MySQL username for the UCSC database. -RemoteMySQLuser = "genome" -# Host name for the UCSC database. -RemoteMySQLhost = "genome-mysql.cse.ucsc.edu" -# Retrieve all entries modified within a certain number of days. 
-UpdateInterval = 7 # diff --git a/extras/cron.d/mutalyzer-mapping-update b/extras/cron.d/mutalyzer-mapping-update new file mode 100644 index 00000000..896f304d --- /dev/null +++ b/extras/cron.d/mutalyzer-mapping-update @@ -0,0 +1,3 @@ +# Update the mapping database every sunday morning at 03:25 and 04:25 +25 3 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg18 /tmp/seq_gene.md reference +25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' diff --git a/extras/cron.d/mutalyzer-ucsc-update b/extras/cron.d/mutalyzer-ucsc-update deleted file mode 100644 index 141eecf6..00000000 --- a/extras/cron.d/mutalyzer-ucsc-update +++ /dev/null @@ -1,2 +0,0 @@ -# Update the UCSC database every morning at 06:25 -25 6 * * * www-data <MUTALYZER_BIN_UCSC_UPDATE> diff --git a/extras/migrations/001-db-gbinfo-add-created.migration b/extras/migrations/001-db-gbinfo-add-created.migration new file mode 100755 index 00000000..bbef9018 --- /dev/null +++ b/extras/migrations/001-db-gbinfo-add-created.migration @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +""" +Add a column and index 'created' to the 'GBInfo' table. + +Usage: + ./001-db-gbinfo-add-created.migration [migrate] +""" + + +import sys +import MySQLdb + + +def _connect(): + try: + connection = MySQLdb.connect(host='localhost', + user='mutalyzer', + passwd='', + db='mutalyzer') + except MySQLdb.Error, e: + print 'Error %d: %s' % (e.args[0], e.args[1]) + sys.exit(1) + return connection + + +def check(): + """ + Check if migration is needed. 
+    """
+    connection = _connect()
+    cursor = connection.cursor()
+    cursor.execute('SHOW COLUMNS FROM GBInfo WHERE field = "created";')
+    has_column = len(cursor.fetchall()) > 0
+    cursor.execute('SHOW INDEX FROM GBInfo WHERE Key_name = "created";')
+    has_index = len(cursor.fetchall()) > 0
+    connection.close()
+    if has_column != has_index:
+        print 'Installation is not in a recognizable state. Fix manually.'
+        sys.exit(1)
+    return not has_column
+
+
+def migrate():
+    """
+    Perform migration.
+    """
+    connection = _connect()
+    cursor = connection.cursor()
+    cursor.execute("""
+        ALTER TABLE GBInfo
+        ADD COLUMN created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        ADD INDEX created (created);""")
+    connection.commit()
+    connection.close()
+    print 'Added column mutalyzer.GBInfo.created'
+    print 'Added index on mutalyzer.GBInfo.created'
+
+
+if __name__ == '__main__':
+    needed = check()
+    if needed:
+        print 'This migration is needed.'
+        if len(sys.argv) > 1 and sys.argv[1] == 'migrate':
+            print 'Performing migration.'
+            migrate()
+            print 'Performed migration.'
+    else:
+        print 'This migration is not needed.'
diff --git a/extras/migrations/002-db-map-to-mapping.migration b/extras/migrations/002-db-map-to-mapping.migration
new file mode 100755
index 00000000..fff956e0
--- /dev/null
+++ b/extras/migrations/002-db-map-to-mapping.migration
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+"""
+Convert the old 'map' tables to the new 'Mapping' tables.
+
+Usage:
+  ./002-db-map-to-mapping.migration [migrate]
+
+This is basically just a renaming of columns and
+- use NULL for missing values
+- add 1 to all chromosomal start positions.
+
+The following tables on hg18 and hg19 are dropped:
+- gbStatus
+- map_cdsBackup
+- refGene
+- refLink
+
+The map tables are renamed to map_backup.
+""" + + +import sys +import MySQLdb + + +def _connect(db): + try: + connection = MySQLdb.connect(host='localhost', + user='mutalyzer', + passwd='', + db=db) + except MySQLdb.Error, e: + print 'Error %d: %s' % (e.args[0], e.args[1]) + sys.exit(1) + return connection + + +def _exon_starts(starts): + updated = [] + for start in starts.split(',')[:-1]: + updated.append(str(int(start) + 1)) + return ','.join(updated) + + +def _exon_stops(stops): + if stops[-1] == ',': + return stops[:-1] + + +def _check(db): + # Todo: Also check if 'map' is gone. + connection = _connect(db) + cursor = connection.cursor() + cursor.execute('SHOW TABLES LIKE "Mapping";') + ok = len(cursor.fetchall()) > 0 + connection.close() + return ok + + +def _migrate(db): + connection = _connect(db) + cursor = connection.cursor() + cursor.execute(""" + CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6) DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) + );""") + select_cursor = connection.cursor(MySQLdb.cursors.DictCursor) + select_cursor.execute(""" + SELECT + geneName as gene, + acc as transcript, + version as version, + chrom as chromosome, + strand as orientation, + txStart + 1 as start, + txEnd as stop, + NULLIF(cdsStart + 1, cdsEnd + 1) as cds_start, + NULLIF(cdsEnd, cdsStart) as cds_stop, + exonStarts as exon_starts, + exonEnds as exon_stops, + NULLIF(protAcc, '') as protein, + 'UCSC' as source + FROM + map;""") + count = 0 + while True: + r = select_cursor.fetchone() + if r == None: + break + count += 1 + cursor.execute(""" + INSERT INTO Mapping + (gene, transcript, 
version, chromosome, orientation, start, stop, + cds_start, cds_stop, exon_starts, exon_stops, protein, source) + VALUES + (%s, %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, %s);""", + (r['gene'], r['transcript'], r['version'], r['chromosome'], + r['orientation'], r['start'], r['stop'], r['cds_start'], + r['cds_stop'], _exon_starts(r['exon_starts']), _exon_stops(r['exon_stops']), + r['protein'], r['source'])) + + print 'Converted table map to table Mapping on %s (%d entries)' % (db, count) + + cursor.execute('DROP TABLE IF EXISTS gbStatus, map_cdsBackup, refGene, refLink') + cursor.execute('RENAME TABLE map TO map_backup') + + print 'Dropped tables gbStatus, map_cdsBackup, refGene, refLink on %s' % db + print 'Renamed table map to map_backup on %s' % db + + select_cursor.close() + cursor.close() + connection.commit() + connection.close() + + +def check(): + """ + Check if migration is needed. + """ + hg18_ok = _check('hg18') + hg19_ok = _check('hg19') + if hg18_ok != hg19_ok: + print 'Installation is not in a recognizable state. Fix manually.' + sys.exit(1) + return not hg18_ok + + +def migrate(): + """ + Perform migration. + """ + _migrate('hg18') + _migrate('hg19') + + +if __name__ == '__main__': + needed = check() + if needed: + print 'This migration is needed.' + if len(sys.argv) > 1 and sys.argv[1] == 'migrate': + print 'Performing migration.' + migrate() + print 'Performed migration.' + else: + print 'This migration is not needed.' diff --git a/extras/migrations/003-config-remove-ucsc.migration b/extras/migrations/003-config-remove-ucsc.migration new file mode 100755 index 00000000..d3b37aa0 --- /dev/null +++ b/extras/migrations/003-config-remove-ucsc.migration @@ -0,0 +1,25 @@ +#!/bin/bash + +# Remove UCSC database values from the configuration file. +# +# Usage: +# ./003-config-remove-ucsc.migration [migrate] + +if [ -e /etc/mutalyzer/config ] && $(grep -q 'MySQL username for the UCSC database' /etc/mutalyzer/config); then + echo 'This migration is needed.' 
+ if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' + echo 'Copying /etc/mutalyzer/config to /etc/mutalyzer/config.backup' + cp /etc/mutalyzer/config /etc/mutalyzer/config.backup + sed -i '/MySQL username for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Host name for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Retrieve all entries modified within a certain number of days/d' /etc/mutalyzer/config + sed -i '/RemoteMySQLuser =/d' /etc/mutalyzer/config + sed -i '/^RemoteMySQLhost =/d' /etc/mutalyzer/config + sed -i '/^UpdateInterval =/d' /etc/mutalyzer/config + echo 'Removed all UCSC database configuration values from /etc/mutalyzer/config' + echo 'Performed migration.' + fi +else + echo 'This migration is not needed.' +fi diff --git a/extras/migrations/004-cron-ucsc-to-ncbi.migration b/extras/migrations/004-cron-ucsc-to-ncbi.migration new file mode 100755 index 00000000..5ba44c1a --- /dev/null +++ b/extras/migrations/004-cron-ucsc-to-ncbi.migration @@ -0,0 +1,24 @@ +#!/bin/bash + +# Remove UCSC update from cron and install NCBI update. +# +# Usage: +# ./004-cron-ucsc-to-ncbi.migration [migrate] + +if [ -e /etc/cron.d/mutalyzer-ucsc-update ] && $(grep -v -q '^#' /etc/cron.d/mutalyzer-ucsc-update); then + echo 'This migration is needed.' + if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' + sed -i 's/^/#/' /etc/cron.d/mutalyzer-ucsc-update + echo 'Commented all lines in /etc/cron.d/mutalyzer-ucsc-update' + if [ ! -e /etc/cron.d/mutalyzer-mapping-update ]; then + BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) + cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update + sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update + echo 'Installed /etc/cron.d/mutalyzer-mapping-update' + fi + echo 'Performed migration.' + fi +else + echo 'This migration is not needed.' 
+fi diff --git a/extras/migrations/README b/extras/migrations/README new file mode 100644 index 00000000..e514423f --- /dev/null +++ b/extras/migrations/README @@ -0,0 +1,18 @@ +Automated migration scripts +=========================== + +This directory contains scripts to automate the migration of a Mutalyzer +installation to the latest version. Things that might need a migration +include: + +- database schema +- database data +- configuration files +- cache +- ? + +All migration scripts accept as parameters a flag 'migrate'. Running a +script without parameters just checks if the migration is needed. Running +a script with 'migrate' does the actual migration (only if needed). + +Performing multiple migrations should be done in the order of their names. diff --git a/extras/post-install.sh b/extras/post-install.sh index 6b0735c4..4d359e9f 100644 --- a/extras/post-install.sh +++ b/extras/post-install.sh @@ -22,7 +22,7 @@ set -e PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') BIN_BATCHD=$(which mutalyzer-batchd) BIN_CACHE_SYNC=$(which mutalyzer-cache-sync) -BIN_UCSC_UPDATE=$(which mutalyzer-ucsc-update) +BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) BIN_WEBSITE=$(which mutalyzer-website.wsgi) BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) @@ -55,10 +55,10 @@ update-rc.d -f mutalyzer-batchd remove update-rc.d mutalyzer-batchd defaults 98 02 echo "Installing crontab" -cp extras/cron.d/mutalyzer-ucsc-update /etc/cron.d/mutalyzer-ucsc-update -sed -i -e "s@<MUTALYZER_BIN_UCSC_UPDATE>@${BIN_UCSC_UPDATE}@g" /etc/cron.d/mutalyzer-ucsc-update cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync +cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update +sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update echo "Creating 
/etc/apache2/conf.d/mutalyzer.conf" cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf @@ -79,130 +79,29 @@ cat << EOF | mysql -u root -p FLUSH PRIVILEGES; EOF -mkdir -p /tmp/mutalyzer-install -pushd /tmp/mutalyzer-install - -# Do hg18 -mkdir -p hg18 -pushd hg18 - -echo "Creating and populating hg18 database" - -# Then retrieve the refLink table from the UCSC website (hg18) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refLink.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refLink.txt.gz - -# For Variant_info to work, you need the following files too (hg18) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/gbStatus.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/gbStatus.txt.gz - -# Create table and load data (hg18) -mysql -u mutalyzer -D hg18 < refLink.sql -zcat refLink.txt.gz | mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refLink;' - -mysql -u mutalyzer -D hg18 < gbStatus.sql -zgrep mRNA gbStatus.txt.gz > gbStatus.mrna.txt -mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "gbStatus.mrna.txt" INTO TABLE gbStatus;' - -mysql -u mutalyzer -D hg18 < refGene.sql -zcat refGene.txt.gz | mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refGene;' - -# Combine the mapping info into one table (hg18) -cat << EOF | mysql -u mutalyzer -D hg18 - CREATE TABLE map - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc; - CREATE TABLE map_cdsBackup ( - acc char(12) NOT NULL DEFAULT '', - version smallint(6) unsigned NOT NULL DEFAULT '0', - txStart int(11) unsigned NOT NULL DEFAULT '0', - txEnd int(11) unsigned 
NOT NULL DEFAULT '0', - cdsStart int(11) unsigned NOT NULL DEFAULT '0', - cdsEnd int(11) unsigned NOT NULL DEFAULT '0', - exonStarts longblob NOT NULL, - exonEnds longblob NOT NULL, - geneName varchar(255) NOT NULL DEFAULT '', - chrom varchar(255) NOT NULL DEFAULT '', - strand char(1) NOT NULL DEFAULT '', - protAcc varchar(255) NOT NULL DEFAULT '' - ); -EOF - -popd -rm -Rf /tmp/mutalyzer-install/hg18 - -# Do hg19 -mkdir -p hg19 -pushd hg19 - -echo "Creating and populating hg19 database" - -# Then retrieve the refLink table from the UCSC website (hg19) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.txt.gz - -# For Variant_info to work, you need the following files too (hg19) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.txt.gz - -# Create table and load data (hg19) -mysql -u mutalyzer -D hg19 < refLink.sql -zcat refLink.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refLink;' - -mysql -u mutalyzer -D hg19 < gbStatus.sql -zgrep mRNA gbStatus.txt.gz > gbStatus.mrna.txt -mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "gbStatus.mrna.txt" INTO TABLE gbStatus;' - -mysql -u mutalyzer -D hg19 < refGene.sql -zcat refGene.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refGene;' - -# Combine the mapping info into one table (hg19) -cat << EOF | mysql -u mutalyzer -D hg19 - CREATE TABLE map - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc; - CREATE TABLE map_cdsBackup ( 
-    acc char(12) NOT NULL DEFAULT '',
-    version smallint(5) unsigned NOT NULL DEFAULT '0',
-    txStart int(10) unsigned NOT NULL DEFAULT '0',
-    txEnd int(10) unsigned NOT NULL DEFAULT '0',
-    cdsStart int(10) unsigned NOT NULL DEFAULT '0',
-    cdsEnd int(10) unsigned NOT NULL DEFAULT '0',
-    exonStarts longblob NOT NULL,
-    exonEnds longblob NOT NULL,
-    geneName varchar(255) NOT NULL DEFAULT '',
-    chrom varchar(255) NOT NULL DEFAULT '',
-    strand char(1) NOT NULL DEFAULT '',
-    protAcc varchar(255) NOT NULL DEFAULT ''
-  );
-EOF
-
-popd
-
-popd
-rm -Rf /tmp/mutalyzer-install
-
-# Create ChrName tables (hg18)
+# Create ChrName and Mapping table (hg18)
 cat << EOF | mysql -u mutalyzer -D hg18
 CREATE TABLE ChrName (
   AccNo char(20) NOT NULL,
   name char(20) NOT NULL,
   PRIMARY KEY (AccNo)
 );
+CREATE TABLE Mapping (
+    gene varchar(255) DEFAULT NULL,
+    transcript varchar(20) NOT NULL DEFAULT '',
+    version smallint(6) DEFAULT NULL,
+    chromosome varchar(40) DEFAULT NULL,
+    orientation char(1) DEFAULT NULL,
+    start int(11) unsigned DEFAULT NULL,
+    stop int(11) unsigned DEFAULT NULL,
+    cds_start int(11) unsigned DEFAULT NULL,
+    cds_stop int(11) unsigned DEFAULT NULL,
+    exon_starts longblob NOT NULL,
+    exon_stops longblob NOT NULL,
+    protein varchar(20) DEFAULT NULL,
+    source varchar(20) DEFAULT NULL,
+    INDEX (transcript)
+);
 INSERT INTO ChrName (AccNo, name) VALUES
 ('NC_000001.9', 'chr1'),
 ('NC_000002.10', 'chr2'),
@@ -233,13 +132,33 @@ INSERT INTO ChrName (AccNo, name) VALUES
 ('NT_113959.1', 'chr22_h2_hap1');
 EOF
 
-# Create ChrName tables (hg19)
+# Populate Mapping table with NCBI data (hg18)
+wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md
+$BIN_MAPPING_UPDATE hg18 /tmp/seq_gene.md reference
+
+# Create ChrName and Mapping table (hg19)
 cat << EOF | mysql -u mutalyzer -D hg19
 CREATE TABLE ChrName (
   AccNo char(20) NOT NULL,
   name char(20) NOT NULL,
   PRIMARY KEY (AccNo)
 );
+CREATE TABLE Mapping (
+    gene varchar(255) DEFAULT NULL,
+    transcript varchar(20) NOT NULL DEFAULT '',
+    version smallint(6) DEFAULT NULL,
+    chromosome varchar(40) DEFAULT NULL,
+    orientation char(1) DEFAULT NULL,
+    start int(11) unsigned DEFAULT NULL,
+    stop int(11) unsigned DEFAULT NULL,
+    cds_start int(11) unsigned DEFAULT NULL,
+    cds_stop int(11) unsigned DEFAULT NULL,
+    exon_starts longblob NOT NULL,
+    exon_stops longblob NOT NULL,
+    protein varchar(20) DEFAULT NULL,
+    source varchar(20) DEFAULT NULL,
+    INDEX (transcript)
+);
 INSERT INTO ChrName (AccNo, name) VALUES
 ('NC_000001.10', 'chr1'),
 ('NC_000002.11', 'chr2'),
@@ -276,6 +195,10 @@ INSERT INTO ChrName (AccNo, name) VALUES
 ('NT_167251.1', 'chr17_ctg5_hap1');
 EOF
 
+# Populate Mapping table with NCBI data (hg19)
+wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md
+$BIN_MAPPING_UPDATE hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly'
+
 echo "Creating tables in mutalyzer database"
 
 # Create mutalyzer tables
diff --git a/extras/post-upgrade.sh b/extras/post-upgrade.sh
index 5cf4e8c4..1a77436e 100644
--- a/extras/post-upgrade.sh
+++ b/extras/post-upgrade.sh
@@ -29,6 +29,12 @@ fi
 echo "Symlinking /var/www/mutalyzer/base to $PACKAGE_ROOT/templates/base"
 ln -s $PACKAGE_ROOT/templates/base /var/www/mutalyzer/base
 
+echo "Running any needed migrations"
+for MIGRATION in extras/migrations/*.migration; do
+    echo "Checking migration $(basename $MIGRATION)"
+    $MIGRATION migrate
+done
+
 echo "Restarting Apache"
 /etc/init.d/apache2 restart
 
diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py
index 06441424..33fad56f 100644
--- a/mutalyzer/Db.py
+++ b/mutalyzer/Db.py
@@ -15,8 +15,6 @@ statements.
 
 #Public classes:
 #     - Db      ; Log in to a database and keep it open for queries.
 #     - Mapping ; Mapping of transcripts and genes.
-#     - Remote  ; Retrieving updates for the mapping databases.
-#     - Update  ; Updating the mapping databases.
 #     - Cache   ; Cache administration.
 #     - Batch   ; Batch checker.
@@ -133,7 +131,7 @@ class Mapping(Db) : - query(statement) ; General query function. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping; Accumulated mapping info. """ def __init__(self, build, config) : @@ -150,66 +148,12 @@ class Mapping(Db) : Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) #__init__ - def get_protAcc(self, mrnaAcc) : - """ - Query the database for a protein ID given an mRNA ID. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The protein ID - @rtype: string - """ - - statement = """ - SELECT protAcc - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0][0] - #get_protAcc - - def get_NM_info(self, mrnaAcc, version = None) : - """ - Retrieve various data for an NM number. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - @arg version: version number of the accession number (not used) - @type version: integer - - @return: - - exonStarts ; List of exon start sites. - - exonEnds ; List of exon end sites. - - cdsStart ; Position of the start codon. - - cdsEnd ; Position of the end codon. - - strand ; Orientation of the gene (+ = forward, - - = reverse) - @rtype: list - """ - - statement = """ - SELECT exonStarts, exonEnds, cdsStart, cdsEnd, strand - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0] - #get_NM_info - def get_NM_version(self, mrnaAcc) : """ Get the version number of an accession number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. 
@arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -220,8 +164,8 @@ class Mapping(Db) : statement = """ SELECT version - FROM map - WHERE acc = %s; + FROM Mapping + WHERE transcript = %s; """, mrnaAcc return [int(i[0]) for i in self.query(statement)] @@ -233,7 +177,7 @@ class Mapping(Db) : If the version number is None, use the "newest" version number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -251,25 +195,25 @@ class Mapping(Db) : See also test_converter.test_hla_cluster and bug #58. """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc - from map + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein + from Mapping """ if version is None: q += """ - where acc = %s - order by version desc, chrom asc; + where transcript = %s + order by version desc, chromosome asc; """ statement = (q, mrnaAcc) else: q += """ - where acc = %s and + where transcript = %s and version = %s - order by chrom asc; + order by chromosome asc; """ statement = q, (mrnaAcc, version) @@ -283,7 +227,7 @@ class Mapping(Db) : should be returned, set overlap to 0. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. 
@arg chrom: The chromosome (coded as "chr1", ..., "chrY") @type chrom: string @@ -295,35 +239,35 @@ class Mapping(Db) : - 0 ; Return only the transcripts that completely fall in the range [p1, p2] - 1 ; Return all hit transcripts - @type overlap: boolean + @type overlap: boolean @return: All accession numbers that are hit according to the overlap criterium @rtype: list """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc, + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein, version - from map + from Mapping """ if overlap: q += """ - WHERE chrom = %s AND - txStart <= "%s" AND - txEnd >= "%s"; + WHERE chromosome = %s AND + start <= "%s" AND + stop >= "%s"; """ statement = q, (chrom, p2, p1) else: q += """ - WHERE chrom = %s AND - txStart >= "%s" AND - txEnd <= "%s"; + WHERE chromosome = %s AND + start >= "%s" AND + stop <= "%s"; """ statement = q, (chrom, p1, p2) @@ -338,16 +282,16 @@ class Mapping(Db) : geneName ; Name of a gene. SQL tables from dbNames: - map ; Accumulated mapping info. + Mapping ; Accumulated mapping info. Returns: list ; A list of transcripts. """ statement = """ - SELECT acc, version - FROM map - WHERE geneName = %s; + SELECT transcript, version + FROM Mapping + WHERE gene = %s; """, geneName ret = self.query(statement) @@ -365,7 +309,7 @@ class Mapping(Db) : Get the name of a gene, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -375,9 +319,9 @@ class Mapping(Db) : """ statement = """ - SELECT geneName - FROM map - WHERE acc = %s; + SELECT gene + FROM Mapping + WHERE transcript = %s; """, mrnaAcc ret = self.query(statement) @@ -391,7 +335,7 @@ class Mapping(Db) : Check if the given name is a valid chromosome name. 
SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg name: The name to be tested @type name: string @@ -403,8 +347,8 @@ class Mapping(Db) : statement = """ SELECT COUNT(*) - FROM map - WHERE chrom = %s; + FROM Mapping + WHERE chromosome = %s; """, name if int(self.query(statement)[0][0]) > 0 : @@ -469,7 +413,7 @@ class Mapping(Db) : Get the chromosome name, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg acc: The NM accession number (version NOT included) @type acc: string @@ -479,341 +423,227 @@ class Mapping(Db) : """ statement = """ - SELECT chrom - FROM map - WHERE acc = %s; + SELECT chromosome + FROM Mapping + WHERE transcript = %s; """, acc - print acc + ret = self.query(statement) if ret : return ret[0][0] return None #get_chromName -#Mapper - -class Remote(Db) : - """ - Database functions for retrieving updates for the mapping databases. - Special methods: - - __init__(config) ; Initialise the class. - - Public methods: - - get_Update() ; Retrieve new mapping info from the UCSC. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ - - def __init__(self, build, config) : + def merge_update(self): """ - Initialise the Db parent class. Use the remote database for a - certain build. - - Private variables (altered): - - __config ; Configuration variables. + Merge existing mapping information with new mapping information, which + should be in table 'MappingTemp'. 
- @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ - - self.__config = config - Db.__init__(self, build, config.RemoteMySQLuser, config.RemoteMySQLhost) - #__init__ - - def get_Update(self) : - """ - Retrieve all mapping updates from the UCSC within a certain time - window (defined in the configuration file) and gather the results - into one mapping table. + The strategy is as follows. Existing mappings (accumulated by + Mutalyzer in the past) that are not in the new mapping information are + added to the new mapping information. The resulting set is used as the + mapping information from now on. - The results will be written to a temporary file to be imported in - the local database with the load_Update() function. - - Return temporary filename used to store the results. - - @return: Filename used to store the results. - @rtype: string + This way, we get the latest updates for existing mappings and keep old + mappings not in the updated information. SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ - - statement = """ - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc - AND time >= DATE_SUB(CURDATE(), INTERVAL %s DAY); - """, self.__config.UpdateInterval - - handle, filename = tempfile.mkstemp(text=True) - - # Convert the results to a tab delimited file. - for i in self.query(statement) : - for j in i : - os.write(handle, str(j) + chr(0x09)) # 0x09 is a TAB. 
- os.write(handle, '\n') - #for - - os.close(handle) - return filename - #get_Update -#Remote - -class Update(Db) : - """ - Database functions for updating the mapping databases. - - Public methods: - - load_Update() ; Load new mapping info into the local database. - - count_Updates() ; Count the number of entries in the new - mapping info table. - - backup_cdsUpdates() ; Make a backup of updates that overwrite the - old mapping info. - - count_cdsUpdates() ; Count the number of updates that overwrite - the old mapping info. - - merge_cdsUpdates() ; Merge the backup of old mapping info with the - other old info. - - merge_Update() ; Merge the new mapping info from the UCSC with - what we already have. + - Mapping ; Accumulated mapping info. + - MappingTemp ; New mapping info. + - MappingBackup ; Backup of accumulated mapping info. - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Newly found data. - - map_new ; Merge of map_temp and map. - - map_cdsBackup_temp ; Entries that were updated without an increment - of the version number. - - map_cdsBackup ; Merge of map_cdsBackup_temp and itself. - """ - - def __init__(self, build, config) : + @todo: Return number of entries added/updated. """ - Initialise the Db parent class. Use the remote database for a - certain build. + statement = """ + CREATE TABLE IF NOT EXISTS MappingTemp LIKE Mapping; + """, None + self.query(statement) - Private variables (altered): - - __config ; Configuration variables. 
+ statement = """ + INSERT INTO MappingTemp + SELECT * FROM Mapping AS OldM + WHERE NOT EXISTS ( + SELECT * FROM MappingTemp AS NewM + WHERE OldM.transcript = NewM.transcript + AND OldM.version = NewM.version + ); + """, None + self.query(statement) - @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ + statement = """ + DROP TABLE IF EXISTS MappingBackup; + """, None + self.query(statement) - self.__config = config - Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) - #__init__ + statement = """ + RENAME TABLE Mapping TO MappingBackup, MappingTemp TO Mapping; + """, None + self.query(statement) + #merge_update - def load_Update(self, filename) : + def ncbi_create_temporary_tables(self): """ - Load the updates from the temporary file created by the get_Update() - function and import it in the local database. - - @arg filename: Filename to read the update from. - @type filename: string - - SQL tables from dbNames (altered): - - map_temp ; Created and loaded with data from TempFile. + Create temporary tables to import NCBI mapping into. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. 
+ self.ncbi_drop_temporary_tables() statement = """ - CREATE TABLE map_temp LIKE map; + CREATE TABLE Genes ( + id varchar(20) NOT NULL DEFAULT '', + name varchar(255) DEFAULT NULL, + PRIMARY KEY (id) + ); """, None self.query(statement) - statement = """ - LOAD DATA LOCAL INFILE %s - INTO TABLE map_temp; - """, filename + statement = """ + CREATE TABLE Transcripts ( + name varchar(20) NOT NULL DEFAULT '', + gene_id varchar(20) DEFAULT NULL, + chromosome char(2) DEFAULT NULL, + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + PRIMARY KEY (name,start) + ); + """, None self.query(statement) - os.remove(filename) - #load_Update + statement = """ + CREATE TABLE Exons ( + transcript varchar(20) NOT NULL DEFAULT '', + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + cds_start int(11) DEFAULT NULL, + cds_stop int(11) DEFAULT NULL, + protein varchar(20) DEFAULT NULL, + PRIMARY KEY (transcript,start) + ); + """, None + self.query(statement) + #ncbi_create_temporary_table - def count_Updates(self) : + def ncbi_drop_temporary_tables(self): """ - Count the number of updates. This function will only work if it - is preceeded by the load_Update() function. Otherwise the map_temp - table may not exist. This function can not be used after the - merge_Update() function has been executed, since it drops the - map_temp table. + Drop temporary tables used for importing NCBI mapping information. - @return: The number of entries in the table of updated mapping info - @rtype: integer + SQL tables from dbNames: + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. 
""" - statement = """ - SELECT COUNT(*) - FROM map_temp; + DROP TABLE IF EXISTS Genes, Transcripts, Exons; """, None + self.query(statement) + #ncbi_drop_temporary_tables - return int(self.query(statement)[0][0]) - #count_Updates - - def backup_cdsUpdates(self) : + def ncbi_import_gene(self, id, name): """ - Copy all mapping entries where there was an update, but no - increment in the version number, to a backup table. Note that - we use acc, version, txStart as the primary key because members - of a gene family are mapped multiple times. - - SQL tables from dbNames (altered): - - map_cdsBackup_temp ; Created and filled with entries that - were updated without an increment of the - version number. + Import a (gene id, gene name) pair in a temporary table. SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Freshly downloaded mapping info. + - Genes ; Gene names from NCBI. """ - statement = """ - CREATE TABLE map_cdsBackup_temp - SELECT map.* - FROM map, map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - AND ( - map.cdsStart != map_temp.cdsStart - OR map.cdsEnd != map_temp.cdsEnd - ); - """, None + INSERT IGNORE INTO Genes (id, name) VALUES (%s, %s); + """, (id, name) self.query(statement) - #backup_cdsUpdates + #ncbi_import_gene - def count_cdsUpdates(self) : + def ncbi_import_transcript(self, name, gene, chromosome, start, stop, + orientation): """ - Count the number of mapping entries that have changed without an - increment in the version number. This function can only be called - after backup_cdsUpdates() has been executed and before - merge_cdsUpdates has been executed. + Import a transcript mapping in a temporary table. SQL tables from dbNames: - - map_cdsBackup_temp ; Entries that wre updated without an - increment of the version number. 
- - @return: The number of mapping entries that have changed without an - increment in the version number - @rtype: integer + - Transcripts ; Transcript mappings from NCBI. """ - statement = """ - SELECT COUNT(*) - FROM map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Transcripts + (name, gene_id, chromosome, start, stop, orientation) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (name, gene, chromosome, start, stop, orientation) - return int(self.query(statement)[0][0]) - #count_cdsUpdates + self.query(statement) + #ncbi_import_transcript - def merge_cdsUpdates(self) : + def ncbi_import_exon(self, transcript, start, stop, cds_start, cds_stop, + protein): """ - Merge the mapping entries that have changed without an increment in - the version number with a table that contains backups of these - entries. + Import an exon mapping in a temporary table. - SQL tables from dbNames (altered): - - map_cdsBackup ; Extended with the entries in - map_cdsBackup_temp. - - map_cdsBackup_temp ; Dropped. + SQL tables from dbNames: + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. - statement = """ - INSERT INTO map_cdsBackup - SELECT * - FROM map_cdsBackup_temp; - """, None - self.query(statement) - statement = """ - DROP TABLE map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Exons + (transcript, start, stop, cds_start, cds_stop, protein) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (transcript, start, stop, cds_start, cds_stop, protein) self.query(statement) - #merge_cdsUpdates + #ncbi_import_exon - def merge_Update(self) : + def ncbi_aggregate_mapping(self): """ - Merge the new mapping data with the old ones. + Aggregate gene, transcript and exon mapping information from the NCBI + into one table. - SQL tables from dbNames (altered): - - map_new ; Created and filled with the merge of map_temp and map. - Dropped after use. - - map_temp ; Merged with map to form map_new. Dropped after use. 
- - map ; Overwritten with the merged info in map_new. + @note: Default MySQL value for group_concat_max_len is 1024, meaning + that the GROUP_CONCAT aggregate function returns values of at most + 1024 bytes long. This is not enough (currently we need around 3000 + bytes), so we explicitely set this to a higher value. + @note: We use MAX(E.protein) since MySQL does not have an ANY() + aggregator. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. - statement = """ - CREATE TABLE map_new - SELECT * - FROM map_temp - UNION - SELECT * - FROM map - WHERE NOT EXISTS ( - SELECT * - FROM map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - ); + SET group_concat_max_len = 32768; """, None self.query(statement) + statement = """ - DROP TABLE map; + DROP TABLE IF EXISTS MappingTemp; """, None self.query(statement) + statement = """ - CREATE TABLE map - SELECT * - FROM map_new; + CREATE TABLE MappingTemp LIKE Mapping; """, None self.query(statement) + statement = """ - DROP TABLE map_new; + INSERT INTO MappingTemp + SELECT + G.name as gene, + SUBSTRING(T.name FROM 1 FOR LOCATE('.', T.name) - 1) as transcript, + SUBSTRING(T.name FROM LOCATE('.', T.name) + 1) as version, + CONCAT('chr', T.chromosome) as chromosome, + T.orientation as orientation, + MIN(T.start) as start, + MAX(T.stop) as stop, + MAX(E.cds_start) as cds_start, + MAX(E.cds_stop) as cds_stop, + GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts, + GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops, + MAX(E.protein) as protein, + 'NCBI' as source + FROM Transcripts as T, Genes as G, Exons as E + WHERE T.gene_id = G.id AND T.name = E.transcript + GROUP BY T.name; """, None self.query(statement) - statement = """ - DROP TABLE map_temp; - """, None + #ncbi_aggregate_mapping +#Mapping - self.query(statement) - #merge_Update -#Update class Cache(Db) : """ @@ -908,9 +738,9 @@ class Cache(Db) : 
def insertLRG(self, accNo, fileHash, url): """ Insert information about a LRG record in the internal database. - + See insertGB() for more information. - + @arg accNo: The name associated with this record @type accNo: string @arg fileHash: The hash of the content of the record @@ -1160,7 +990,7 @@ class Cache(Db) : @arg accNo: The accession number @type accNo: string - + @return: GI number @rtype: string """ @@ -1181,13 +1011,13 @@ class Cache(Db) : """ Gets the protein accession number for the given mRNA accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string - + @return: The protein accession number @rtype: string """ @@ -1207,13 +1037,13 @@ class Cache(Db) : def getmrnaAcc(self, protAcc) : """ Gets the mRNA accession number for a given protein accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg protAcc: The protein ID @type protAcc: string - + @return: The mRNA accession number @rtype: string """ @@ -1235,14 +1065,14 @@ class Cache(Db) : """ Inserts the given mRNA and protein accession numbers into the Link table. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. 
- + @arg protAcc: The protein ID @type protAcc: string @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string + @type mrnaAcc: string """ statement = """ diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index d54ec427..626d33c5 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -25,7 +25,7 @@ from mutalyzer import variantchecker from mutalyzer.grammar import Grammar from mutalyzer.config import Config from mutalyzer.output import Output -from mutalyzer import Mapper # Mapper.Converter +from mutalyzer.mapping import Converter from mutalyzer import Retriever # Retriever.Retriever @@ -507,7 +507,7 @@ Mutalyzer batch checker.""" % url) if not skip : try : #process - converter = Mapper.Converter(build, C, O) + converter = Converter(build, C, O) #Also accept chr accNo variant = converter.correctChrVariant(variant) diff --git a/mutalyzer/config.py b/mutalyzer/config.py index 1f5b36cd..cb98f726 100644 --- a/mutalyzer/config.py +++ b/mutalyzer/config.py @@ -98,9 +98,6 @@ class Config(): self.Db.dbNames = config["dbNames"] self.Db.LocalMySQLuser = config["LocalMySQLuser"] self.Db.LocalMySQLhost = config["LocalMySQLhost"] - self.Db.RemoteMySQLuser = config["RemoteMySQLuser"] - self.Db.RemoteMySQLhost = config["RemoteMySQLhost"] - self.Db.UpdateInterval = int(config["UpdateInterval"]) # Set the variables needed by the Output module. self.Output.log = config["log"] diff --git a/mutalyzer/Mapper.py b/mutalyzer/mapping.py similarity index 55% rename from mutalyzer/Mapper.py rename to mutalyzer/mapping.py index ef09e108..718094f1 100644 --- a/mutalyzer/Mapper.py +++ b/mutalyzer/mapping.py @@ -1,93 +1,47 @@ -#!/usr/bin/python - """ -Search for an NM number in the MySQL database, if the version number -matches, get the start and end positions in a variant. Translate these -positions to I{g.} notation if the variant is in I{c.} notation or vice versa. +Work with the mappings of transcripts to chromosomes. 
- - If no end position is present, the start position is assumed to be the end - position. - - If the version number is not found in the database, an error message is - generated and a suggestion for an other version is given. - - If the reference sequence is not found at all, an error is returned. - - If no variant is present, the transcription start and end and CDS end in - I{c.} notation is returned. - - If the variant is not accepted by the nomenclature parser, a parse error - will be printed. - -@requires: sys -@requires: Modules.Config -@requires: Modules.Db -@requires: Modules.Crossmap -@requires: Modules.Serializers.SoapMessage -@requires: Modules.Serializers.Mapping -@requires: Modules.Serializers.Transcript -@requires: Bio.Seq.reverse_complement -@requires: collections.defaultdict - -@todo: Rename Mapper to converter? +Instances of the {Converter} class convert between transcript and chromosomal +locations, using the 'Mapping' table. + +The {Updater} class is an abstract base class, subclassed by {NCBIUpdater}. +Instances of {NCBIUpdater} can load NCBI mapping information from a file and +update the database with this information. """ -import sys # argv -from mutalyzer.grammar import Grammar -from mutalyzer import Db # Db(), get_NM_version(), get_NM_info() -from mutalyzer import Crossmap # Crossmap(), g2x(), x2g(), main2int(), - # offset2int(), info() -from mutalyzer.models import SoapMessage, Mapping, Transcript +import sys from Bio.Seq import reverse_complement from collections import defaultdict -def _sl2il(l) : - """ - Convert a list of strings to a list of integers. +from mutalyzer.grammar import Grammar +from mutalyzer import Db +from mutalyzer import Crossmap +from mutalyzer.models import SoapMessage, Mapping, Transcript - @arg l: A list of strings - @type l: list - @returns: A list of integers - @rtype: list +class Converter(object) : """ + Convert between transcript and chromosomal locations. 
- return [int(s) for s in l] -#__sl2il - -def _getcoords(C, Loc, Type) : - """ - Return main, offset and g positions given either a position in - I{c.} or in I{g.} notation. - - @arg C: A crossmapper - @type C: object - @arg Loc: A location in either I{g.} or I{c.} notation - @type Loc: object - @arg Type: The reference type - @type Type: string - @returns: triple: - 0. Main coordinate in I{c.} notation - 1. Offset coordinate in I{c.} notation - 2. Position in I{g.} notation - @rtype: triple (integer, integer, integer) - """ + Search for an NM number in the MySQL database, if the version number + matches, get the start and end positions in a variant. Translate these + positions to I{g.} notation if the variant is in I{c.} notation or vice + versa. - if Type == 'c' : - main = C.main2int(Loc.MainSgn + Loc.Main) - offset = C.offset2int(Loc.OffSgn + Loc.Offset) - g = C.x2g(main, offset) - main, offset = C.g2x(g) - #if - else : - g = int(Loc.Main) - main, offset = C.g2x(g) - #else - return (main, offset, g) -#__getcoords + - If no end position is present, the start position is assumed to be the + end position. + - If the version number is not found in the database, an error message is + generated and a suggestion for an other version is given. + - If the reference sequence is not found at all, an error is returned. + - If no variant is present, the transcription start and end and CDS end in + I{c.} notation is returned. + - If the variant is not accepted by the nomenclature parser, a parse error + will be printed. -class Converter(object) : + @todo: Refactor anything using {mutalyzer.models} into the {webservice} + module. """ - Converter object docstring - """ - def __init__(self, build, C, O) : """ Initialise the class. 
@@ -100,7 +54,6 @@ class Converter(object) : @arg O: output object @type O: object """ - self.build = None self.__output = O self.__config = C @@ -168,7 +121,6 @@ class Converter(object) : #_parseInput def _populateFields(self, Fields) : - #TODO: ADD Error Messages, unlikely that CDS info is missing """ Create a Mutalyzer compatible exon list. @@ -181,22 +133,17 @@ class Converter(object) : @return: Exon list @rtype: list """ + Fields["exon_starts"] = map(int, Fields["exon_starts"].split(',')) + Fields["exon_stops"] = map(int, Fields["exon_stops"].split(',')) + assert(len(Fields["exon_starts"]) == len(Fields["exon_stops"])) - Fields["exonStarts"] =\ - _sl2il(Fields["exonStarts"].split(',')[:-1]) - Fields["exonEnds"] =\ - _sl2il(Fields["exonEnds"].split(',')[:-1]) - assert(len(Fields["exonStarts"]) == len(Fields["exonEnds"])) - - Fields["cdsStart"] = int(Fields["cdsStart"]) - Fields["cdsEnd"] = int(Fields["cdsEnd"]) - - for i in range(len(Fields["exonStarts"])) : - Fields["exonStarts"][i] += 1 + if Fields['cds_start'] and Fields['cds_stop']: + Fields["cds_start"] = int(Fields["cds_start"]) + Fields["cds_stop"] = int(Fields["cds_stop"]) # Create Mutalyzer compatible exon list Fields["exons"] = [] - for exon in zip(Fields["exonStarts"], Fields["exonEnds"]) : + for exon in zip(Fields["exon_starts"], Fields["exon_stops"]) : Fields["exons"].extend(exon) self.dbFields = Fields @@ -218,9 +165,9 @@ class Converter(object) : """ Fields = dict(zip( - ("acc", "txStart", "txEnd", "cdsStart", "cdsEnd", - "exonStarts", "exonEnds", "geneName", - "chrom", "strand", "protAcc", "version"), + ("transcript", "start", "stop", "cds_start", "cds_stop", + "exon_starts", "exon_stops", "gene", + "chromosome", "orientation", "protein", "version"), values)) return self._populateFields(Fields) #_FieldsFromValues @@ -292,15 +239,14 @@ class Converter(object) : if not self.dbFields: return None CDS = [] - if self.dbFields["cdsStart"] != self.dbFields["cdsEnd"] : - #The counting from 0 conversion. 
- CDS = [self.dbFields["cdsStart"] + 1, self.dbFields["cdsEnd"]] + if self.dbFields["cds_start"] and self.dbFields["cds_stop"]: + CDS = [self.dbFields["cds_start"], self.dbFields["cds_stop"]] mRNA = self.dbFields["exons"] # Convert the strand information to orientation. orientation = 1 - if self.dbFields["strand"] == '-' : + if self.dbFields["orientation"] == '-' : orientation = -1 # Build the crossmapper. @@ -308,6 +254,37 @@ class Converter(object) : return self.crossmap #makeCrossmap + @staticmethod + def _getcoords(C, Loc, Type) : + """ + Return main, offset and g positions given either a position in + I{c.} or in I{g.} notation. + + @arg C: A crossmapper + @type C: object + @arg Loc: A location in either I{g.} or I{c.} notation + @type Loc: object + @arg Type: The reference type + @type Type: string + @returns: triple: + 0. Main coordinate in I{c.} notation + 1. Offset coordinate in I{c.} notation + 2. Position in I{g.} notation + @rtype: triple (integer, integer, integer) + """ + if Type == 'c' : + main = C.main2int(Loc.MainSgn + Loc.Main) + offset = C.offset2int(Loc.OffSgn + Loc.Offset) + g = C.x2g(main, offset) + main, offset = C.g2x(g) + #if + else : + g = int(Loc.Main) + main, offset = C.g2x(g) + #else + return (main, offset, g) + #_getcoords + def _coreMapping(self) : """ Build the Mapping ClassSerializer. @@ -327,14 +304,14 @@ class Converter(object) : # Get the coordinates of the start position startmain, startoffset, start_g = \ - _getcoords(Cross, mutation.StartLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.StartLoc.PtLoc, + self.parseTree.RefType) # If there is an end position, calculate the coordinates. 
if mutation.EndLoc : endmain, endoffset, end_g = \ - _getcoords(Cross, mutation.EndLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.EndLoc.PtLoc, + self.parseTree.RefType) else : end_g, endmain, endoffset = start_g, startmain, startoffset @@ -456,16 +433,16 @@ class Converter(object) : return None # construct the variant description - chromAcc = self.__database.chromAcc(self.dbFields["chrom"]) + chromAcc = self.__database.chromAcc(self.dbFields["chromosome"]) f_change = self._constructChange(False) r_change = self._constructChange(True) - if self.dbFields["strand"] == "+" : + if self.dbFields["orientation"] == "+" : change = f_change else : change = r_change if M.start_g != M.end_g : - if self.dbFields["strand"] == '+' : + if self.dbFields["orientation"] == '+' : var_in_g = "g.%s_%s%s" % (M.start_g, M.end_g, change) else : var_in_g = "g.%s_%s%s" % (M.end_g, M.start_g, change) @@ -554,9 +531,9 @@ class Converter(object) : #balen continue # construct the variant description - accNo = "%s.%s" % (self.dbFields["acc"],self.dbFields["version"]) - geneName = self.dbFields["geneName"] or "" - strand = self.dbFields["strand"] == '+' + accNo = "%s.%s" % (self.dbFields["transcript"],self.dbFields["version"]) + geneName = self.dbFields["gene"] or "" + strand = self.dbFields["orientation"] == '+' startp = self.crossmap.tuple2string((M.startmain, M.startoffset)) endp = self.crossmap.tuple2string((M.endmain, M.endoffset)) @@ -620,3 +597,254 @@ class Converter(object) : return change #_constructChange #Converter + + +class Updater(object): + """ + Abstract base class for updating the mapping information in the database. + + Subclasses should implement the {load} method, loading new mapping + information into the 'MappingTemp' table. The {merge} method merges this + table into the real 'Mapping' table. + """ + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. 
+ @type build: string + @arg config: A configuration object. + @type config: mutalyzer.config.Config + """ + self.build = build + self.config = config + self.db = Db.Mapping(build, config.Db) + #__init__ + + def load(self): + """ + The implementation of this method in subclasses should load mapping + information in the 'MappingTemp' table. + """ + raise NotImplementedError('Implement this method in subclasses') + #load + + def merge(self): + """ + Merge the 'Mapping' and 'MappingTemp' tables. The result is stored in + the 'Mapping' table, of which a backup is created as 'MappingBackup'. + + @todo: Report how much was updated/added. + """ + self.db.merge_update() + #merge +#Updater + + +class NCBIUpdater(Updater): + """ + Update the mapping information in the database with mapping information + from the NCBI. + + Example usage: + + >>> updater = NCBIUpdater('hg19', mutalyzer.config.Config()) + >>> updater.load('/tmp/seq_gene.md', 'GRCh37.p2-Primary Assembly') + >>> updater.merge() + + """ + COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', + 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', + 'feature_name', 'feature_id', 'feature_type', 'group_label', + 'transcript', 'evidence_code'] + + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. + @type build: string + @arg config: A configuration object. + @type config: mutalyzer.config.Config + """ + self.exon_backlog = {} + super(NCBIUpdater, self).__init__(build, config) + #__init__ + + def load(self, mapping_file, assembly): + """ + Load NCBI mapping information from {mapping_file} into the database. + + The NCBI mapping file consists of entries, one per line, in order of + their location in the genome (more specifically by start location). + Every entry has a 'group_name' column, denoting the assembly it is + from. We only use entries where this value is {assembly}. 
+
+        There are four types of entries (for our purposes):
+        - Gene: Name, identifier, and location of a gene.
+        - Transcript: Name, gene id, and location of a transcript.
+        - UTR: Location and transcript of a non-coding exon (or part of it).
+        - CDS: Location and transcript of a coding exon (or part of it).
+
+        A bit troublesome for us is that exons are split in UTR exons and CDS
+        exons, with exons overlapping the UTR/CDS border defined as two
+        separate entries (one of type UTR and one of type CDS).
+
+        Another minor annoyance is that some transcripts (~ 15) are split over
+        two contigs (NT_*). In that case, they are defined by two entries in
+        the file, where we should merge them by taking the start position of
+        the first and the stop position of the second.
+
+        Our strategy is to loop over all entries and store them in three
+        temporary tables (for genes, transcripts, exons). The entries of type
+        UTR and CDS are merged to correct exon entries by keeping a backlog
+        of these entries that can still be modified before storing them in the
+        database.
+
+        The values from the three temporary tables are aggregated into the
+        'MappingTemp' table.
+
+        @arg mapping_file: Path to NCBI mapping information.
+        @type mapping_file: string
+        @arg assembly: Use only entries from this assembly (this is the
+            'group_label' column in the NCBI mapping file).
+        @type assembly: string
+        """
+        self._create_temporary_tables()
+        self._import_mapping(mapping_file, assembly)
+        self._aggregate_mapping()
+        self._drop_temporary_tables()
+    #load
+
+    def _import_mapping(self, mapping_file, assembly):
+        """
+        Import mapping information from {mapping_file} into three temporary
+        tables.
+
+        @note: We issue a separate INSERT statement to the database for every
+            entry. An alternative is to write everything to tab-separated
+            files and load those into the database with LOAD DATA LOCAL INFILE
+            statements.
This alternative seems to be about twice as fast, but + for now we stick with the simpler solution. + """ + self.exon_backlog = {} + + with open(mapping_file, 'r') as mapping: + for line in mapping: + if line.startswith('#'): + continue + entry = dict(zip(self.COLUMNS, line.rstrip().split('\t'))) + + # Only use entries from the given assembly. + if entry['group_label'] != assembly: + continue + + # Only use entries on the normal chromosomes. + try: + int(entry['chromosome']) + except ValueError: + if entry['chromosome'] not in 'XY': + continue + + if entry['feature_type'] == 'GENE': + self._import_gene(entry) + elif entry['feature_type'] == 'RNA': + self._import_transcript(entry) + elif entry['feature_type'] in ('UTR', 'CDS'): + self._import_exon(entry) + + self._import_exon_backlog() + #_import_mapping + + def _import_gene(self, entry): + """ + Insert a gene in the database. + """ + self.db.ncbi_import_gene(entry['feature_id'], entry['feature_name']) + #_import_gene + + def _import_transcript(self, entry): + """ + Insert a transcript in the database. + """ + self.db.ncbi_import_transcript( + entry['feature_name'], entry['feature_id'], entry['chromosome'], + int(entry['start']), int(entry['stop']), entry['orientation']) + #_import_transcript + + def _import_exon(self, entry): + """ + Instead of directly inserting each exon in the database, we keep them + in a backlog of at most one exon per transcript. Exons are taken from + the backlog and inserted in the database only when we passed their + genomic stop location by more than one position. + + This way, exons in the backlog can be merged when they are on a + UTR/CDS boundary. 
+ """ + cds = entry['feature_type'] == 'CDS' + entry['start'] = int(entry['start']) + entry['stop'] = int(entry['stop']) + entry['protein'] = entry['feature_name'] if cds else None + entry['cds'] = cds + + self._import_exon_backlog(entry['start'] - 1) + + try: + previous = self.exon_backlog[entry['transcript']] + if previous['cds'] != entry['cds'] \ + and previous['stop'] == entry['start'] - 1: + if entry['cds']: + entry['cds_start'] = entry['start'] + else: + entry['cds_stop'] = previous['stop'] + if 'cds_start' in previous: + entry['cds_start'] = previous['cds_start'] + entry['protein'] = previous['protein'] + entry['start'] = previous['start'] + except KeyError: + pass + + self.exon_backlog[entry['transcript']] = entry + #_import_exon + + def _import_exon_backlog(self, up_to_position=None): + """ + Import exons from the backlog in the database. If the optional + argument {up_to_position} is set, only import exons with a stop + position before this value. + + We explicitely remove imported exons from the backlog, because it + might be suboptimal to keep more than 30,000 exons in there. + """ + for transcript, exon in self.exon_backlog.items(): + if not up_to_position or exon['stop'] < up_to_position: + del self.exon_backlog[transcript] + del exon['cds'] + self.db.ncbi_import_exon( + exon['transcript'], exon['start'], exon['stop'], + exon['cds_start'] if 'cds_start' in exon else None, + exon['cds_stop'] if 'cds_stop' in exon else None, + exon['protein'] or None) + #_import_exon_backlog + + def _aggregate_mapping(self): + """ + Aggregate the genes, transcripts and exons from their temporary + tables into the 'MappingTemp' table. + """ + self.db.ncbi_aggregate_mapping() + #_aggregate_mapping + + def _create_temporary_tables(self): + """ + Create temporary tables needed for loading the NCBI mapping data. 
+ """ + self.db.ncbi_create_temporary_tables() + #_create_temporary_tables + + def _drop_temporary_tables(self): + """ + Drop temporary tables needed for loading the NCBI mapping data. + """ + self.db.ncbi_drop_temporary_tables() + #_drop_temporary_tables +#NCBIUpdater diff --git a/mutalyzer/util.py b/mutalyzer/util.py index dd65e0d5..a247a16e 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -19,9 +19,11 @@ General utility functions. """ +import sys import os import math import time +import inspect from itertools import izip_longest import Bio.Seq @@ -722,6 +724,35 @@ def message_info(message): #message_info +def format_usage(usage=None, keywords={}): + """ + Format a usage string suitable for printing to the console. Some magic + is employed so you can usually just call this function without arguments + to have the calling module's docstring pretty-printed. + + @kwarg usage: The string to format. If omitted, the calling module's + docstring is used. + @type usage: string + @kwarg keywords: A dictionary of (keyword, value) pairs used to format + the usage string. If it does not contain the key 'command', it is + added with the value of sys.argv[0]. + @type keywords: dictionary(string, string) + + @return: Formatted usage string. This is {usage} with any entries from + {keywords} replaced and cut-off at the first occurence of two + consecutive empty lines. + @rtype: string + """ + if not usage: + caller = inspect.stack()[1] + usage = inspect.getmodule(caller[0]).__doc__ + if not 'command' in keywords: + keywords['command'] = sys.argv[0] + + return usage.split('\n\n\n')[0].strip().format(**keywords) +#format_usage + + def slow(f): """ Decorator for slow tests. 
This makes them to pass immediately, without diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py index 181c4af9..b31c47c5 100644 --- a/mutalyzer/webservice.py +++ b/mutalyzer/webservice.py @@ -38,7 +38,7 @@ from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync from mutalyzer import variantchecker from mutalyzer import Db -from mutalyzer import Mapper +from mutalyzer.mapping import Converter from mutalyzer import Retriever from mutalyzer import GenRecord from mutalyzer.models import * @@ -334,7 +334,7 @@ class MutalyzerService(DefinitionBase): "Reveived request mappingInfo(%s %s %s %s)" % ( LOVD_ver, build, accNo, variant)) - conv = Mapper.Converter(build, self._config, L) + conv = Converter(build, self._config, L) result = conv.mainMapping(accNo, variant) L.addMessage(__file__, -1, "INFO", @@ -372,7 +372,7 @@ class MutalyzerService(DefinitionBase): "Received request transcriptInfo(%s %s %s)" % (LOVD_ver, build, accNo)) - converter = Mapper.Converter(build, self._config, O) + converter = Converter(build, self._config, O) T = converter.mainTranscript(accNo) O.addMessage(__file__, -1, "INFO", @@ -497,7 +497,7 @@ class MutalyzerService(DefinitionBase): O.addMessage(__file__, -1, "INFO", "Received request cTogConversion(%s %s)" % ( build, variant)) - converter = Mapper.Converter(build, self._config, O) + converter = Converter(build, self._config, O) variant = converter.correctChrVariant(variant) if "c." 
in variant : diff --git a/mutalyzer/website.py b/mutalyzer/website.py index faa35853..1fddc040 100644 --- a/mutalyzer/website.py +++ b/mutalyzer/website.py @@ -35,7 +35,7 @@ from mutalyzer.grammar import Grammar from mutalyzer import webservice from mutalyzer import variantchecker from mutalyzer.output import Output -from mutalyzer import Mapper +from mutalyzer.mapping import Converter from mutalyzer import Db from mutalyzer import Scheduler from mutalyzer import Retriever @@ -300,7 +300,7 @@ class GetGS: # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. # We are only interested in the legend #Mutalyzer.process(str(i.mutationName), config, O) @@ -443,7 +443,7 @@ class PositionConverter: i = web.input(build='', variant='') # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. return self.position_converter(i.build, str(i.variant)) def position_converter(self, build='', variant=''): @@ -472,7 +472,7 @@ class PositionConverter: } if build and variant: - converter = Mapper.Converter(build, config, output) + converter = Converter(build, config, output) #Convert chr accNo to NC number variant = converter.correctChrVariant(variant) @@ -564,16 +564,16 @@ class VariantInfo: 'Received %s:%s (LOVD_ver %s, build %s)' \ % (acc, var, LOVD_ver, build)) - Converter = Mapper.Converter(build, config, output) + converter = Converter(build, config, output) result = '' # If no variant is given, return transcription start, transcription # end and CDS stop in c. notation. 
if var: - ret = Converter.mainMapping(acc, var) + ret = converter.mainMapping(acc, var) else: - ret = Converter.giveInfo(acc) + ret = converter.giveInfo(acc) if ret: result = '%i\n%i\n%i' % ret @@ -662,7 +662,7 @@ class Check: output.addMessage(__file__, -1, 'INFO', 'Received variant %s' % name) # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. variantchecker.check_variant(str(name), config, output) output.addMessage(__file__, -1, 'INFO', 'Finished processing variant %s' % name) diff --git a/setup.py b/setup.py index 62ea18aa..2899ae3d 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup( scripts=['bin/mutalyzer', 'bin/mutalyzer-batchd', 'bin/mutalyzer-cache-sync', - 'bin/mutalyzer-ucsc-update', + 'bin/mutalyzer-mapping-update', 'bin/mutalyzer-webservice.wsgi', 'bin/mutalyzer-website.wsgi'], zip_safe=False diff --git a/tests/test_converter.py b/tests/test_mapping.py similarity index 92% rename from tests/test_converter.py rename to tests/test_mapping.py index 17d4c5c5..036b46f6 100644 --- a/tests/test_converter.py +++ b/tests/test_mapping.py @@ -1,5 +1,5 @@ """ -Tests for the converter (Mapper) module. +Tests for the mapping module. """ @@ -8,12 +8,12 @@ from nose.tools import * from mutalyzer.config import Config from mutalyzer.output import Output -from mutalyzer.Mapper import Converter +from mutalyzer.mapping import Converter class TestConverter(): """ - Test the converter (Mapper) module. + Test the Converter class. """ def setUp(self): """ -- GitLab