diff --git a/INSTALL b/INSTALL index 72856076359cfbfdb6abcb51340a8f1546ff5548..93b1f1c73912bab8c506b27762226a0b41d6dc3e 100644 --- a/INSTALL +++ b/INSTALL @@ -12,14 +12,14 @@ throughout. The following is an overview of default locations used by Mutalyzer: - Package files /usr/local/lib/python2.6/dist-packages/... - Configuration /etc/mutalyzer/config - Log file /var/log/mutalyzer.log - Cache directory /var/cache/mutalyzer - Batchd init script /etc/init.d/mutalyzer-batchd - UCSC update crontab /etc/cron.d/mutalyzer-ucsc-update - Apache configuration /etc/apache2/conf.d/mutalyzer.conf - Static website files /var/www/mutalyzer/base + Package files /usr/local/lib/python2.6/dist-packages/... + Configuration /etc/mutalyzer/config + Log file /var/log/mutalyzer.log + Cache directory /var/cache/mutalyzer + Batchd init script /etc/init.d/mutalyzer-batchd + Mapping update crontab /etc/cron.d/mutalyzer-mapping-update + Apache configuration /etc/apache2/conf.d/mutalyzer.conf + Static website files /var/www/mutalyzer/base The default database user is 'mutalyzer' with no password and the database names are 'mutalyzer', 'hg18', and 'hg19'. @@ -144,7 +144,9 @@ directory: sudo bash extras/post-upgrade If you installed Mutalyzer in a development environment, you don't have to -do anything to upgrade (apart from updating the source code). +do anything to upgrade except for running the automated migration scripts: + + for M in extras/migrations/*.migration; do sudo $M migrate; done Additional notes diff --git a/README b/README index 00f3e6d4ef64a15c1c8b8a89a826f8d5cecfa1f2..f703d5aece48bab1a3ed88e9cfbabd1f8b2f307d 100644 --- a/README +++ b/README @@ -40,8 +40,8 @@ Development notes Todo list: - Improve the web interface design :) - Test all uses of mkstemp(). -- Use naming conventions for modules Crossmap, Db, File, GenRecord, Mapper, - Retriever, Scheduler. +- Use naming conventions for modules Crossmap, Db, File, GenRecord, Retriever + and Scheduler. - Use standard logging module, with rotating functionality. Race conditions on the log file are probably a problem in the current setup. Instead of that rotating, we could also use logrotate: @@ -49,6 +49,8 @@ Todo list: - Setup continuous integration. Currently, I'm most impressed with Hudson. http://hudson-ci.org/ http://www.rhonabwy.com/wp/2009/11/04/setting-up-a-python-ci-server-with-hudson/ + Or perhaps Jenkins. + http://jenkins-ci.org/ - Use monit on the production server. http://mmonit.com/monit/ - Migrate Javascript to JQuery. @@ -60,13 +62,16 @@ Todo list: for example jinja. - Develop a large test suite. - Create a web interface url to watch the progress of a batch job. +- Create webservices for the batch jobs (steal ideas from Jeroen's DVD + webservice). - Use virtualenv? - Use SQLAlchemy? - Password for MySQL user. - In deployment, remove old versions of Mutalyzer package? - Use https protocol. - Check for os.path.join vulnerabilities. -- Solution for database schema migration on version updates. +- Use a standard solution for the database migrations in extras/migrations. +- Use something like Sphinx to generate development documentation from code. Code style guide: - Follow PEP 8 (code) and PEP 257 (docstrings). diff --git a/bin/mutalyzer b/bin/mutalyzer index 912ab2c147a516c0c0928bce0efe9556f0be6542..36af8af7aad61ccd8cea79c3ee8f4493822433c8 100755 --- a/bin/mutalyzer +++ b/bin/mutalyzer @@ -4,7 +4,10 @@ Command-line interface to the nomenclature checker. 
Usage: - ./check 'AB026906.1:c.274delG' + {command} variant + + variant: The variant description to check. + @todo: Refactor this file. """ @@ -16,6 +19,7 @@ import os import mutalyzer from mutalyzer.output import Output from mutalyzer.config import Config +from mutalyzer.util import format_usage def main(cmd): @@ -181,7 +185,7 @@ def main(cmd): if __name__ == '__main__': if len(sys.argv) < 2: - print 'Please provide a variant' + print format_usage() sys.exit(1) main(sys.argv[1]) diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync index b53d709c4071a682d287eb2f9a4f0a5a29264637..bca5c1d7399de40b78060f792ef50750cb6f97cd 100755 --- a/bin/mutalyzer-cache-sync +++ b/bin/mutalyzer-cache-sync @@ -4,13 +4,14 @@ Synchronize the database cache with other Mutalyzer instances. Usage: - ./mutalyzer-cache-sync remote_wsdl url_template days + {command} remote_wsdl url_template days remote_wsdl: Location of the remote WSDL description. - url_template: URL to remote downloads, where {file} is to be substituted + url_template: URL to remote downloads, where {{file}} is to be substituted by the filename. days: Number of days to go back in the remote cache. + This program is intended to be run daily from cron. Example: 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7 @@ -24,6 +25,7 @@ from mutalyzer.config import Config from mutalyzer.output import Output from mutalyzer.sync import CacheSync from mutalyzer import Db +from mutalyzer.util import format_usage def cache_sync(remote_wsdl, url_template, days): @@ -40,7 +42,7 @@ def cache_sync(remote_wsdl, url_template, days): if __name__ == '__main__': if len(sys.argv) < 4: - print __doc__.strip() + print format_usage() sys.exit(1) try: days = int(sys.argv[3]) diff --git a/bin/mutalyzer-mapping-update b/bin/mutalyzer-mapping-update new file mode 100755 index 0000000000000000000000000000000000000000..e46a89c4d7f723a665b03050371148ec8548aba4 --- /dev/null +++ b/bin/mutalyzer-mapping-update @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +""" +Update the database with mapping information from the NCBI. + +Usage: + {command} database mapping_file assembly + + database: Database to update (i.e. 'hg18' or 'hg19'). + mapping_file: Path to the NCBI mapping information. + assembly: Use only entries from this assembly (this is the 'group_name' + column in the NCBI mapping file). + + +This program is intended to be run daily from cron. Example: + + 25 6 * * * mutalyzer-mapping-update hg19 /tmp/seq_gene.md reference +""" + + +import sys + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.mapping import NCBIUpdater +from mutalyzer.util import format_usage + + +def main(database, mapping_file, assembly): + """ + Update the database with information from the NCBI. + + @arg database: Database to update (i.e. 'hg18' or 'hg19'). + @type database: string + @arg mapping_file: Path to NCBI mapping information. + @type mapping_file: string + @arg assembly: Use only entries from this assembly (this is the + 'group_name' column in the NCBI mapping file). + @type assembly: string + + @todo: Also report how much was added/updated. 
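The {command} placeholder used in these usage strings is filled in by mutalyzer.util.format_usage(), which this diff imports but does not show. A minimal sketch of what such a helper presumably looks like (an assumption, not the actual mutalyzer.util implementation); it would also explain why the literal braces in the url_template description above are escaped as {{file}}:

    # Hypothetical sketch of format_usage(); the real helper lives in
    # mutalyzer/util.py and may differ.
    import os
    import sys

    def format_usage(doc=None):
        # Default to the docstring of the script being executed.
        if doc is None:
            doc = sys.modules['__main__'].__doc__
        command = os.path.basename(sys.argv[0])
        # str.format() fills in {command}; escaped braces such as {{file}}
        # come out as literal {file} in the printed usage text.
        return doc.strip().format(command=command)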
+ """ + config = Config() + output = Output(__file__, config.Output) + output.addMessage(__file__, -1, 'INFO', + 'Starting NCBI mapping data update') + + updater = NCBIUpdater(database, config) + updater.load(mapping_file, assembly) + updater.merge() + + output.addMessage(__file__, -1, 'INFO', 'NCBI mapping data update end') + + +if __name__ == '__main__': + if len(sys.argv) != 4: + print format_usage() + sys.exit(1) + main(*sys.argv[1:]) diff --git a/bin/mutalyzer-ucsc-update b/bin/mutalyzer-ucsc-update deleted file mode 100755 index 6fdff9cbef6e4fa4020d9dd6f95400abd418905b..0000000000000000000000000000000000000000 --- a/bin/mutalyzer-ucsc-update +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python - -""" -Get updates on mapping information from the UCSC. - -This program is intended to be run daily from cron. Example: - - 25 6 * * * mutalyzer-ucsc-update -""" - - -import sys -import os - -from mutalyzer.config import Config -from mutalyzer.output import Output -from mutalyzer.Db import Remote -from mutalyzer.Db import Update - - -def main(): - """ - Update the database with information from the UCSC. - """ - config = Config() - output = Output(__file__, config.Output) - output.addMessage(__file__, -1, 'INFO', - 'Starting UCSC mapping data update') - - for database in config.Db.dbNames: - remote = Remote(database, config.Db) - tempfile = remote.get_Update() - - update = Update(database, config.Db) - update.load_Update(tempfile) - - count_updates = update.count_Updates() - if count_updates: - output.addMessage(__file__, -1, 'INFO', - '%i updates found' % count_updates) - update.backup_cdsUpdates() - count_cds_updates = update.count_cdsUpdates() - if count_cds_updates: - output.addMessage(__file__, -1, 'INFO', '%i CDS updates ' \ - 'found, backing up' % count_cds_updates) - update.merge_cdsUpdates() - - update.merge_Update() - - output.addMessage(__file__, -1, 'INFO', 'UCSC mapping data update end') - - -if __name__ == '__main__': - main() diff --git a/extras/config.example b/extras/config.example index bce2c1e4905b66827080eb43a7cba9138c0951ec..4bbecc30a65e772e202e1d4aa04c499a27bc4580 100644 --- a/extras/config.example +++ b/extras/config.example @@ -42,14 +42,8 @@ LocalMySQLuser = "mutalyzer" # Host name for the local databases. LocalMySQLhost = "localhost" -# MySQL username for the UCSC database. -RemoteMySQLuser = "genome" -# Host name for the UCSC database. -RemoteMySQLhost = "genome-mysql.cse.ucsc.edu" -# Retrieve all entries modified within a certain number of days. 
-UpdateInterval = 7 # diff --git a/extras/cron.d/mutalyzer-mapping-update b/extras/cron.d/mutalyzer-mapping-update new file mode 100644 index 0000000000000000000000000000000000000000..896f304d933c3d368c89676906c361f3b87df0d1 --- /dev/null +++ b/extras/cron.d/mutalyzer-mapping-update @@ -0,0 +1,3 @@ +# Update the mapping database every Sunday morning at 03:25 and 04:25 +25 3 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg18 /tmp/seq_gene.md reference +25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' diff --git a/extras/cron.d/mutalyzer-ucsc-update b/extras/cron.d/mutalyzer-ucsc-update deleted file mode 100644 index 141eecf61558ce8e6c2ae70a36e630cd262884a2..0000000000000000000000000000000000000000 --- a/extras/cron.d/mutalyzer-ucsc-update +++ /dev/null @@ -1,2 +0,0 @@ -# Update the UCSC database every morning at 06:25 -25 6 * * * www-data <MUTALYZER_BIN_UCSC_UPDATE> diff --git a/extras/migrations/001-db-gbinfo-add-created.migration b/extras/migrations/001-db-gbinfo-add-created.migration new file mode 100755 index 0000000000000000000000000000000000000000..bbef9018d5747334b57d51ee81642c863b1a026e --- /dev/null +++ b/extras/migrations/001-db-gbinfo-add-created.migration @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +""" +Add a column and index 'created' to the 'GBInfo' table. + +Usage: + ./001-db-gbinfo-add-created.migration [migrate] +""" + + +import sys +import MySQLdb + + +def _connect(): + try: + connection = MySQLdb.connect(host='localhost', + user='mutalyzer', + passwd='', + db='mutalyzer') + except MySQLdb.Error, e: + print 'Error %d: %s' % (e.args[0], e.args[1]) + sys.exit(1) + return connection + + +def check(): + """ + Check if migration is needed. + """ + connection = _connect() + cursor = connection.cursor() + cursor.execute('SHOW COLUMNS FROM GBInfo WHERE field = "created";') + has_column = len(cursor.fetchall()) > 0 + cursor.execute('SHOW INDEX FROM GBInfo WHERE Key_name = "created";') + has_index = len(cursor.fetchall()) > 0 + connection.close() + if has_column != has_index: + print 'Installation is not in a recognizable state. Fix manually.' + sys.exit(1) + return not has_column + + +def migrate(): + """ + Perform migration. + """ + connection = _connect() + cursor = connection.cursor() + cursor.execute(""" + ALTER TABLE GBInfo + ADD COLUMN created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + ADD INDEX created (created);""") + connection.commit() + connection.close() + print 'Added column mutalyzer.GBInfo.created' + print 'Added index on mutalyzer.GBInfo.created' + + +if __name__ == '__main__': + needed = check() + if needed: + print 'This migration is needed.' + if len(sys.argv) > 1 and sys.argv[1] == 'migrate': + print 'Performing migration.' + migrate() + print 'Performed migration.' + else: + print 'This migration is not needed.' diff --git a/extras/migrations/002-db-map-to-mapping.migration b/extras/migrations/002-db-map-to-mapping.migration new file mode 100755 index 0000000000000000000000000000000000000000..fff956e026350c03913a6f936ed125115086d0b6 --- /dev/null +++ b/extras/migrations/002-db-map-to-mapping.migration @@ -0,0 +1,160 @@ +#!/usr/bin/env python + +""" +Convert the old 'map' tables to the new 'Mapping' tables.
+ +Usage: + ./002-db-map-to-mapping.migration [migrate] + +This is basically just a renaming of columns, plus two adjustments: +- use NULL for missing values +- add 1 to all chromosomal start positions. + +The following tables on hg18 and hg19 are dropped: +- gbStatus +- map_cdsBackup +- refGene +- refLink + +The map tables are renamed to map_backup. +""" + + +import sys +import MySQLdb + + +def _connect(db): + try: + connection = MySQLdb.connect(host='localhost', + user='mutalyzer', + passwd='', + db=db) + except MySQLdb.Error, e: + print 'Error %d: %s' % (e.args[0], e.args[1]) + sys.exit(1) + return connection + + +def _exon_starts(starts): + # Convert the comma-terminated exonStarts string to 1-based positions. + updated = [] + for start in starts.split(',')[:-1]: + updated.append(str(int(start) + 1)) + return ','.join(updated) + + +def _exon_stops(stops): + # Strip the trailing comma from the exonEnds string (the positions + # themselves are already 1-based inclusive). + if stops[-1] == ',': + return stops[:-1] + return stops + + +def _check(db): + # Todo: Also check if 'map' is gone. + connection = _connect(db) + cursor = connection.cursor() + cursor.execute('SHOW TABLES LIKE "Mapping";') + ok = len(cursor.fetchall()) > 0 + connection.close() + return ok + + +def _migrate(db): + connection = _connect(db) + cursor = connection.cursor() + cursor.execute(""" + CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6) DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) + );""") + select_cursor = connection.cursor(MySQLdb.cursors.DictCursor) + select_cursor.execute(""" + SELECT + geneName as gene, + acc as transcript, + version as version, + chrom as chromosome, + strand as orientation, + txStart + 1 as start, + txEnd as stop, + NULLIF(cdsStart + 1, cdsEnd + 1) as cds_start, + NULLIF(cdsEnd, cdsStart) as cds_stop, + exonStarts as exon_starts, + exonEnds as exon_stops, + NULLIF(protAcc, '') as protein, + 'UCSC' as source + FROM + map;""") + count = 0 + while True: + r = select_cursor.fetchone() + if r is None: + break + count += 1 + cursor.execute(""" + INSERT INTO Mapping + (gene, transcript, version, chromosome, orientation, start, stop, + cds_start, cds_stop, exon_starts, exon_stops, protein, source) + VALUES + (%s, %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s, %s);""", + (r['gene'], r['transcript'], r['version'], r['chromosome'], + r['orientation'], r['start'], r['stop'], r['cds_start'], + r['cds_stop'], _exon_starts(r['exon_starts']), _exon_stops(r['exon_stops']), + r['protein'], r['source'])) + + print 'Converted table map to table Mapping on %s (%d entries)' % (db, count) + + cursor.execute('DROP TABLE IF EXISTS gbStatus, map_cdsBackup, refGene, refLink') + cursor.execute('RENAME TABLE map TO map_backup') + + print 'Dropped tables gbStatus, map_cdsBackup, refGene, refLink on %s' % db + print 'Renamed table map to map_backup on %s' % db + + select_cursor.close() + cursor.close() + connection.commit() + connection.close() + + +def check(): + """ + Check if migration is needed. + """ + hg18_ok = _check('hg18') + hg19_ok = _check('hg19') + if hg18_ok != hg19_ok: + print 'Installation is not in a recognizable state. Fix manually.' + sys.exit(1) + return not hg18_ok + + +def migrate(): + """ + Perform migration.
+ """ + _migrate('hg18') + _migrate('hg19') + + +if __name__ == '__main__': + needed = check() + if needed: + print 'This migration is needed.' + if len(sys.argv) > 1 and sys.argv[1] == 'migrate': + print 'Performing migration.' + migrate() + print 'Performed migration.' + else: + print 'This migration is not needed.' diff --git a/extras/migrations/003-config-remove-ucsc.migration b/extras/migrations/003-config-remove-ucsc.migration new file mode 100755 index 0000000000000000000000000000000000000000..d3b37aa0f3fdc8c34bf24c05fe9c58c09d2e5902 --- /dev/null +++ b/extras/migrations/003-config-remove-ucsc.migration @@ -0,0 +1,25 @@ +#!/bin/bash + +# Remove UCSC database values from the configuration file. +# +# Usage: +# ./003-config-remove-ucsc.migration [migrate] + +if [ -e /etc/mutalyzer/config ] && $(grep -q 'MySQL username for the UCSC database' /etc/mutalyzer/config); then + echo 'This migration is needed.' + if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' + echo 'Copying /etc/mutalyzer/config to /etc/mutalyzer/config.backup' + cp /etc/mutalyzer/config /etc/mutalyzer/config.backup + sed -i '/MySQL username for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Host name for the UCSC database/d' /etc/mutalyzer/config + sed -i '/Retrieve all entries modified within a certain number of days/d' /etc/mutalyzer/config + sed -i '/RemoteMySQLuser =/d' /etc/mutalyzer/config + sed -i '/^RemoteMySQLhost =/d' /etc/mutalyzer/config + sed -i '/^UpdateInterval =/d' /etc/mutalyzer/config + echo 'Removed all UCSC database configuration values from /etc/mutalyzer/config' + echo 'Performed migration.' + fi +else + echo 'This migration is not needed.' +fi diff --git a/extras/migrations/004-cron-ucsc-to-ncbi.migration b/extras/migrations/004-cron-ucsc-to-ncbi.migration new file mode 100755 index 0000000000000000000000000000000000000000..5ba44c1ab66e3e30d096f7bfe1388d7a4113edaa --- /dev/null +++ b/extras/migrations/004-cron-ucsc-to-ncbi.migration @@ -0,0 +1,24 @@ +#!/bin/bash + +# Remove UCSC update from cron and install NCBI update. +# +# Usage: +# ./004-cron-ucsc-to-ncbi.migration [migrate] + +if [ -e /etc/cron.d/mutalyzer-ucsc-update ] && $(grep -v -q '^#' /etc/cron.d/mutalyzer-ucsc-update); then + echo 'This migration is needed.' + if [ "$1" = 'migrate' ]; then + echo 'Performing migration.' + sed -i 's/^/#/' /etc/cron.d/mutalyzer-ucsc-update + echo 'Commented all lines in /etc/cron.d/mutalyzer-ucsc-update' + if [ ! -e /etc/cron.d/mutalyzer-mapping-update ]; then + BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) + cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update + sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update + echo 'Installed /etc/cron.d/mutalyzer-mapping-update' + fi + echo 'Performed migration.' + fi +else + echo 'This migration is not needed.' +fi diff --git a/extras/migrations/README b/extras/migrations/README new file mode 100644 index 0000000000000000000000000000000000000000..e514423f3eda8b60a79e6aa430fa8984bfd9085d --- /dev/null +++ b/extras/migrations/README @@ -0,0 +1,18 @@ +Automated migration scripts +=========================== + +This directory contains scripts to automate the migration of a Mutalyzer +installation to the latest version. Things that might need a migration +include: + +- database schema +- database data +- configuration files +- cache +- ? + +All migration scripts accept as parameters a flag 'migrate'. 
Running a +script without parameters just checks if the migration is needed. Running +a script with 'migrate' does the actual migration (only if needed). + +Performing multiple migrations should be done in the order of their names. diff --git a/extras/post-install.sh b/extras/post-install.sh index 6b0735c4f67d4ad1a4fc46fc64a74a3c52d70c6c..4d359e9fa0b92dbd7c4b0fd2f42f23e970936380 100644 --- a/extras/post-install.sh +++ b/extras/post-install.sh @@ -22,7 +22,7 @@ set -e PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') BIN_BATCHD=$(which mutalyzer-batchd) BIN_CACHE_SYNC=$(which mutalyzer-cache-sync) -BIN_UCSC_UPDATE=$(which mutalyzer-ucsc-update) +BIN_MAPPING_UPDATE=$(which mutalyzer-mapping-update) BIN_WEBSITE=$(which mutalyzer-website.wsgi) BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) @@ -55,10 +55,10 @@ update-rc.d -f mutalyzer-batchd remove update-rc.d mutalyzer-batchd defaults 98 02 echo "Installing crontab" -cp extras/cron.d/mutalyzer-ucsc-update /etc/cron.d/mutalyzer-ucsc-update -sed -i -e "s@<MUTALYZER_BIN_UCSC_UPDATE>@${BIN_UCSC_UPDATE}@g" /etc/cron.d/mutalyzer-ucsc-update cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync +cp extras/cron.d/mutalyzer-mapping-update /etc/cron.d/mutalyzer-mapping-update +sed -i -e "s@<MUTALYZER_BIN_MAPPING_UPDATE>@${BIN_MAPPING_UPDATE}@g" /etc/cron.d/mutalyzer-mapping-update echo "Creating /etc/apache2/conf.d/mutalyzer.conf" cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf @@ -79,130 +79,29 @@ cat << EOF | mysql -u root -p FLUSH PRIVILEGES; EOF -mkdir -p /tmp/mutalyzer-install -pushd /tmp/mutalyzer-install - -# Do hg18 -mkdir -p hg18 -pushd hg18 - -echo "Creating and populating hg18 database" - -# Then retrieve the refLink table from the UCSC website (hg18) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refLink.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refLink.txt.gz - -# For Variant_info to work, you need the following files too (hg18) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/gbStatus.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/gbStatus.txt.gz - -# Create table and load data (hg18) -mysql -u mutalyzer -D hg18 < refLink.sql -zcat refLink.txt.gz | mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refLink;' - -mysql -u mutalyzer -D hg18 < gbStatus.sql -zgrep mRNA gbStatus.txt.gz > gbStatus.mrna.txt -mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "gbStatus.mrna.txt" INTO TABLE gbStatus;' - -mysql -u mutalyzer -D hg18 < refGene.sql -zcat refGene.txt.gz | mysql -u mutalyzer -D hg18 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refGene;' - -# Combine the mapping info into one table (hg18) -cat << EOF | mysql -u mutalyzer -D hg18 - CREATE TABLE map - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc; - CREATE TABLE map_cdsBackup ( - acc char(12) NOT NULL DEFAULT '', - version smallint(6) unsigned NOT NULL DEFAULT '0', - txStart int(11) unsigned NOT NULL DEFAULT '0', - txEnd int(11) unsigned NOT NULL DEFAULT '0', - cdsStart int(11) unsigned NOT 
NULL DEFAULT '0', - cdsEnd int(11) unsigned NOT NULL DEFAULT '0', - exonStarts longblob NOT NULL, - exonEnds longblob NOT NULL, - geneName varchar(255) NOT NULL DEFAULT '', - chrom varchar(255) NOT NULL DEFAULT '', - strand char(1) NOT NULL DEFAULT '', - protAcc varchar(255) NOT NULL DEFAULT '' - ); -EOF - -popd -rm -Rf /tmp/mutalyzer-install/hg18 - -# Do hg19 -mkdir -p hg19 -pushd hg19 - -echo "Creating and populating hg19 database" - -# Then retrieve the refLink table from the UCSC website (hg19) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refLink.txt.gz - -# For Variant_info to work, you need the following files too (hg19) -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.sql -wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gbStatus.txt.gz - -# Create table and load data (hg19) -mysql -u mutalyzer -D hg19 < refLink.sql -zcat refLink.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refLink;' - -mysql -u mutalyzer -D hg19 < gbStatus.sql -zgrep mRNA gbStatus.txt.gz > gbStatus.mrna.txt -mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "gbStatus.mrna.txt" INTO TABLE gbStatus;' - -mysql -u mutalyzer -D hg19 < refGene.sql -zcat refGene.txt.gz | mysql -u mutalyzer -D hg19 -e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE refGene;' - -# Combine the mapping info into one table (hg19) -cat << EOF | mysql -u mutalyzer -D hg19 - CREATE TABLE map - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc; - CREATE TABLE map_cdsBackup ( - acc char(12) NOT NULL DEFAULT '', - version smallint(5) unsigned NOT NULL DEFAULT '0', - txStart int(10) unsigned NOT NULL DEFAULT '0', - txEnd int(10) unsigned NOT NULL DEFAULT '0', - cdsStart int(10) unsigned NOT NULL DEFAULT '0', - cdsEnd int(10) unsigned NOT NULL DEFAULT '0', - exonStarts longblob NOT NULL, - exonEnds longblob NOT NULL, - geneName varchar(255) NOT NULL DEFAULT '', - chrom varchar(255) NOT NULL DEFAULT '', - strand char(1) NOT NULL DEFAULT '', - protAcc varchar(255) NOT NULL DEFAULT '' - ); -EOF - -popd - -popd -rm -Rf /tmp/mutalyzer-install - -# Create ChrName tables (hg18) +# Create ChrName and Mapping table (hg18) cat << EOF | mysql -u mutalyzer -D hg18 CREATE TABLE ChrName ( AccNo char(20) NOT NULL, name char(20) NOT NULL, PRIMARY KEY (AccNo) ); +CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6) DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) +); INSERT INTO ChrName (AccNo, name) VALUES ('NC_000001.9', 'chr1'), ('NC_000002.10', 'chr2'), @@ -233,13 +132,33 @@ INSERT INTO ChrName (AccNo, name) VALUES ('NT_113959.1', 'chr22_h2_hap1'); EOF -# Create ChrName tables (hg19) +# Populate Mapping table with UCSC data (hg18) +wget 
"ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md +$(BIN_MAPPING_UPDATE hg18 /tmp/seq_gene.md reference) + +# Create ChrName and Mapping table (hg19) cat << EOF | mysql -u mutalyzer -D hg19 CREATE TABLE ChrName ( AccNo char(20) NOT NULL, name char(20) NOT NULL, PRIMARY KEY (AccNo) ); +CREATE TABLE Mapping ( + gene varchar(255) DEFAULT NULL, + transcript varchar(20) NOT NULL DEFAULT '', + version smallint(6) DEFAULT NULL, + chromosome varchar(40) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + start int(11) unsigned DEFAULT NULL, + stop int(11) unsigned DEFAULT NULL, + cds_start int(11) unsigned DEFAULT NULL, + cds_stop int(11) unsigned DEFAULT NULL, + exon_starts longblob NOT NULL, + exon_stops longblob NOT NULL, + protein varchar(20) DEFAULT NULL, + source varchar(20) DEFAULT NULL, + INDEX (transcript) +); INSERT INTO ChrName (AccNo, name) VALUES ('NC_000001.10', 'chr1'), ('NC_000002.11', 'chr2'), @@ -276,6 +195,10 @@ INSERT INTO ChrName (AccNo, name) VALUES ('NT_167251.1', 'chr17_ctg5_hap1'); EOF +# Populate Mapping table with UCSC data (hg19) +wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md +$(BIN_MAPPING_UPDATE hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' + echo "Creating tables in mutalyzer database" # Create mutalyzer tables diff --git a/extras/post-upgrade.sh b/extras/post-upgrade.sh index 5cf4e8c4bea2768b57fb94f809e328ebb316638e..1a77436e80772951ebbf4ed719a6b41185356a32 100644 --- a/extras/post-upgrade.sh +++ b/extras/post-upgrade.sh @@ -29,6 +29,12 @@ fi echo "Symlinking /var/www/mutalyzer/base to $PACKAGE_ROOT/templates/base" ln -s $PACKAGE_ROOT/templates/base /var/www/mutalyzer/base +echo "Running any needed migrations" +for MIGRATION in extras/migrations/*.migration; do + echo "Checking migration $(basename $MIGRATION)" + $MIGRATION migrate +done + echo "Restarting Apache" /etc/init.d/apache2 restart diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index 0644142461e2b922b32370536d913bdac132b652..33fad56f9672bb52adb5991b76b0c1cf3b6d99a8 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -15,8 +15,6 @@ statements. #Public classes: # - Db ; Log in to a database and keep it open for queries. # - Mapping ; Mapping of transcripts and genes. -# - Remote ; Retrieving updates for the mapping databases. -# - Update ; Updating the mapping databases. # - Cache ; Cache administration. # - Batch ; Batch checker. @@ -133,7 +131,7 @@ class Mapping(Db) : - query(statement) ; General query function. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping; Accumulated mapping info. """ def __init__(self, build, config) : @@ -150,66 +148,12 @@ class Mapping(Db) : Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) #__init__ - def get_protAcc(self, mrnaAcc) : - """ - Query the database for a protein ID given an mRNA ID. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The protein ID - @rtype: string - """ - - statement = """ - SELECT protAcc - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0][0] - #get_protAcc - - def get_NM_info(self, mrnaAcc, version = None) : - """ - Retrieve various data for an NM number. - - SQL tables from dbNames: - - map ; Accumulated mapping info. 
- - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - @arg version: version number of the accession number (not used) - @type version: integer - - @return: - - exonStarts ; List of exon start sites. - - exonEnds ; List of exon end sites. - - cdsStart ; Position of the start codon. - - cdsEnd ; Position of the end codon. - - strand ; Orientation of the gene (+ = forward, - - = reverse) - @rtype: list - """ - - statement = """ - SELECT exonStarts, exonEnds, cdsStart, cdsEnd, strand - FROM map - WHERE acc = %s; - """, mrnaAcc - - return self.query(statement)[0] - #get_NM_info - def get_NM_version(self, mrnaAcc) : """ Get the version number of an accession number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -220,8 +164,8 @@ class Mapping(Db) : statement = """ SELECT version - FROM map - WHERE acc = %s; + FROM Mapping + WHERE transcript = %s; """, mrnaAcc return [int(i[0]) for i in self.query(statement)] @@ -233,7 +177,7 @@ class Mapping(Db) : If the version number is None, use the "newest" version number. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -251,25 +195,25 @@ class Mapping(Db) : See also test_converter.test_hla_cluster and bug #58. """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc - from map + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein + from Mapping """ if version is None: q += """ - where acc = %s - order by version desc, chrom asc; + where transcript = %s + order by version desc, chromosome asc; """ statement = (q, mrnaAcc) else: q += """ - where acc = %s and + where transcript = %s and version = %s - order by chrom asc; + order by chromosome asc; """ statement = q, (mrnaAcc, version) @@ -283,7 +227,7 @@ class Mapping(Db) : should be returned, set overlap to 0. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg chrom: The chromosome (coded as "chr1", ..., "chrY") @type chrom: string @@ -295,35 +239,35 @@ class Mapping(Db) : - 0 ; Return only the transcripts that completely fall in the range [p1, p2] - 1 ; Return all hit transcripts - @type overlap: boolean + @type overlap: boolean @return: All accession numbers that are hit according to the overlap criterium @rtype: list """ q = """ - select acc, - txStart, txEnd, - cdsStart, cdsEnd, - exonStarts, exonEnds, - geneName, chrom, - strand, protAcc, + select transcript, + start, stop, + cds_start, cds_stop, + exon_starts, exon_stops, + gene, chromosome, + orientation, protein, version - from map + from Mapping """ if overlap: q += """ - WHERE chrom = %s AND - txStart <= "%s" AND - txEnd >= "%s"; + WHERE chromosome = %s AND + start <= "%s" AND + stop >= "%s"; """ statement = q, (chrom, p2, p1) else: q += """ - WHERE chrom = %s AND - txStart >= "%s" AND - txEnd <= "%s"; + WHERE chromosome = %s AND + start >= "%s" AND + stop <= "%s"; """ statement = q, (chrom, p1, p2) @@ -338,16 +282,16 @@ class Mapping(Db) : geneName ; Name of a gene. SQL tables from dbNames: - map ; Accumulated mapping info. + Mapping ; Accumulated mapping info. Returns: list ; A list of transcripts. 
""" statement = """ - SELECT acc, version - FROM map - WHERE geneName = %s; + SELECT transcript, version + FROM Mapping + WHERE gene = %s; """, geneName ret = self.query(statement) @@ -365,7 +309,7 @@ class Mapping(Db) : Get the name of a gene, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string @@ -375,9 +319,9 @@ class Mapping(Db) : """ statement = """ - SELECT geneName - FROM map - WHERE acc = %s; + SELECT gene + FROM Mapping + WHERE transcript = %s; """, mrnaAcc ret = self.query(statement) @@ -391,7 +335,7 @@ class Mapping(Db) : Check if the given name is a valid chromosome name. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg name: The name to be tested @type name: string @@ -403,8 +347,8 @@ class Mapping(Db) : statement = """ SELECT COUNT(*) - FROM map - WHERE chrom = %s; + FROM Mapping + WHERE chromosome = %s; """, name if int(self.query(statement)[0][0]) > 0 : @@ -469,7 +413,7 @@ class Mapping(Db) : Get the chromosome name, given a transcript identifier (NM number). SQL tables from dbNames: - - map ; Accumulated mapping info. + - Mapping ; Accumulated mapping info. @arg acc: The NM accession number (version NOT included) @type acc: string @@ -479,341 +423,227 @@ class Mapping(Db) : """ statement = """ - SELECT chrom - FROM map - WHERE acc = %s; + SELECT chromosome + FROM Mapping + WHERE transcript = %s; """, acc - print acc + ret = self.query(statement) if ret : return ret[0][0] return None #get_chromName -#Mapper - -class Remote(Db) : - """ - Database functions for retrieving updates for the mapping databases. - Special methods: - - __init__(config) ; Initialise the class. - - Public methods: - - get_Update() ; Retrieve new mapping info from the UCSC. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ - - def __init__(self, build, config) : + def merge_update(self): """ - Initialise the Db parent class. Use the remote database for a - certain build. - - Private variables (altered): - - __config ; Configuration variables. + Merge existing mapping information with new mapping information, which + should be in table 'MappingTemp'. - @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ - - self.__config = config - Db.__init__(self, build, config.RemoteMySQLuser, config.RemoteMySQLhost) - #__init__ - - def get_Update(self) : - """ - Retrieve all mapping updates from the UCSC within a certain time - window (defined in the configuration file) and gather the results - into one mapping table. + The strategy is as follows. Existing mappings (accumulated by + Mutalyzer in the past) that are not in the new mapping information are + added to the new mapping information. The resulting set is used as the + mapping information from now on. - The results will be written to a temporary file to be imported in - the local database with the load_Update() function. - - Return temporary filename used to store the results. - - @return: Filename used to store the results. 
- @rtype: string + This way, we get the latest updates for existing mappings and keep old + mappings not in the updated information. SQL tables from dbNames: - - gbStatus ; acc -> version mapping (NM to NM + version), - type, modDate - - refGene ; name -> geneName mapping (NM to gene name), - txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, chrom, strand. - - refLink ; mrnaAcc -> protAcc mapping (NM to NP). - """ - - statement = """ - SELECT DISTINCT acc, version, txStart, txEnd, cdsStart, cdsEnd, - exonStarts, exonEnds, name2 AS geneName, chrom, - strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc - AND time >= DATE_SUB(CURDATE(), INTERVAL %s DAY); - """, self.__config.UpdateInterval - - handle, filename = tempfile.mkstemp(text=True) - - # Convert the results to a tab delimited file. - for i in self.query(statement) : - for j in i : - os.write(handle, str(j) + chr(0x09)) # 0x09 is a TAB. - os.write(handle, '\n') - #for - - os.close(handle) - return filename - #get_Update -#Remote - -class Update(Db) : - """ - Database functions for updating the mapping databases. - - Public methods: - - load_Update() ; Load new mapping info into the local database. - - count_Updates() ; Count the number of entries in the new - mapping info table. - - backup_cdsUpdates() ; Make a backup of updates that overwrite the - old mapping info. - - count_cdsUpdates() ; Count the number of updates that overwrite - the old mapping info. - - merge_cdsUpdates() ; Merge the backup of old mapping info with the - other old info. - - merge_Update() ; Merge the new mapping info from the UCSC with - what we already have. + - Mapping ; Accumulated mapping info. + - MappingTemp ; New mapping info. + - MappingBackup ; Backup of accumulated mapping info. - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Newly found data. - - map_new ; Merge of map_temp and map. - - map_cdsBackup_temp ; Entries that were updated without an increment - of the version number. - - map_cdsBackup ; Merge of map_cdsBackup_temp and itself. - """ - - def __init__(self, build, config) : + @todo: Return number of entries added/updated. """ - Initialise the Db parent class. Use the remote database for a - certain build. + statement = """ + CREATE TABLE IF NOT EXISTS MappingTemp LIKE Mapping; + """, None + self.query(statement) - Private variables (altered): - - __config ; Configuration variables. + statement = """ + INSERT INTO MappingTemp + SELECT * FROM Mapping AS OldM + WHERE NOT EXISTS ( + SELECT * FROM MappingTemp AS NewM + WHERE OldM.transcript = NewM.transcript + AND OldM.version = NewM.version + ); + """, None + self.query(statement) - @arg build: The version of the mapping database - @type build: string - @arg config: Configuration variables - @type config: class instance - """ + statement = """ + DROP TABLE IF EXISTS MappingBackup; + """, None + self.query(statement) - self.__config = config - Db.__init__(self, build, config.LocalMySQLuser, config.LocalMySQLhost) - #__init__ + statement = """ + RENAME TABLE Mapping TO MappingBackup, MappingTemp TO Mapping; + """, None + self.query(statement) + #merge_update - def load_Update(self, filename) : + def ncbi_create_temporary_tables(self): """ - Load the updates from the temporary file created by the get_Update() - function and import it in the local database. - - @arg filename: Filename to read the update from. 
- @type filename: string - - SQL tables from dbNames (altered): - - map_temp ; Created and loaded with data from TempFile. + Create temporary tables to import NCBI mapping into. SQL tables from dbNames: - - map ; Accumulated mapping info. + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. + self.ncbi_drop_temporary_tables() statement = """ - CREATE TABLE map_temp LIKE map; + CREATE TABLE Genes ( + id varchar(20) NOT NULL DEFAULT '', + name varchar(255) DEFAULT NULL, + PRIMARY KEY (id) + ); """, None self.query(statement) - statement = """ - LOAD DATA LOCAL INFILE %s - INTO TABLE map_temp; - """, filename + statement = """ + CREATE TABLE Transcripts ( + name varchar(20) NOT NULL DEFAULT '', + gene_id varchar(20) DEFAULT NULL, + chromosome char(2) DEFAULT NULL, + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + orientation char(1) DEFAULT NULL, + PRIMARY KEY (name,start) + ); + """, None self.query(statement) - os.remove(filename) - #load_Update + statement = """ + CREATE TABLE Exons ( + transcript varchar(20) NOT NULL DEFAULT '', + start int(11) DEFAULT NULL, + stop int(11) DEFAULT NULL, + cds_start int(11) DEFAULT NULL, + cds_stop int(11) DEFAULT NULL, + protein varchar(20) DEFAULT NULL, + PRIMARY KEY (transcript,start) + ); + """, None + self.query(statement) + #ncbi_create_temporary_table - def count_Updates(self) : + def ncbi_drop_temporary_tables(self): """ - Count the number of updates. This function will only work if it - is preceeded by the load_Update() function. Otherwise the map_temp - table may not exist. This function can not be used after the - merge_Update() function has been executed, since it drops the - map_temp table. + Drop temporary tables used for importing NCBI mapping information. - @return: The number of entries in the table of updated mapping info - @rtype: integer + SQL tables from dbNames: + - Genes ; Gene names from NCBI. + - Transcripts ; Transcript mappings from NCBI. + - Exons ; Exon mappings from NCBI. """ - statement = """ - SELECT COUNT(*) - FROM map_temp; + DROP TABLE IF EXISTS Genes, Transcripts, Exons; """, None + self.query(statement) + #ncbi_drop_temporary_tables - return int(self.query(statement)[0][0]) - #count_Updates - - def backup_cdsUpdates(self) : + def ncbi_import_gene(self, id, name): """ - Copy all mapping entries where there was an update, but no - increment in the version number, to a backup table. Note that - we use acc, version, txStart as the primary key because members - of a gene family are mapped multiple times. - - SQL tables from dbNames (altered): - - map_cdsBackup_temp ; Created and filled with entries that - were updated without an increment of the - version number. + Import a (gene id, gene name) pair in a temporary table. SQL tables from dbNames: - - map ; Accumulated mapping info. - - map_temp ; Freshly downloaded mapping info. + - Genes ; Gene names from NCBI. 
""" - statement = """ - CREATE TABLE map_cdsBackup_temp - SELECT map.* - FROM map, map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - AND ( - map.cdsStart != map_temp.cdsStart - OR map.cdsEnd != map_temp.cdsEnd - ); - """, None + INSERT IGNORE INTO Genes (id, name) VALUES (%s, %s); + """, (id, name) self.query(statement) - #backup_cdsUpdates + #ncbi_import_gene - def count_cdsUpdates(self) : + def ncbi_import_transcript(self, name, gene, chromosome, start, stop, + orientation): """ - Count the number of mapping entries that have changed without an - increment in the version number. This function can only be called - after backup_cdsUpdates() has been executed and before - merge_cdsUpdates has been executed. + Import a transcript mapping in a temporary table. SQL tables from dbNames: - - map_cdsBackup_temp ; Entries that wre updated without an - increment of the version number. - - @return: The number of mapping entries that have changed without an - increment in the version number - @rtype: integer + - Transcripts ; Transcript mappings from NCBI. """ - statement = """ - SELECT COUNT(*) - FROM map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Transcripts + (name, gene_id, chromosome, start, stop, orientation) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (name, gene, chromosome, start, stop, orientation) - return int(self.query(statement)[0][0]) - #count_cdsUpdates + self.query(statement) + #ncbi_import_transcript - def merge_cdsUpdates(self) : + def ncbi_import_exon(self, transcript, start, stop, cds_start, cds_stop, + protein): """ - Merge the mapping entries that have changed without an increment in - the version number with a table that contains backups of these - entries. + Import an exon mapping in a temporary table. - SQL tables from dbNames (altered): - - map_cdsBackup ; Extended with the entries in - map_cdsBackup_temp. - - map_cdsBackup_temp ; Dropped. + SQL tables from dbNames: + - Exons ; Exon mappings from NCBI. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. - statement = """ - INSERT INTO map_cdsBackup - SELECT * - FROM map_cdsBackup_temp; - """, None - self.query(statement) - statement = """ - DROP TABLE map_cdsBackup_temp; - """, None + INSERT IGNORE INTO Exons + (transcript, start, stop, cds_start, cds_stop, protein) + VALUES + (%s, %s, %s, %s, %s, %s); + """, (transcript, start, stop, cds_start, cds_stop, protein) self.query(statement) - #merge_cdsUpdates + #ncbi_import_exon - def merge_Update(self) : + def ncbi_aggregate_mapping(self): """ - Merge the new mapping data with the old ones. + Aggregate gene, transcript and exon mapping information from the NCBI + into one table. - SQL tables from dbNames (altered): - - map_new ; Created and filled with the merge of map_temp and map. - Dropped after use. - - map_temp ; Merged with map to form map_new. Dropped after use. - - map ; Overwritten with the merged info in map_new. + @note: Default MySQL value for group_concat_max_len is 1024, meaning + that the GROUP_CONCAT aggregate function returns values of at most + 1024 bytes long. This is not enough (currently we need around 3000 + bytes), so we explicitely set this to a higher value. + @note: We use MAX(E.protein) since MySQL does not have an ANY() + aggregator. """ - - # The statements in this function may be combined when MYSQL_BUG is - # solved. 
- statement = """ - CREATE TABLE map_new - SELECT * - FROM map_temp - UNION - SELECT * - FROM map - WHERE NOT EXISTS ( - SELECT * - FROM map_temp - WHERE map.acc = map_temp.acc - AND map.version = map_temp.version - AND map.txStart = map_temp.txStart - ); + SET group_concat_max_len = 32768; """, None self.query(statement) + statement = """ - DROP TABLE map; + DROP TABLE IF EXISTS MappingTemp; """, None self.query(statement) + statement = """ - CREATE TABLE map - SELECT * - FROM map_new; + CREATE TABLE MappingTemp LIKE Mapping; """, None self.query(statement) + statement = """ - DROP TABLE map_new; + INSERT INTO MappingTemp + SELECT + G.name as gene, + SUBSTRING(T.name FROM 1 FOR LOCATE('.', T.name) - 1) as transcript, + SUBSTRING(T.name FROM LOCATE('.', T.name) + 1) as version, + CONCAT('chr', T.chromosome) as chromosome, + T.orientation as orientation, + MIN(T.start) as start, + MAX(T.stop) as stop, + MAX(E.cds_start) as cds_start, + MAX(E.cds_stop) as cds_stop, + GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts, + GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops, + MAX(E.protein) as protein, + 'NCBI' as source + FROM Transcripts as T, Genes as G, Exons as E + WHERE T.gene_id = G.id AND T.name = E.transcript + GROUP BY T.name; """, None self.query(statement) - statement = """ - DROP TABLE map_temp; - """, None + #ncbi_aggregate_mapping +#Mapping - self.query(statement) - #merge_Update -#Update class Cache(Db) : """ @@ -908,9 +738,9 @@ class Cache(Db) : def insertLRG(self, accNo, fileHash, url): """ Insert information about a LRG record in the internal database. - + See insertGB() for more information. - + @arg accNo: The name associated with this record @type accNo: string @arg fileHash: The hash of the content of the record @@ -1160,7 +990,7 @@ class Cache(Db) : @arg accNo: The accession number @type accNo: string - + @return: GI number @rtype: string """ @@ -1181,13 +1011,13 @@ class Cache(Db) : """ Gets the protein accession number for the given mRNA accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg mrnaAcc: The ID of an mRNA @type mrnaAcc: string - + @return: The protein accession number @rtype: string """ @@ -1207,13 +1037,13 @@ class Cache(Db) : def getmrnaAcc(self, protAcc) : """ Gets the mRNA accession number for a given protein accession number. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. - + @arg protAcc: The protein ID @type protAcc: string - + @return: The mRNA accession number @rtype: string """ @@ -1235,14 +1065,14 @@ class Cache(Db) : """ Inserts the given mRNA and protein accession numbers into the Link table. - + SQL tables from internalDb: - Link ; mRNA and associated protein IDs. 
- + @arg protAcc: The protein ID @type protAcc: string @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string + @type mrnaAcc: string """ statement = """ diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index d54ec427af0bbd096fbe9ad9ce05bdb3b21bafd7..626d33c5fef682062cf22510daa54a566202881f 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -25,7 +25,7 @@ from mutalyzer import variantchecker from mutalyzer.grammar import Grammar from mutalyzer.config import Config from mutalyzer.output import Output -from mutalyzer import Mapper # Mapper.Converter +from mutalyzer.mapping import Converter from mutalyzer import Retriever # Retriever.Retriever @@ -507,7 +507,7 @@ Mutalyzer batch checker.""" % url) if not skip : try : #process - converter = Mapper.Converter(build, C, O) + converter = Converter(build, C, O) #Also accept chr accNo variant = converter.correctChrVariant(variant) diff --git a/mutalyzer/config.py b/mutalyzer/config.py index 1f5b36cdfc7e65b3e82a460fcaf6e01c74a2018e..cb98f726eaf8cc761243400b8d3bb571fdeda4c9 100644 --- a/mutalyzer/config.py +++ b/mutalyzer/config.py @@ -98,9 +98,6 @@ class Config(): self.Db.dbNames = config["dbNames"] self.Db.LocalMySQLuser = config["LocalMySQLuser"] self.Db.LocalMySQLhost = config["LocalMySQLhost"] - self.Db.RemoteMySQLuser = config["RemoteMySQLuser"] - self.Db.RemoteMySQLhost = config["RemoteMySQLhost"] - self.Db.UpdateInterval = int(config["UpdateInterval"]) # Set the variables needed by the Output module. self.Output.log = config["log"] diff --git a/mutalyzer/Mapper.py b/mutalyzer/mapping.py similarity index 55% rename from mutalyzer/Mapper.py rename to mutalyzer/mapping.py index ef09e108c1080d69f09479fcc1e8fcd4ad840300..718094f1b08210f9e68be8c296e1b9cfb9898d12 100644 --- a/mutalyzer/Mapper.py +++ b/mutalyzer/mapping.py @@ -1,93 +1,47 @@ -#!/usr/bin/python - """ -Search for an NM number in the MySQL database, if the version number -matches, get the start and end positions in a variant. Translate these -positions to I{g.} notation if the variant is in I{c.} notation or vice versa. +Work with the mappings of transcripts to chromosomes. - - If no end position is present, the start position is assumed to be the end - position. - - If the version number is not found in the database, an error message is - generated and a suggestion for an other version is given. - - If the reference sequence is not found at all, an error is returned. - - If no variant is present, the transcription start and end and CDS end in - I{c.} notation is returned. - - If the variant is not accepted by the nomenclature parser, a parse error - will be printed. - -@requires: sys -@requires: Modules.Config -@requires: Modules.Db -@requires: Modules.Crossmap -@requires: Modules.Serializers.SoapMessage -@requires: Modules.Serializers.Mapping -@requires: Modules.Serializers.Transcript -@requires: Bio.Seq.reverse_complement -@requires: collections.defaultdict - -@todo: Rename Mapper to converter? +Instances of the {Converter} class convert between transcript and chromosomal +locations, using the 'Mapping' table. + +The {Updater} class is an abstract base class, subclassed by {NCBIUpdater}. +Instances of {NCBIUpdater} can load NCBI mapping information from a file and +update the database with this information. 
""" -import sys # argv -from mutalyzer.grammar import Grammar -from mutalyzer import Db # Db(), get_NM_version(), get_NM_info() -from mutalyzer import Crossmap # Crossmap(), g2x(), x2g(), main2int(), - # offset2int(), info() -from mutalyzer.models import SoapMessage, Mapping, Transcript +import sys from Bio.Seq import reverse_complement from collections import defaultdict -def _sl2il(l) : - """ - Convert a list of strings to a list of integers. +from mutalyzer.grammar import Grammar +from mutalyzer import Db +from mutalyzer import Crossmap +from mutalyzer.models import SoapMessage, Mapping, Transcript - @arg l: A list of strings - @type l: list - @returns: A list of integers - @rtype: list +class Converter(object) : """ + Convert between transcript and chromosomal locations. - return [int(s) for s in l] -#__sl2il - -def _getcoords(C, Loc, Type) : - """ - Return main, offset and g positions given either a position in - I{c.} or in I{g.} notation. - - @arg C: A crossmapper - @type C: object - @arg Loc: A location in either I{g.} or I{c.} notation - @type Loc: object - @arg Type: The reference type - @type Type: string - @returns: triple: - 0. Main coordinate in I{c.} notation - 1. Offset coordinate in I{c.} notation - 2. Position in I{g.} notation - @rtype: triple (integer, integer, integer) - """ + Search for an NM number in the MySQL database, if the version number + matches, get the start and end positions in a variant. Translate these + positions to I{g.} notation if the variant is in I{c.} notation or vice + versa. - if Type == 'c' : - main = C.main2int(Loc.MainSgn + Loc.Main) - offset = C.offset2int(Loc.OffSgn + Loc.Offset) - g = C.x2g(main, offset) - main, offset = C.g2x(g) - #if - else : - g = int(Loc.Main) - main, offset = C.g2x(g) - #else - return (main, offset, g) -#__getcoords + - If no end position is present, the start position is assumed to be the + end position. + - If the version number is not found in the database, an error message is + generated and a suggestion for an other version is given. + - If the reference sequence is not found at all, an error is returned. + - If no variant is present, the transcription start and end and CDS end in + I{c.} notation is returned. + - If the variant is not accepted by the nomenclature parser, a parse error + will be printed. -class Converter(object) : + @todo: Refactor anything using {mutalyzer.models} into the {webservice} + module. """ - Converter object docstring - """ - def __init__(self, build, C, O) : """ Initialise the class. @@ -100,7 +54,6 @@ class Converter(object) : @arg O: output object @type O: object """ - self.build = None self.__output = O self.__config = C @@ -168,7 +121,6 @@ class Converter(object) : #_parseInput def _populateFields(self, Fields) : - #TODO: ADD Error Messages, unlikely that CDS info is missing """ Create a Mutalyzer compatible exon list. 
@@ -181,22 +133,17 @@ class Converter(object) : @return: Exon list @rtype: list """ + Fields["exon_starts"] = map(int, Fields["exon_starts"].split(',')) + Fields["exon_stops"] = map(int, Fields["exon_stops"].split(',')) + assert(len(Fields["exon_starts"]) == len(Fields["exon_stops"])) - Fields["exonStarts"] =\ - _sl2il(Fields["exonStarts"].split(',')[:-1]) - Fields["exonEnds"] =\ - _sl2il(Fields["exonEnds"].split(',')[:-1]) - assert(len(Fields["exonStarts"]) == len(Fields["exonEnds"])) - - Fields["cdsStart"] = int(Fields["cdsStart"]) - Fields["cdsEnd"] = int(Fields["cdsEnd"]) - - for i in range(len(Fields["exonStarts"])) : - Fields["exonStarts"][i] += 1 + if Fields['cds_start'] and Fields['cds_stop']: + Fields["cds_start"] = int(Fields["cds_start"]) + Fields["cds_stop"] = int(Fields["cds_stop"]) # Create Mutalyzer compatible exon list Fields["exons"] = [] - for exon in zip(Fields["exonStarts"], Fields["exonEnds"]) : + for exon in zip(Fields["exon_starts"], Fields["exon_stops"]) : Fields["exons"].extend(exon) self.dbFields = Fields @@ -218,9 +165,9 @@ class Converter(object) : """ Fields = dict(zip( - ("acc", "txStart", "txEnd", "cdsStart", "cdsEnd", - "exonStarts", "exonEnds", "geneName", - "chrom", "strand", "protAcc", "version"), + ("transcript", "start", "stop", "cds_start", "cds_stop", + "exon_starts", "exon_stops", "gene", + "chromosome", "orientation", "protein", "version"), values)) return self._populateFields(Fields) #_FieldsFromValues @@ -292,15 +239,14 @@ class Converter(object) : if not self.dbFields: return None CDS = [] - if self.dbFields["cdsStart"] != self.dbFields["cdsEnd"] : - #The counting from 0 conversion. - CDS = [self.dbFields["cdsStart"] + 1, self.dbFields["cdsEnd"]] + if self.dbFields["cds_start"] and self.dbFields["cds_stop"]: + CDS = [self.dbFields["cds_start"], self.dbFields["cds_stop"]] mRNA = self.dbFields["exons"] # Convert the strand information to orientation. orientation = 1 - if self.dbFields["strand"] == '-' : + if self.dbFields["orientation"] == '-' : orientation = -1 # Build the crossmapper. @@ -308,6 +254,37 @@ class Converter(object) : return self.crossmap #makeCrossmap + @staticmethod + def _getcoords(C, Loc, Type) : + """ + Return main, offset and g positions given either a position in + I{c.} or in I{g.} notation. + + @arg C: A crossmapper + @type C: object + @arg Loc: A location in either I{g.} or I{c.} notation + @type Loc: object + @arg Type: The reference type + @type Type: string + @returns: triple: + 0. Main coordinate in I{c.} notation + 1. Offset coordinate in I{c.} notation + 2. Position in I{g.} notation + @rtype: triple (integer, integer, integer) + """ + if Type == 'c' : + main = C.main2int(Loc.MainSgn + Loc.Main) + offset = C.offset2int(Loc.OffSgn + Loc.Offset) + g = C.x2g(main, offset) + main, offset = C.g2x(g) + #if + else : + g = int(Loc.Main) + main, offset = C.g2x(g) + #else + return (main, offset, g) + #_getcoords + def _coreMapping(self) : """ Build the Mapping ClassSerializer. @@ -327,14 +304,14 @@ class Converter(object) : # Get the coordinates of the start position startmain, startoffset, start_g = \ - _getcoords(Cross, mutation.StartLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.StartLoc.PtLoc, + self.parseTree.RefType) # If there is an end position, calculate the coordinates. 
if mutation.EndLoc : endmain, endoffset, end_g = \ - _getcoords(Cross, mutation.EndLoc.PtLoc, - self.parseTree.RefType) + self._getcoords(Cross, mutation.EndLoc.PtLoc, + self.parseTree.RefType) else : end_g, endmain, endoffset = start_g, startmain, startoffset @@ -456,16 +433,16 @@ class Converter(object) : return None # construct the variant description - chromAcc = self.__database.chromAcc(self.dbFields["chrom"]) + chromAcc = self.__database.chromAcc(self.dbFields["chromosome"]) f_change = self._constructChange(False) r_change = self._constructChange(True) - if self.dbFields["strand"] == "+" : + if self.dbFields["orientation"] == "+" : change = f_change else : change = r_change if M.start_g != M.end_g : - if self.dbFields["strand"] == '+' : + if self.dbFields["orientation"] == '+' : var_in_g = "g.%s_%s%s" % (M.start_g, M.end_g, change) else : var_in_g = "g.%s_%s%s" % (M.end_g, M.start_g, change) @@ -554,9 +531,9 @@ class Converter(object) : #balen continue # construct the variant description - accNo = "%s.%s" % (self.dbFields["acc"],self.dbFields["version"]) - geneName = self.dbFields["geneName"] or "" - strand = self.dbFields["strand"] == '+' + accNo = "%s.%s" % (self.dbFields["transcript"],self.dbFields["version"]) + geneName = self.dbFields["gene"] or "" + strand = self.dbFields["orientation"] == '+' startp = self.crossmap.tuple2string((M.startmain, M.startoffset)) endp = self.crossmap.tuple2string((M.endmain, M.endoffset)) @@ -620,3 +597,254 @@ class Converter(object) : return change #_constructChange #Converter + + +class Updater(object): + """ + Abstract base class for updating the mapping information in the database. + + Subclasses should implement the {load} method, loading new mapping + information into the 'MappingTemp' table. The {merge} method merges this + table into the real 'Mapping' table. + """ + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. + @type build: string + @arg config: A configuration object. + @type config: mutalyzer.config.Config + """ + self.build = build + self.config = config + self.db = Db.Mapping(build, config.Db) + #__init__ + + def load(self): + """ + The implementation of this method in subclasses should load mapping + information in the 'MappingTemp' table. + """ + raise NotImplementedError('Implement this method in subclasses') + #load + + def merge(self): + """ + Merge the 'Mapping' and 'MappingTemp' tables. The result is stored in + the 'Mapping' table, of which a backup is created as 'MappingBackup'. + + @todo: Report how much was updated/added. + """ + self.db.merge_update() + #merge +#Updater + + +class NCBIUpdater(Updater): + """ + Update the mapping information in the database with mapping information + from the NCBI. + + Example usage: + + >>> updater = NCBIUpdater('hg19', mutalyzer.config.Config()) + >>> updater.load('/tmp/seq_gene.md', 'GRCh37.p2-Primary Assembly') + >>> updater.merge() + + """ + COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', + 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', + 'feature_name', 'feature_id', 'feature_type', 'group_label', + 'transcript', 'evidence_code'] + + def __init__(self, build, config): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. + @type build: string + @arg config: A configuration object. 
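The Updater base class introduced above is small, but it fixes the import workflow: a subclass only provides load(), while merge() always moves 'MappingTemp' into 'Mapping' and keeps a 'MappingBackup' copy. A hypothetical, deliberately empty subclass (not part of this change) illustrates the shape of that contract.

    from mutalyzer.config import Config
    from mutalyzer.mapping import Updater

    class NullUpdater(Updater):
        """
        Importer that loads nothing (illustrative only). A real importer
        would fill the 'MappingTemp' table in load(), for example through
        the Db.Mapping instance that Updater creates as self.db.
        """
        def load(self):
            pass
        #load
    #NullUpdater

    updater = NullUpdater('hg19', Config())
    updater.load()
    updater.merge()
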
+ @type config: mutalyzer.config.Config + """ + self.exon_backlog = {} + super(NCBIUpdater, self).__init__(build, config) + #__init__ + + def load(self, mapping_file, assembly): + """ + Load NCBI mapping information from {mapping_file} into the database. + + The NCBI mapping file consists of entries, one per line, in order of + their location in the genome (more specifically by start location). + Every entry has a 'group_name' column, denoting the assembly it is + from. We only use entries where this value is {assembly}. + + There are four types of entries (for our purposes): + - Gene: Name, identifier, and location of a gene. + - Transcript: Name, gene id, and location of a transcript. + - UTR: Location and transcript of a non-coding exon (or part of it). + - CDS: Location and transcript of a coding exon (or part of it). + + A bit troublesome for us is that exons are split in UTR exons and CDS + exons, with exons overlapping the UTR/CDS border defined as two + separate entries (one of type UTR and one of type CDS). + + Another minor annoyance is that some transcripts (~ 15) are split over + two contigs (NT_*). In that case, they are defined by two entries in + the file, where we should merge them by taking the start position of + the first and the stop position of the second. + + Our strategy is to loop over all entries and store them in three + temporary tables (for genes, transcripts, exons). The entries of type + UTR and CDS are merged to correct exon entries by keeping a backlog + of these entries that can still be modified before storing them in the + database. + + The values from the three temporary tables are aggregated into the + 'MappingTemp' table. + + @arg mapping_file: Path to NCBI mapping information. + @type mapping_file: string + @arg assembly: Use only entries from this assembly (this is the + 'group_name' column in the NCBI mapping file). + @type assembly: string + """ + self._create_temporary_tables() + self._import_mapping(mapping_file, assembly) + self._aggregate_mapping() + self._drop_temporary_tables() + #load + + def _import_mapping(self, mapping_file, assembly): + """ + Import mapping information from {mapping_file} into three temporary + tables. + + @note: We issue a separate INSERT statement to the database for every + entry. An alternative is to write everything to tab-separated + files and load those into the database with LOAD DATA LOCAL INFILE + statements. This alternative seems to be about twice as fast, but + for now we stick with the simpler solution. + """ + self.exon_backlog = {} + + with open(mapping_file, 'r') as mapping: + for line in mapping: + if line.startswith('#'): + continue + entry = dict(zip(self.COLUMNS, line.rstrip().split('\t'))) + + # Only use entries from the given assembly. + if entry['group_label'] != assembly: + continue + + # Only use entries on the normal chromosomes. + try: + int(entry['chromosome']) + except ValueError: + if entry['chromosome'] not in 'XY': + continue + + if entry['feature_type'] == 'GENE': + self._import_gene(entry) + elif entry['feature_type'] == 'RNA': + self._import_transcript(entry) + elif entry['feature_type'] in ('UTR', 'CDS'): + self._import_exon(entry) + + self._import_exon_backlog() + #_import_mapping + + def _import_gene(self, entry): + """ + Insert a gene in the database. + """ + self.db.ncbi_import_gene(entry['feature_id'], entry['feature_name']) + #_import_gene + + def _import_transcript(self, entry): + """ + Insert a transcript in the database. 
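To make the seq_gene.md handling above concrete, this is the parsing step of _import_mapping applied to a single, made-up line with the fifteen columns from NCBIUpdater.COLUMNS; only entries from the requested assembly and the ordinary chromosomes (1-22, X, Y) survive the filtering.

    COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    # A made-up entry of type RNA on chromosome 1 of the GRCh37 assembly.
    line = ('9606\t1\t10001\t15000\t+\tNT_000001.1\t1\t5000\t+\t'
            'NM_000001.1\tGeneID:1\tRNA\tGRCh37.p2-Primary Assembly\t'
            'NM_000001.1\t-\n')

    entry = dict(zip(COLUMNS, line.rstrip().split('\t')))

    assembly = 'GRCh37.p2-Primary Assembly'
    if entry['group_label'] == assembly:
        # feature_type is 'RNA', so this entry describes a transcript and
        # would be handed to _import_transcript().
        print entry['feature_name'], entry['chromosome'], entry['orientation']
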
+ """ + self.db.ncbi_import_transcript( + entry['feature_name'], entry['feature_id'], entry['chromosome'], + int(entry['start']), int(entry['stop']), entry['orientation']) + #_import_transcript + + def _import_exon(self, entry): + """ + Instead of directly inserting each exon in the database, we keep them + in a backlog of at most one exon per transcript. Exons are taken from + the backlog and inserted in the database only when we passed their + genomic stop location by more than one position. + + This way, exons in the backlog can be merged when they are on a + UTR/CDS boundary. + """ + cds = entry['feature_type'] == 'CDS' + entry['start'] = int(entry['start']) + entry['stop'] = int(entry['stop']) + entry['protein'] = entry['feature_name'] if cds else None + entry['cds'] = cds + + self._import_exon_backlog(entry['start'] - 1) + + try: + previous = self.exon_backlog[entry['transcript']] + if previous['cds'] != entry['cds'] \ + and previous['stop'] == entry['start'] - 1: + if entry['cds']: + entry['cds_start'] = entry['start'] + else: + entry['cds_stop'] = previous['stop'] + if 'cds_start' in previous: + entry['cds_start'] = previous['cds_start'] + entry['protein'] = previous['protein'] + entry['start'] = previous['start'] + except KeyError: + pass + + self.exon_backlog[entry['transcript']] = entry + #_import_exon + + def _import_exon_backlog(self, up_to_position=None): + """ + Import exons from the backlog in the database. If the optional + argument {up_to_position} is set, only import exons with a stop + position before this value. + + We explicitely remove imported exons from the backlog, because it + might be suboptimal to keep more than 30,000 exons in there. + """ + for transcript, exon in self.exon_backlog.items(): + if not up_to_position or exon['stop'] < up_to_position: + del self.exon_backlog[transcript] + del exon['cds'] + self.db.ncbi_import_exon( + exon['transcript'], exon['start'], exon['stop'], + exon['cds_start'] if 'cds_start' in exon else None, + exon['cds_stop'] if 'cds_stop' in exon else None, + exon['protein'] or None) + #_import_exon_backlog + + def _aggregate_mapping(self): + """ + Aggregate the genes, transcripts and exons from their temporary + tables into the 'MappingTemp' table. + """ + self.db.ncbi_aggregate_mapping() + #_aggregate_mapping + + def _create_temporary_tables(self): + """ + Create temporary tables needed for loading the NCBI mapping data. + """ + self.db.ncbi_create_temporary_tables() + #_create_temporary_tables + + def _drop_temporary_tables(self): + """ + Drop temporary tables needed for loading the NCBI mapping data. + """ + self.db.ncbi_drop_temporary_tables() + #_drop_temporary_tables +#NCBIUpdater diff --git a/mutalyzer/util.py b/mutalyzer/util.py index dd65e0d5f3935493ce2d4a59fbbb806b4f09a2c1..a247a16e9e2005946c16fb2fc290e2a8d716593d 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -19,9 +19,11 @@ General utility functions. """ +import sys import os import math import time +import inspect from itertools import izip_longest import Bio.Seq @@ -722,6 +724,35 @@ def message_info(message): #message_info +def format_usage(usage=None, keywords={}): + """ + Format a usage string suitable for printing to the console. Some magic + is employed so you can usually just call this function without arguments + to have the calling module's docstring pretty-printed. + + @kwarg usage: The string to format. If omitted, the calling module's + docstring is used. 
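Looking back at the exon backlog in mapping.py above, the merge that _import_exon performs when a CDS entry directly follows an adjacent UTR entry of the same transcript can be played through by hand; the positions are made up and only that one branch is shown.

    utr = {'transcript': 'NM_000001.1', 'start': 100, 'stop': 199,
           'cds': False, 'protein': None}
    cds = {'transcript': 'NM_000001.1', 'start': 200, 'stop': 400,
           'cds': True, 'protein': 'NP_000001.1'}

    exon_backlog = {utr['transcript']: utr}

    previous = exon_backlog[cds['transcript']]
    if previous['cds'] != cds['cds'] and previous['stop'] == cds['start'] - 1:
        if cds['cds']:
            cds['cds_start'] = cds['start']  # the CDS begins in this exon
        cds['start'] = previous['start']     # extend back to the UTR start
    exon_backlog[cds['transcript']] = cds

    # The backlog now holds one exon for this transcript:
    #   start=100, stop=400, cds_start=200, protein='NP_000001.1'

The mirror case, a UTR entry following a CDS entry, additionally carries over cds_start and the protein accession from the backlog entry, as in the patch above.
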
+ @type usage: string + @kwarg keywords: A dictionary of (keyword, value) pairs used to format + the usage string. If it does not contain the key 'command', it is + added with the value of sys.argv[0]. + @type keywords: dictionary(string, string) + + @return: Formatted usage string. This is {usage} with any entries from + {keywords} replaced and cut-off at the first occurence of two + consecutive empty lines. + @rtype: string + """ + if not usage: + caller = inspect.stack()[1] + usage = inspect.getmodule(caller[0]).__doc__ + if not 'command' in keywords: + keywords['command'] = sys.argv[0] + + return usage.split('\n\n\n')[0].strip().format(**keywords) +#format_usage + + def slow(f): """ Decorator for slow tests. This makes them to pass immediately, without diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py index 181c4af9e5c383fac302e696348b22436f9f61d5..b31c47c5e3b9dad82bf044e174e2338873464f4b 100644 --- a/mutalyzer/webservice.py +++ b/mutalyzer/webservice.py @@ -38,7 +38,7 @@ from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync from mutalyzer import variantchecker from mutalyzer import Db -from mutalyzer import Mapper +from mutalyzer.mapping import Converter from mutalyzer import Retriever from mutalyzer import GenRecord from mutalyzer.models import * @@ -334,7 +334,7 @@ class MutalyzerService(DefinitionBase): "Reveived request mappingInfo(%s %s %s %s)" % ( LOVD_ver, build, accNo, variant)) - conv = Mapper.Converter(build, self._config, L) + conv = Converter(build, self._config, L) result = conv.mainMapping(accNo, variant) L.addMessage(__file__, -1, "INFO", @@ -372,7 +372,7 @@ class MutalyzerService(DefinitionBase): "Received request transcriptInfo(%s %s %s)" % (LOVD_ver, build, accNo)) - converter = Mapper.Converter(build, self._config, O) + converter = Converter(build, self._config, O) T = converter.mainTranscript(accNo) O.addMessage(__file__, -1, "INFO", @@ -497,7 +497,7 @@ class MutalyzerService(DefinitionBase): O.addMessage(__file__, -1, "INFO", "Received request cTogConversion(%s %s)" % ( build, variant)) - converter = Mapper.Converter(build, self._config, O) + converter = Converter(build, self._config, O) variant = converter.correctChrVariant(variant) if "c." in variant : diff --git a/mutalyzer/website.py b/mutalyzer/website.py index faa358539f450ee43114a6200c2e854ddd50e03b..1fddc040e095428a61344d9bbc00742370842180 100644 --- a/mutalyzer/website.py +++ b/mutalyzer/website.py @@ -35,7 +35,7 @@ from mutalyzer.grammar import Grammar from mutalyzer import webservice from mutalyzer import variantchecker from mutalyzer.output import Output -from mutalyzer import Mapper +from mutalyzer.mapping import Converter from mutalyzer import Db from mutalyzer import Scheduler from mutalyzer import Retriever @@ -300,7 +300,7 @@ class GetGS: # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. # We are only interested in the legend #Mutalyzer.process(str(i.mutationName), config, O) @@ -443,7 +443,7 @@ class PositionConverter: i = web.input(build='', variant='') # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. 
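Returning to the new format_usage() helper in mutalyzer/util.py above: the command line scripts call it without arguments so that their own module docstring is picked up, but passing an explicit usage string shows the substitution and truncation behaviour. The usage text below is made up for the example.

    from mutalyzer.util import format_usage

    usage = """
    Frobnicate a variant description.

    Usage:
      {command} variant

      variant: The variant description to frobnicate.


    Everything after two consecutive empty lines, like this paragraph, is
    cut off from the output.
    """

    # '{command}' is replaced by sys.argv[0] and only the part before the
    # first pair of consecutive empty lines is kept.
    print format_usage(usage)
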
return self.position_converter(i.build, str(i.variant)) def position_converter(self, build='', variant=''): @@ -472,7 +472,7 @@ class PositionConverter: } if build and variant: - converter = Mapper.Converter(build, config, output) + converter = Converter(build, config, output) #Convert chr accNo to NC number variant = converter.correctChrVariant(variant) @@ -564,16 +564,16 @@ class VariantInfo: 'Received %s:%s (LOVD_ver %s, build %s)' \ % (acc, var, LOVD_ver, build)) - Converter = Mapper.Converter(build, config, output) + converter = Converter(build, config, output) result = '' # If no variant is given, return transcription start, transcription # end and CDS stop in c. notation. if var: - ret = Converter.mainMapping(acc, var) + ret = converter.mainMapping(acc, var) else: - ret = Converter.giveInfo(acc) + ret = converter.giveInfo(acc) if ret: result = '%i\n%i\n%i' % ret @@ -662,7 +662,7 @@ class Check: output.addMessage(__file__, -1, 'INFO', 'Received variant %s' % name) # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes - # Bio.Seq.reverse_complement in Mapper.py:607. + # Bio.Seq.reverse_complement in mapping.py:607. variantchecker.check_variant(str(name), config, output) output.addMessage(__file__, -1, 'INFO', 'Finished processing variant %s' % name) diff --git a/setup.py b/setup.py index 62ea18aa7a268f5f918d10428bbbd18db387abfe..2899ae3d0673eb0b2cfae85305df8e294270a744 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup( scripts=['bin/mutalyzer', 'bin/mutalyzer-batchd', 'bin/mutalyzer-cache-sync', - 'bin/mutalyzer-ucsc-update', + 'bin/mutalyzer-mapping-update', 'bin/mutalyzer-webservice.wsgi', 'bin/mutalyzer-website.wsgi'], zip_safe=False diff --git a/tests/test_converter.py b/tests/test_mapping.py similarity index 92% rename from tests/test_converter.py rename to tests/test_mapping.py index 17d4c5c575add0080a577e3ad9a4b2cd68cabf48..036b46f6c7aadc2ca26629cc123fa7b01c9a4869 100644 --- a/tests/test_converter.py +++ b/tests/test_mapping.py @@ -1,5 +1,5 @@ """ -Tests for the converter (Mapper) module. +Tests for the mapping module. """ @@ -8,12 +8,12 @@ from nose.tools import * from mutalyzer.config import Config from mutalyzer.output import Output -from mutalyzer.Mapper import Converter +from mutalyzer.mapping import Converter class TestConverter(): """ - Test the converter (Mapper) module. + Test the Converter class. """ def setUp(self): """