diff --git a/bin/mutalyzer-mapping-add-gene b/bin/mutalyzer-mapping-add-gene deleted file mode 100755 index 7f190f20d53fc1479132b2df495f266a9975a07e..0000000000000000000000000000000000000000 --- a/bin/mutalyzer-mapping-add-gene +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -""" -Update the database with mapping information for the given gene from the UCSC. - -Usage: - {command} database gene - - database: Database to update (i.e. 'hg18' or 'hg19'). - gene: Path to the NCBI mapping information. - - -This program is intended to be run manually whenever transcript mappings for a -specific gene are required that are not yet in our database (i.e. they are not -yet available from the NCBI). It will not overwrite transcript/version entries -that are already in our database. -""" - - -import sys - -from mutalyzer.output import Output -from mutalyzer.mapping import UCSCUpdater -from mutalyzer.util import format_usage - - -def main(database, gene): - """ - Update the database with information from the UCSC. - - @arg database: Database to update (i.e. 'hg18' or 'hg19'). - @type database: string - @arg gene: Gene name to get transcript mapping info for. - @type gene: string - - @todo: Also report how much was added/updated. - """ - output = Output(__file__) - output.addMessage(__file__, -1, 'INFO', - 'Starting UCSC mapping data update (gene: %s)' % gene) - - updater = UCSCUpdater(database) - updater.load(gene) - updater.merge() - - output.addMessage(__file__, -1, 'INFO', - 'NCBI mapping data update end (gene: %s)' % gene) - - -if __name__ == '__main__': - if len(sys.argv) != 3: - print format_usage() - sys.exit(1) - main(*sys.argv[1:]) diff --git a/bin/mutalyzer-mapping-import b/bin/mutalyzer-mapping-import new file mode 100755 index 0000000000000000000000000000000000000000..e680dc10484d10cf23f154d5a94a35fcb3b6b293 --- /dev/null +++ b/bin/mutalyzer-mapping-import @@ -0,0 +1,89 @@ +#!/usr/bin/env python +""" +Update the database with mapping information for the given gene or genomic +reference. + +Usage: + {command} source database gene|reference + + source: Import source, 'gene' for importing a specific gene from the UCSC + or 'reference' for importing from genomic reference. + database: Database to update (i.e. 'hg18' or 'hg19'). + gene: Name of gene to import all transcripts mappings for from the UCSC + database (e.g. 'TTN'). + reference: Genomic reference to import all genes from (e.g. 'NC_012920.1'). + Not that currently no exon locations are supported, this has only + been tested on mtDNA. + + +This program is intended to be run manually whenever transcript mappings for +specific genes are required that are not yet in our database (i.e. they are +not yet available from the NCBI, or they are mtDNA genes). It will not +overwrite transcript/version entries that are already in our database. +""" + + +import sys + +from mutalyzer.output import Output +from mutalyzer.mapping import UCSCUpdater, ReferenceUpdater +from mutalyzer.util import format_usage + + +def import_gene(database, gene): + """ + Update the database with information from the UCSC. + + @arg database: Database to update (i.e. 'hg18' or 'hg19'). + @type database: string + @arg gene: Gene name to get transcript mapping info for. + @type gene: string + + @todo: Also report how much was added/updated. + """ + output = Output(__file__) + output.addMessage(__file__, -1, 'INFO', + 'Starting UCSC mapping data update (gene: %s)' % gene) + + updater = UCSCUpdater(database) + updater.load(gene) + updater.merge() + + output.addMessage(__file__, -1, 'INFO', + 'UCSC mapping data update end (gene: %s)' % gene) + + +def import_reference(database, reference): + """ + Update the database with information from the given reference. + + @arg database: Database to update (i.e. 'hg18' or 'hg19'). + @type database: string + @arg reference: Reference to get gene mappings from. + @type reference: string + + @todo: Also report how much was added/updated. + """ + output = Output(__file__) + output.addMessage(__file__, -1, 'INFO', + 'Starting reference mapping data update (reference: %s)' % reference) + + updater = ReferenceUpdater(database) + updater.load(reference, output) + updater.merge() + + output.addMessage(__file__, -1, 'INFO', + 'Reference mapping data update end (reference: %s)' % reference) + + +if __name__ == '__main__': + if len(sys.argv) != 4: + print format_usage() + sys.exit(1) + if sys.argv[1] == 'gene': + import_gene(*sys.argv[2:]) + elif sys.argv[1] == 'reference': + import_reference(*sys.argv[2:]) + else: + print format_usage() + sys.exit(1) diff --git a/extras/migrations/009-db-mapping-add-selector.migration b/extras/migrations/009-db-mapping-add-selector.migration new file mode 100755 index 0000000000000000000000000000000000000000..23e311b06d2917ffe5bc5a5b1302e9a53feb319c --- /dev/null +++ b/extras/migrations/009-db-mapping-add-selector.migration @@ -0,0 +1,62 @@ +#!/usr/bin/env python +""" +Add columns 'selector' and 'selector_version' to the 'Mapping' tables. + +Usage: + ./009-db-mapping-add-selector.migration [migrate] +""" + + +import migration + + +def check(): + """ + Check if migration is needed. + """ + connection = migration.db_connect('hg18') + cursor = connection.cursor() + cursor.execute('SHOW COLUMNS FROM Mapping WHERE field = "selector";') + has_column_hg18 = len(cursor.fetchall()) > 0 + connection.close() + + connection = migration.db_connect('hg19') + cursor = connection.cursor() + cursor.execute('SHOW COLUMNS FROM Mapping WHERE field = "selector";') + has_column_hg19 = len(cursor.fetchall()) > 0 + connection.close() + + if has_column_hg19 != has_column_hg19: + migration.fatal('Installation is not in a recognizable state. Fix manually.') + return not has_column_hg19 + + +def migrate(): + """ + Perform migration. + """ + connection = migration.db_connect('hg18') + cursor = connection.cursor() + cursor.execute(""" + ALTER TABLE Mapping + ADD COLUMN selector varchar(255) DEFAULT NULL AFTER version, + ADD COLUMN selector_version unsigned DEFAULT NULL AFTER selector;""") + connection.commit() + connection.close() + migration.info('Added column hg18.Mapping.selector') + migration.info('Added column hg18.Mapping.selector_version') + + connection = migration.db_connect('hg19') + cursor = connection.cursor() + cursor.execute(""" + ALTER TABLE Mapping + ADD COLUMN selector varchar(255) DEFAULT NULL AFTER version, + ADD COLUMN selector_version unsigned DEFAULT NULL AFTER selector;""") + connection.commit() + connection.close() + migration.info('Added column hg19.Mapping.selector') + migration.info('Added column hg19.Mapping.selector_version') + + +if __name__ == '__main__': + migration.main(check, migrate) diff --git a/extras/post-install.sh b/extras/post-install.sh index 304bd5e3f1ef0bf80791233afa85495b5fdf5afc..4e173c485bd8b842aeae3860936ebd859d94b458 100644 --- a/extras/post-install.sh +++ b/extras/post-install.sh @@ -104,6 +104,8 @@ CREATE TABLE Mapping ( gene varchar(255) DEFAULT NULL, transcript varchar(20) NOT NULL DEFAULT '', version smallint(6) DEFAULT NULL, + selector varchar(255) DEFAULT NULL, + selector_version unsigned DEFAULT NULL, chromosome varchar(40) DEFAULT NULL, orientation char(1) DEFAULT NULL, start int(11) unsigned DEFAULT NULL, @@ -168,6 +170,8 @@ CREATE TABLE Mapping ( gene varchar(255) DEFAULT NULL, transcript varchar(20) NOT NULL DEFAULT '', version smallint(6) DEFAULT NULL, + selector varchar(255) DEFAULT NULL, + selector_version unsigned DEFAULT NULL, chromosome varchar(40) DEFAULT NULL, orientation char(1) DEFAULT NULL, start int(11) unsigned DEFAULT NULL, diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index 4894cdd86741c12843bcfbccc441829d669b3879..ae295b3cf0601854323bafc6ed7bd609019c797a 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -235,11 +235,13 @@ class Mapping(Db) : return [int(i[0]) for i in self.query(statement)] #get_NM_version - def getAllFields(self, mrnaAcc, version): + def getAllFields(self, mrnaAcc, version=None, selector=None, selector_version=None): """ Get all Fields of an accession number and version number. If the version number is None, use the "newest" version number. + Optionally also provide a gene and transcript version to select. + SQL tables from dbNames: - Mapping ; Accumulated mapping info. @@ -260,6 +262,7 @@ class Mapping(Db) : """ q = """ select transcript, + selector, selector_version, start, stop, cds_start, cds_stop, exon_starts, exon_stops, @@ -267,20 +270,36 @@ class Mapping(Db) : orientation, protein from Mapping """ + + where = [] + order = [] + args = [] + if version is None: - q += """ - where transcript = %s - order by version desc, chromosome asc; - """ - statement = (q, mrnaAcc) + where.append('transcript = %s') + order.append('version desc') + order.append('chromosome asc') + args.append(mrnaAcc) else: - q += """ - where transcript = %s and - version = %s - order by chromosome asc; - """ - statement = q, (mrnaAcc, version) + where.append('transcript = %s') + where.append('version = %s') + order.append('chromosome asc') + args.append(mrnaAcc) + args.append(version) + + if selector is not None: + where.append('selector = %s') + args.append(selector) + + if selector_version is not None: + where.append('selector_version = %s') + args.append(selector_version) + + q += """ + where """ + ' AND '.join(where) + """ + order by """ + ', '.join(order) + ';' + statement = q, tuple(args) return self.query(statement)[0] def get_Transcripts(self, chrom, p1, p2, overlap) : @@ -311,6 +330,7 @@ class Mapping(Db) : """ q = """ select transcript, + selector, selector_version, start, stop, cds_start, cds_stop, exon_starts, exon_stops, @@ -353,6 +373,7 @@ class Mapping(Db) : """ statement = """ SELECT transcript, + selector, selector_version, start, stop, cds_start, cds_stop, exon_starts, exon_stops, @@ -727,8 +748,8 @@ class Mapping(Db) : """ Load given transcripts into the 'MappingTemp' table. - Todo: Don't overwrite NCBI entries, but *do* overwrite existing UCSC - entries. + Todo: Don't overwrite NCBI/reference entries, but *do* overwrite + existing UCSC entries. """ statement = """ DROP TABLE IF EXISTS MappingTemp; @@ -762,6 +783,46 @@ class Mapping(Db) : 'UCSC', transcript, version, int(overwrite) + 1) self.query(statement) #ucsc_load_mapping + + def reference_load_mapping(self, transcripts, overwrite=False): + """ + Load given transcripts into the 'MappingTemp' table. + + Todo: Don't overwrite NCBI/UCSC entries, but *do* overwrite existing + reference entries. + """ + statement = """ + DROP TABLE IF EXISTS MappingTemp; + """, None + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.query(statement) + + statement = """ + CREATE TABLE MappingTemp LIKE Mapping; + """, None + self.query(statement) + + for t in transcripts: + exon_starts = ','.join(str(i) for i in t['exon_starts']) + exon_stops = ','.join(str(i) for i in t['exon_stops']) + statement = """ + INSERT INTO `MappingTemp` + (`gene`, `transcript`, `version`, `selector`, `selector_version`, + `chromosome`, `orientation`, `start`, `stop`, `cds_start`, `cds_stop`, + `exon_starts`, `exon_stops`, `protein`, `source`) + SELECT + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'reference' + FROM DUAL WHERE NOT EXISTS + (SELECT * FROM `Mapping` + WHERE `transcript` = %s AND `version` = %s AND 1 = %s); + """, (t['gene'], t['transcript'], t['version'], t['selector'], + t['selector_version'], t['chromosome'], t['orientation'], + t['start'], t['stop'], t['cds_start'], t['cds_stop'], + exon_starts, exon_stops, + t['protein'], t['transcript'], t['version'], int(overwrite) + 1) + self.query(statement) + #reference_load_mapping #Mapping diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index df410bd753757e5df0f6f2e5109d8139733a6504..38c0d48e0fdc4baa45b2af78b1a88a4094c74ab7 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -12,10 +12,12 @@ update the database with this information. from Bio.Seq import reverse_complement from collections import defaultdict +from operator import attrgetter from mutalyzer.grammar import Grammar from mutalyzer import Db from mutalyzer import Crossmap +from mutalyzer import Retriever from mutalyzer.models import SoapMessage, Mapping, Transcript @@ -174,14 +176,14 @@ class Converter(object) : """ Fields = dict(zip( - ("transcript", "start", "stop", "cds_start", "cds_stop", - "exon_starts", "exon_stops", "gene", + ("transcript", "selector", "selector_version", "start", "stop", + "cds_start", "cds_stop", "exon_starts", "exon_stops", "gene", "chromosome", "orientation", "protein", "version"), values)) return self._populateFields(Fields) #_FieldsFromValues - def _FieldsFromDb(self, acc, version) : + def _FieldsFromDb(self, acc, version, selector=None, selector_version=None) : """ Get data from database and populate dbFields dict. @@ -189,6 +191,10 @@ class Converter(object) : @type acc: string @arg version: version number @type version: integer + @kwarg selector: Optional gene symbol selector. + @type selector: str + @kwarg selector_version: Optional transcript version selector. + @type selector_version: int """ if not version : @@ -205,7 +211,7 @@ class Converter(object) : #if else : if version in versions : - Values = self.__database.getAllFields(acc, version) + Values = self.__database.getAllFields(acc, version, selector, selector_version) return self._FieldsFromValues(Values) #if if not version : @@ -502,7 +508,12 @@ class Converter(object) : if self._parseInput(variant): acc = self.parseTree.RefSeqAcc version = self.parseTree.Version - self._FieldsFromDb(acc, version) + if self.parseTree.Gene: + selector = self.parseTree.Gene.GeneSymbol + selector_version = int(self.parseTree.Gene.TransVar) + else: + selector = selector_version = None + self._FieldsFromDb(acc, version, selector, selector_version) mappings = self._coreMapping() if not mappings: @@ -693,15 +704,29 @@ class Converter(object) : continue # construct the variant description accNo = "%s.%s" % (self.dbFields["transcript"],self.dbFields["version"]) + if self.dbFields['selector'] and self.dbFields['selector_version']: + selector = '(%s_v%.3i)' % (self.dbFields['selector'], self.dbFields['selector_version']) + elif self.dbFields['selector']: + selector = '(%s)' % self.dbFields['selector'] + else: + selector = '' geneName = self.dbFields["gene"] or "" strand = self.dbFields["orientation"] == '+' - #Check if n or c type - info = self.crossmap.info() - if info[0] == 1 and info[1] == info[2] : - mtype = 'n' - else : + # Check if n or c type + # Note: Originally, the below check using crossmap.info() was + # used (commented out now), but I do not understand this + # logic. Also, it breaks n. notation on non-coding mtDNA + # transcripts, so I replaced it with a simple .CDS check. + #info = self.crossmap.info() + #if info[0] == 1 and info[1] == info[2] : + # mtype = 'n' + #else : + # mtype = 'c' + if self.crossmap.CDS: mtype = 'c' + else: + mtype = 'n' mutations = [] for variant, mapping in zip(variants, mappings): @@ -729,7 +754,7 @@ class Converter(object) : else: mutation = '[' + ';'.join(mutations) + ']' - description = "%s:%c.%s" % (accNo, mtype, mutation) + description = "%s%s:%c.%s" % (accNo, selector, mtype, mutation) HGVS_notatations[geneName].append(description) NM_list.append(description) #for @@ -1023,6 +1048,8 @@ class UCSCUpdater(Updater): The resulting transcript mappings are inserted into the 'MappingTemp' table. + @arg gene: Gene symbol to load mappings for. + @type gene: str @arg overwrite: Include already known transcript/version entries (default: False). @type overwrite: bool @@ -1030,3 +1057,92 @@ class UCSCUpdater(Updater): transcripts = self.ucsc.transcripts_by_gene(gene) self.db.ucsc_load_mapping(transcripts, overwrite=overwrite) #load +#UCSCUpdater + + +class ReferenceUpdater(Updater): + """ + Update the mapping information in the database with mapping information + from a genomic reference. + + For now, we don't handle exon locations (this is only used for mtDNA + genomic references). By default, we don't overwrite any existing + transcript/version entries. This is because we prefer the NCBI mappings + but want to be able to manually load info for specific genes that is not + provided by the NCBI (yet). + + Example usage: + + >>> updater = ReferenceUpdater('hg19') + >>> updater.load('NC_012920.1') + >>> updater.merge() + + """ + def __init__(self, build): + """ + @arg build: Human genome build (or database name), i.e. 'hg18' or + 'hg19'. + @type build: string + """ + super(ReferenceUpdater, self).__init__(build) + #__init__ + + def load(self, reference, output, overwrite=False): + """ + Load genomic reference mapping information into the database. By + default, don't load transcript/version entries we already have. + + The resulting transcript mappings are inserted into the + 'MappingTemp' table. + + @arg reference: Reference to load mappings from. + @type reference: str + @arg output: An output object. + @type output: mutalyzer.output.Output + @arg overwrite: Include already known transcript/version entries + (default: False). + @type overwrite: bool + """ + cache = Db.Cache() + retriever = Retriever.GenBankRetriever(output, cache) + record = retriever.loadrecord(reference) + + transcripts = [] + + for gene in record.geneList: + # We support exactly one transcript per gene. + try: + transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0] + except IndexError: + continue + + # We use gene.location for now, it is always present and the same + # for our purposes. + #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1] + start, stop = gene.location + + orientation = '-' if gene.orientation == -1 else '+' + + try: + cds_start, cds_stop = transcript.CDS.location + except AttributeError: + cds_start = cds_stop = None + + transcripts.append({'gene': gene.name, + 'transcript': record.source_accession, + 'version': int(record.source_version), + 'selector': gene.name, + 'selector_version': int(transcript.name), + 'chromosome': 'chrM', + 'orientation': orientation, + 'start': start, + 'stop': stop, + 'cds_start': cds_start, + 'cds_stop': cds_stop, + 'exon_starts': [start], + 'exon_stops': [stop], + 'protein': transcript.proteinID}) + + self.db.reference_load_mapping(transcripts, overwrite=overwrite) + #load +#ReferenceUpdater diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index daf094495f437e2afa9195f8452910107acdf032..48b3ee1896214608519c037e7a3c3780c241e5d7 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -308,7 +308,7 @@ class MutalyzerService(ServiceBase): if ret : l = [] for i in ret : - l.append(i[0] + '.' + str(i[11])) + l.append(i[0] + '.' + str(i[13])) return l return [] @@ -402,9 +402,10 @@ class MutalyzerService(ServiceBase): for transcript in database.get_Transcripts(chrom, pos1, pos2, method): t = TranscriptMappingInfo() - d = dict(zip(('transcript', 'start', 'stop', 'cds_start', - 'cds_stop', 'exon_starts', 'exon_stops', 'gene', 'chromosome', - 'orientation', 'protein', 'version'), transcript)) + d = dict(zip(('transcript', 'selector', 'selector_version', + 'start', 'stop', 'cds_start', 'cds_stop', 'exon_starts', + 'exon_stops', 'gene', 'chromosome', 'orientation', 'protein', + 'version'), transcript)) if d['orientation'] == '-': d['start'], d['stop'] = d['stop'], d['start'] d['cds_start'], d['cds_stop'] = d['cds_stop'], d['cds_start'] diff --git a/setup.py b/setup.py index c2b72d457ec6f7c0ae683f35ce5d538e8bf699fb..a50feae94b212ba066a4f2da9c9365109837b341 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( 'bin/mutalyzer-batchd', 'bin/mutalyzer-cache-sync', 'bin/mutalyzer-mapping-update', - 'bin/mutalyzer-mapping-add-gene', + 'bin/mutalyzer-mapping-import', 'bin/mutalyzer-soap-service.wsgi', 'bin/mutalyzer-json-service.wsgi', 'bin/mutalyzer-website.wsgi'], diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index 876a1ded11baf34a521292c91a490a2d33e075dc..c610645a2db04bdffd2ae7cf2427f3d784c8d1be 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -154,7 +154,7 @@ class TestServicesSoap(): assert_equal(type(r.string), list) # Fix for r536: disable the -u and +d convention. #assert 'XM_001715131.2:c.1155+d19483A>G' in r.string - assert 'XM_001715131.2:n.*19483A>G' in r.string + assert 'XM_001715131.2:c.*19483A>G' in r.string def test_gettranscriptsbygenename_valid(self): """