From e9bf1bc9d9fb10c02c25fee37d7e8dc64812e949 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Sun, 5 Jan 2014 23:10:36 +0100 Subject: [PATCH] Port Mapping database module to SQLAlchemy This introduces a proper notion of genome assemblies. Transcript mappings for alle genome assemblies are in the same database, which is better for maintenance. Updating transcript mappings is also simplified a lot, especially from NCBI mapview files where we now require a preprocessing sort on the input file. Overall, this port touches a lot of Mutalyzer code, so beware. --- mutalyzer/Db.py | 665 ------------------ mutalyzer/Scheduler.py | 6 +- mutalyzer/config/default_settings.py | 9 +- mutalyzer/db/models.py | 405 ++++++++++- mutalyzer/entrypoints/mapping_import.py | 106 ++- mutalyzer/entrypoints/mapping_update.py | 172 ++++- mutalyzer/mapping.py | 726 ++++---------------- mutalyzer/models.py | 1 - mutalyzer/services/rpc.py | 365 +++++----- mutalyzer/templates/batch_jobs.html | 6 +- mutalyzer/templates/position_converter.html | 17 +- mutalyzer/templates/reference_loader.html | 7 +- mutalyzer/variantchecker.py | 21 +- mutalyzer/website.py | 228 +++--- tests/test_mapping.py | 4 +- tests/test_website.py | 4 +- tests/utils.py | 2 +- 17 files changed, 1110 insertions(+), 1634 deletions(-) diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index 04b01dd8..3c45ac47 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -130,671 +130,6 @@ class Db(): #Db -class Mapping(Db) : - """ - Database functions for mapping of transcripts and genes. - - Special methods: - - __init__(build) ; Initialise the class. - - Public methods: - - get_protAcc(mrnaAcc) ; Query the database for a protein ID. - - get_NM_info(mrnaAcc) ; Retrieve various data for an NM number. - - get_NM_version(mrnaAcc) ; Get the version number of an accession - number. - - get_Transcripts(chrom, p1, p2, overlap) ; Get a list of transcripts, - given a chromosome and a range. - - get_GeneName(mrnaAcc) ; Get the gene name, given an NM number. - - isChrom(name) ; Check whether we know this name to be - a chromosome name. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from dbNames: - - Mapping; Accumulated mapping info. - """ - - def __init__(self, build) : - """ - Initialise the Db parent class. Use the local database for a certain - build. - - @arg build: The version of the mapping database - @type build: string - """ - Db.__init__(self, build, settings.MYSQL_USER, settings.MYSQL_HOST) - #__init__ - - def get_NM_version(self, mrnaAcc) : - """ - Get the version number of an accession number. - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The version number - @rtype: integer - """ - - statement = """ - SELECT version - FROM Mapping - WHERE transcript = %s; - """, mrnaAcc - - return [int(i[0]) for i in self.query(statement)] - #get_NM_version - - def getAllFields(self, mrnaAcc, version=None, selector=None, selector_version=None): - """ - Get all Fields of an accession number and version number. - If the version number is None, use the "newest" version number. - - Optionally also provide a gene and transcript version to select. - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - @arg version: The version number - @type version: integer - - @return: The version number - @rtype: integer - - @todo: The 'order by chrom asc' is a quick hack to make sure we first - get a primary assembly mapping instead of some haplotype mapping - for genes in the HLA cluster. - A better fix is to return the entire list of mappings, and/or - remove all secondary mappings for the HLA cluster. - See also test_converter.test_hla_cluster and bug #58. - """ - q = """ - select transcript, - selector, selector_version, - start, stop, - cds_start, cds_stop, - exon_starts, exon_stops, - gene, chromosome, - orientation, protein - from Mapping - """ - - where = [] - order = [] - args = [] - - if version is None: - where.append('transcript = %s') - order.append('version desc') - order.append('chromosome asc') - args.append(mrnaAcc) - else: - where.append('transcript = %s') - where.append('version = %s') - order.append('chromosome asc') - args.append(mrnaAcc) - args.append(version) - - # The fallback to NULL selector (and selector_version) is necessary - # to accept transcript selection in NM references. To be safe, we also - # prefer entries with a match over entries with NULL. - # Example: NM_017780.2(CHD7_v001):c.109A>T - if selector is not None: - where.append('(selector = %s or (selector IS NULL and selector_version IS NULL and gene = %s and %s = 1))') - args.append(selector) - args.append(selector) - args.append(selector_version) - order.append('(selector is null) asc') - - if selector_version is not None: - where.append('(selector_version = %s or (selector_version IS NULL and selector_version IS NULL and gene = %s and %s = 1))') - args.append(selector_version) - args.append(selector) - args.append(selector_version) - order.append('(selector_version is null) asc') - - q += """ - where """ + ' AND '.join(where) + """ - order by """ + ', '.join(order) + ';' - - statement = q, tuple(args) - try: - return self.query(statement)[0] - except IndexError: - return None - - def get_Transcripts(self, chrom, p1, p2, overlap) : - """ - Get a list of transcripts, given a chromosome and a range. If - all transcripts that are hit should be returned, set overlap to 1, - if only the transcripts that completely reside within a range - should be returned, set overlap to 0. - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg chrom: The chromosome (coded as "chr1", ..., "chrY") - @type chrom: string - @arg p1: The position relative to the start of the chromosome - @type p1: integer - @arg p2: The position relative to the start of the chromosome - @type p2: integer - @arg overlap: Specify the behaviour of the selection: - - 0 ; Return only the transcripts that completely fall in the - range [p1, p2] - - 1 ; Return all hit transcripts - @type overlap: boolean - - @return: All accession numbers that are hit according to the overlap - criterium - @rtype: list - """ - q = """ - select transcript, - selector, selector_version, - start, stop, - cds_start, cds_stop, - exon_starts, exon_stops, - gene, chromosome, - orientation, protein, - version - from Mapping - """ - if overlap: - q += """ - WHERE chromosome = %s AND - start <= "%s" AND - stop >= "%s"; - """ - statement = q, (chrom, p2, p1) - - else: - q += """ - WHERE chromosome = %s AND - start >= "%s" AND - stop <= "%s"; - """ - statement = q, (chrom, p1, p2) - - return self.query(statement) - #get_Transcripts - - def get_TranscriptsByGeneName(self, gene): - """ - Get a list of transcripts, given a gene name. - - Arguments: - geneName ; Name of a gene. - - SQL tables from dbNames: - Mapping ; Accumulated mapping info. - - Returns: - list ; A list of transcripts. - """ - statement = """ - SELECT transcript, - selector, selector_version, - start, stop, - cds_start, cds_stop, - exon_starts, exon_stops, - gene, chromosome, - orientation, protein, - version - FROM Mapping - WHERE gene = %s; - """, gene - - return self.query(statement) - #get_TranscriptsByGeneName - - def get_GeneName(self, mrnaAcc) : - """ - Get the name of a gene, given a transcript identifier (NM number). - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The gene name - @rtype: string - """ - - statement = """ - SELECT gene - FROM Mapping - WHERE transcript = %s; - """, mrnaAcc - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #get_GeneName - - def isChrom(self, name) : - """ - Check if the given name is a valid chromosome name. - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg name: The name to be tested - @type name: string - - @return: True if the name is found to be a chromosome name, False - otherwise - @rtype: boolean - """ - - statement = """ - SELECT COUNT(*) - FROM Mapping - WHERE chromosome = %s; - """, name - - if int(self.query(statement)[0][0]) > 0 : - return True - return False - #isChrom - - def chromName(self, accNo) : - """ - Get the name of a chromosome, given an accession number. - - SQL tables from dbNames: - - ChrName ; Assembly release notes. - - @arg accNo: The accession number of a chromosome - @type accNo: string - - @return: The name of a chromosome - @rtype: string - """ - - statement = """ - SELECT name - FROM ChrName - WHERE AccNo = %s; - """, accNo - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #chromName - - def chromAcc(self, name) : - """ - Get the accession number of a chromosome, given a name. - - SQL tables from dbNames: - - ChrName ; Assembly release notes. - - @arg name: The name of a chromosome - @type name: string - - @return: The accession number of a chromosome - @rtype: string - """ - - statement = """ - SELECT AccNo, organelle_type - FROM ChrName - WHERE name = %s; - """, name - - ret = self.query(statement) - if ret : - return ret[0] - return None - #chromAcc - - def get_chromName(self, acc) : - """ - Get the chromosome name, given a transcript identifier (NM number). - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - @arg acc: The NM accession number (version NOT included) - @type acc: string - - @return: The chromosome name (e.g. chr1) - @rtype: string - """ - - statement = """ - SELECT chromosome - FROM Mapping - WHERE transcript = %s; - """, acc - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #get_chromName - - def merge_update(self): - """ - Merge existing mapping information with new mapping information, which - should be in table 'MappingTemp'. - - The strategy is as follows. Existing mappings (accumulated by - Mutalyzer in the past) that are not in the new mapping information are - added to the new mapping information. The resulting set is used as the - mapping information from now on. - - This way, we get the latest updates for existing mappings and keep old - mappings not in the updated information. - - SQL tables from dbNames: - - Mapping ; Accumulated mapping info. - - MappingTemp ; New mapping info. - - MappingBackup ; Backup of accumulated mapping info. - - @note: We temporarily suppress warnings during some queries, since - they are expected and clutter the console output (e.g. warnings - for existing tables). - @todo: Return number of entries added/updated. - """ - statement = """ - CREATE TABLE IF NOT EXISTS MappingTemp LIKE Mapping; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - - statement = """ - INSERT INTO MappingTemp - SELECT * FROM Mapping AS OldM - WHERE NOT EXISTS ( - SELECT * FROM MappingTemp AS NewM - WHERE OldM.transcript = NewM.transcript - AND OldM.version = NewM.version - ); - """, None - self.query(statement) - - statement = """ - DROP TABLE IF EXISTS MappingBackup; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - - statement = """ - RENAME TABLE Mapping TO MappingBackup, MappingTemp TO Mapping; - """, None - self.query(statement) - #merge_update - - def ncbi_create_temporary_tables(self): - """ - Create temporary tables to import NCBI mapping into. - - SQL tables from dbNames: - - Genes ; Gene names from NCBI. - - Transcripts ; Transcript mappings from NCBI. - - Exons ; Exon mappings from NCBI. - """ - self.ncbi_drop_temporary_tables() - - statement = """ - CREATE TABLE Genes ( - id varchar(20) NOT NULL DEFAULT '', - name varchar(255) DEFAULT NULL, - PRIMARY KEY (id) - ); - """, None - self.query(statement) - - statement = """ - CREATE TABLE Transcripts ( - name varchar(20) NOT NULL DEFAULT '', - gene_id varchar(20) DEFAULT NULL, - chromosome char(2) DEFAULT NULL, - start int(11) DEFAULT NULL, - stop int(11) DEFAULT NULL, - orientation char(1) DEFAULT NULL, - PRIMARY KEY (name,start) - ); - """, None - self.query(statement) - - statement = """ - CREATE TABLE Exons ( - transcript varchar(20) NOT NULL DEFAULT '', - chromosome char(2) DEFAULT NULL, - start int(11) DEFAULT NULL, - stop int(11) DEFAULT NULL, - cds_start int(11) DEFAULT NULL, - cds_stop int(11) DEFAULT NULL, - protein varchar(20) DEFAULT NULL, - PRIMARY KEY (transcript,start) - ); - """, None - self.query(statement) - #ncbi_create_temporary_table - - def ncbi_drop_temporary_tables(self): - """ - Drop temporary tables used for importing NCBI mapping information. - - SQL tables from dbNames: - - Genes ; Gene names from NCBI. - - Transcripts ; Transcript mappings from NCBI. - - Exons ; Exon mappings from NCBI. - """ - statement = """ - DROP TABLE IF EXISTS Genes, Transcripts, Exons; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - #ncbi_drop_temporary_tables - - def ncbi_import_gene(self, id, name): - """ - Import a (gene id, gene name) pair in a temporary table. - - SQL tables from dbNames: - - Genes ; Gene names from NCBI. - """ - statement = """ - INSERT IGNORE INTO Genes (id, name) VALUES (%s, %s); - """, (id, name) - - self.query(statement) - #ncbi_import_gene - - def ncbi_import_transcript(self, name, gene, chromosome, start, stop, - orientation): - """ - Import a transcript mapping in a temporary table. - - SQL tables from dbNames: - - Transcripts ; Transcript mappings from NCBI. - """ - statement = """ - INSERT IGNORE INTO Transcripts - (name, gene_id, chromosome, start, stop, orientation) - VALUES - (%s, %s, %s, %s, %s, %s); - """, (name, gene, chromosome, start, stop, orientation) - - self.query(statement) - #ncbi_import_transcript - - def ncbi_import_exon(self, transcript, chromosome, start, stop, cds_start, - cds_stop, protein): - """ - Import an exon mapping in a temporary table. - - SQL tables from dbNames: - - Exons ; Exon mappings from NCBI. - """ - statement = """ - INSERT IGNORE INTO Exons - (transcript, chromosome, start, stop, cds_start, cds_stop, protein) - VALUES - (%s, %s, %s, %s, %s, %s, %s); - """, (transcript, chromosome, start, stop, cds_start, cds_stop, protein) - - self.query(statement) - #ncbi_import_exon - - def ncbi_aggregate_mapping(self): - """ - Aggregate gene, transcript and exon mapping information from the NCBI - into one table. - - @note: Default MySQL value for group_concat_max_len is 1024, meaning - that the GROUP_CONCAT aggregate function returns values of at most - 1024 bytes long. This is not enough (currently we need around 3000 - bytes), so we explicitely set this to a higher value. - @note: We use MAX(E.protein) since MySQL does not have an ANY() - aggregator. - @note: Some genes (e.g. in the PAR) are mapped on both the X and Y - chromosomes. Therefore, we group not only by transcript name, but - also by chromosome, and add conditions on exon positions. The flaw - here is that we miss genes that are mapped to two locations on one - chromosome, but I don't think we have any of those. - """ - statement = """ - SET group_concat_max_len = 32768; - """, None - self.query(statement) - - statement = """ - DROP TABLE IF EXISTS MappingTemp; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - - statement = """ - CREATE TABLE MappingTemp LIKE Mapping; - """, None - self.query(statement) - - statement = """ - INSERT INTO MappingTemp - SELECT - G.name as gene, - SUBSTRING(T.name FROM 1 FOR LOCATE('.', T.name) - 1) as transcript, - SUBSTRING(T.name FROM LOCATE('.', T.name) + 1) as version, - NULL as selector, - NULL as selector_version, - CONCAT('chr', T.chromosome) as chromosome, - T.orientation as orientation, - MIN(T.start) as start, - MAX(T.stop) as stop, - REPLACE(MIN(COALESCE(E.cds_start, 1000000000)), 1000000000, NULL) as cds_start, - MAX(E.cds_stop) as cds_stop, - GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts, - GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops, - MAX(E.protein) as protein, - 'NCBI' as source - FROM Transcripts as T, Genes as G, Exons as E - WHERE T.gene_id = G.id AND T.name = E.transcript - AND E.chromosome = T.chromosome AND E.start >= T.start AND E.stop <= T.stop - GROUP BY T.name, T.chromosome; - """, None - self.query(statement) - #ncbi_aggregate_mapping - - def ucsc_load_mapping(self, transcripts, overwrite=False): - """ - Load given transcripts into the 'MappingTemp' table. - - Todo: Don't overwrite NCBI/reference entries, but *do* overwrite - existing UCSC entries. - """ - statement = """ - DROP TABLE IF EXISTS MappingTemp; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - - statement = """ - CREATE TABLE MappingTemp LIKE Mapping; - """, None - self.query(statement) - - for (gene, transcript, version, chromosome, orientation, start, - stop, cds_start, cds_stop, exon_starts, exon_stops, protein) in transcripts: - exon_starts = ','.join(str(i) for i in exon_starts) - exon_stops = ','.join(str(i) for i in exon_stops) - - statement = """ - INSERT INTO `MappingTemp` - (`gene`, `transcript`, `version`, `chromosome`, `orientation`, - `start`, `stop`, `cds_start`, `cds_stop`, `exon_starts`, `exon_stops`, - `protein`, `source`) - SELECT - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s - FROM DUAL WHERE NOT EXISTS - (SELECT * FROM `Mapping` - WHERE `transcript` = %s AND `version` = %s AND 1 = %s); - """, (gene, transcript, version, chromosome, orientation, start, - stop, cds_start, cds_stop, exon_starts, exon_stops, protein, - 'UCSC', transcript, version, int(overwrite) + 1) - self.query(statement) - #ucsc_load_mapping - - def reference_load_mapping(self, transcripts, overwrite=False): - """ - Load given transcripts into the 'MappingTemp' table. - - Todo: Don't overwrite NCBI/UCSC entries, but *do* overwrite existing - reference entries. - """ - statement = """ - DROP TABLE IF EXISTS MappingTemp; - """, None - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.query(statement) - - statement = """ - CREATE TABLE MappingTemp LIKE Mapping; - """, None - self.query(statement) - - for t in transcripts: - exon_starts = ','.join(str(i) for i in t['exon_starts']) - exon_stops = ','.join(str(i) for i in t['exon_stops']) - statement = """ - INSERT INTO `MappingTemp` - (`gene`, `transcript`, `version`, `selector`, `selector_version`, - `chromosome`, `orientation`, `start`, `stop`, `cds_start`, `cds_stop`, - `exon_starts`, `exon_stops`, `protein`, `source`) - SELECT - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'reference' - FROM DUAL WHERE NOT EXISTS - (SELECT * FROM `Mapping` - WHERE `transcript` = %s AND `version` = %s AND 1 = %s); - """, (t['gene'], t['transcript'], t['version'], t['selector'], - t['selector_version'], t['chromosome'], t['orientation'], - t['start'], t['stop'], t['cds_start'], t['cds_stop'], - exon_starts, exon_stops, - t['protein'], t['transcript'], t['version'], int(overwrite) + 1) - self.query(statement) - #reference_load_mapping -#Mapping - - class Counter(Db): """ Database functions for the service counters. diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index 3fd52c96..b589c0db 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -23,7 +23,7 @@ from sqlalchemy import func import mutalyzer from mutalyzer.config import settings from mutalyzer.db import queries, session -from mutalyzer.db.models import BatchJob, BatchQueueItem +from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem from mutalyzer import variantchecker from mutalyzer.grammar import Grammar from mutalyzer.output import Output @@ -92,6 +92,7 @@ class Scheduler() : """ # Mail is set to 'test@test.test" if the batch job was submitted from # a unit test. + # Todo: Use `settings.TESTING` instead. if mailTo == 'test@test.test': return @@ -560,7 +561,8 @@ Mutalyzer batch scheduler""" % url) if not skip : try : #process - converter = Converter(batch_job.argument, O) + assembly = Assembly.get(int(batch_job.argument)) + converter = Converter(assembly, O) #Also accept chr accNo variant = converter.correctChrVariant(variant) diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index 7e04dd5f..bf622349 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -5,7 +5,7 @@ pointed-to by the `MUTALYZER_SETTINGS` environment variable. # Use Mutalyzer in debug mode. -DEBUG = True +DEBUG = False # We are running unit tests. TESTING = False @@ -38,11 +38,8 @@ MYSQL_USER = 'mutalyzer' # Local MySQL database name. MYSQL_DATABASE = 'mutalyzer' -# Available databases with mapping information. -DB_NAMES = ['hg18', 'hg19', 'mm10'] - -# Default database for mapping information. -DEFAULT_DB = 'hg19' +# Default genome assembly (by name or alias). +DEFAULT_ASSEMBLY = 'hg19' # Name and location of the log file. import os diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 33587f8a..654fba73 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -7,11 +7,10 @@ from datetime import datetime import sqlite3 import uuid -from sqlalchemy import (event, Column, Boolean, BigInteger, DateTime, ForeignKey, - Integer, Numeric, String, Table, Text, Index, Enum, - UniqueConstraint) +from sqlalchemy import (event, Boolean, Column, DateTime, Enum, ForeignKey, + Index, Integer, String, Text, TypeDecorator) from sqlalchemy.engine import Engine -from sqlalchemy.orm import relationship, backref +from sqlalchemy.orm import backref, relationship from mutalyzer import db @@ -35,6 +34,30 @@ def set_sqlite_pragma(dbapi_connection, connection_record): cursor.close() +class Positions(TypeDecorator): + """ + Represents an immutable list of integers as a concatentation of their + string serializations. + + Adapted from the `Marshal JSON Strings + <http://docs.sqlalchemy.org/en/latest/core/types.html#marshal-json-strings>`_ + example in the SQLAlchemy documentation. + """ + impl = Text + + def process_bind_param(self, value, dialect): + if value is not None: + value = ','.join(str(i) for i in value) + return value + + def process_result_value(self, value, dialect): + if value is None: + return None + if value == '': + return [] + return [int(s) for s in value.split(',')] + + class BatchJob(db.Base): """ Batch job. @@ -54,8 +77,8 @@ class BatchJob(db.Base): #: Type of batch job. job_type = Column(Enum(*BATCH_JOB_TYPES, name='job_type'), nullable=False) - #: Optional argument (currently only used with job type PositionConverter, - #: where it denotes the assembly). + #: Optional argument (currently only used when `job_type` is + #: ``PositionConverter``, where it denotes the assembly). argument = Column(String(20)) #: Identifier to use in the job result filename and thus the URL for @@ -95,15 +118,15 @@ class BatchQueueItem(db.Base): #: Input line from a batch job. item = Column(String(200), nullable=False) - #: A list of flags set for this item. A flag consists of either an A, S or - #: C followed by a digit, which refers to the reason of alteration/skip. - #: We simply store the concatenation of these flags. + #: A list of flags set for this item. A flag consists of either an ``A``, + #: ``S`` or ``C`` followed by a digit, which refers to the reason of + #: alteration/skip. We simply store the concatenation of these flags. flags = Column(String(20), nullable=False) #: The :class:`BatchJob` for this item. batch_job = relationship( BatchJob, - backref=backref('batch_jobs', lazy='dynamic', + backref=backref('batch_queue_items', lazy='dynamic', cascade='all, delete-orphan', passive_deletes=True)) @@ -130,7 +153,8 @@ class Reference(db.Base): id = Column(Integer, primary_key=True) #: Accession number for this reference, including the version number if - #: applicable (e.g., AL449423.14, NM_000059.3, UD_138781341344). + #: applicable (e.g., ``AL449423.14``, ``NM_000059.3``, + #: ``UD_138781341344``). accession = Column(String(20), nullable=False, index=True, unique=True) #: MD5 checksum of the reference file. @@ -195,13 +219,13 @@ class TranscriptProteinLink(db.Base): id = Column(Integer, primary_key=True) #: Accession number for the transcript, not including the version number - #: (e.g., NM_018195, XM_005270562, NR_015380). + #: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``). transcript_accession = Column(String(20), nullable=False, index=True, unique=True) #: Accession number for the protein, not including the version number - #: (e.g., NP_060665, XP_005258635). If `NULL`, the record states that no - #: protein is linked to the transcript by the NCBI. + #: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states + #: that no protein is linked to the transcript by the NCBI. protein_accession = Column(String(20), index=True) #: Date and time of creation. @@ -217,10 +241,363 @@ class TranscriptProteinLink(db.Base): % (self.transcript_accession, self.protein_accession) +class Assembly(db.Base): + """ + Genome assembly. + """ + __tablename__ = 'assemblies' + __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} + + id = Column(Integer, primary_key=True) + + #: Genome Reference Consortium name (e.g., ``GRCh37``, ``GRCm38``). + name = Column(String(30), nullable=False, unique=True) + + #: Short alias (e.g., ``hg19``, ``mm10``). + alias = Column(String(10), unique=True) + + #: NCBI taxonomy identifier (e.g., 9606, 10090). + taxonomy_id = Column(Integer, nullable=False) + + #: NCBI taxonomy name (e.g., ``Homo sapiens``, ``Mus musculus``). + taxonomy_common_name = Column(String(50), nullable=False) + + def __init__(self, name, taxonomy_id, taxonomy_common_name, alias=None): + self.name = name + self.taxonomy_id = taxonomy_id + self.taxonomy_common_name = taxonomy_common_name + self.alias = alias + + def __repr__(self): + return '<Assembly %r taxonomy=%r>' % (self.name, + self.taxonomy_common_name) + + +class Chromosome(db.Base): + """ + Chromosome name and accession number in a genome assembly. + """ + __tablename__ = 'chromosomes' + __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} + + id = Column(Integer, primary_key=True) + assembly_id = Column(Integer, + ForeignKey('assemblies.id', ondelete='CASCADE'), + nullable=False) + + #: Chromosome name (e.g., ``chr1``, ``chrM``). + name = Column(String(30), nullable=False) + + #: Chromosome accession number (e.g., ``NC_000001.10``, ``NC_012920.1``). + accession = Column(String(30), nullable=False) + + #: Type of organelle. + organelle_type = Column(Enum('chromosome', 'mitochondrion', + name='organelle_type')) + + #: The :class:`Assembly` this chromosome is in. + assembly = relationship( + Assembly, + lazy='joined', + innerjoin=True, + backref=backref('chromosomes', lazy='dynamic', + cascade='all, delete-orphan', + passive_deletes=True)) + + def __init__(self, assembly, name, accession, organelle_type): + self.assembly = assembly + self.name = name + self.accession = accession + self.organelle_type = organelle_type + + def __repr__(self): + return '<Chromosome %r accession=%r>' % (self.name, self.accession) + + +Index('chromosome_name', + Chromosome.assembly_id, Chromosome.name, + unique=True) +Index('chromosome_accession', + Chromosome.assembly_id, Chromosome.accession, + unique=True) + + +class TranscriptMapping(db.Base): + """ + Mapping of a gene transcript on a chromosome. + + .. note:: All positions are ordered according to the chromosome, + irrespective of transcript orientation. For example, `start` is always + smaller than `stop`. + """ + __tablename__ = 'transcript_mappings' + __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} + + id = Column(Integer, primary_key=True) + chromosome_id = Column(Integer, + ForeignKey('chromosomes.id', ondelete='CASCADE'), + nullable=False) + + #: Type of reference sequence containing the transcript. + reference_type = Column(Enum('refseq', 'lrg', name='reference_type'), + nullable=False) + + #: Accession number for the transcript reference, not including the + #: version number (e.g., ``NM_000020``, ``NC_012920``, ``LRG_14``). + accession = Column(String(20), nullable=False) + + #: Accession version (e.g., 3, 2). Not applicaple when `reference_type` is + #: ``lrg``. + version = Column(Integer) + + #: Gene symbol (e.g., ``DMD``, ``PSEN1``, ``TRNS1``). + gene = Column(String(30), nullable=False) + + #: Transcript number in reference (e.g., 1, 2). + transcript = Column(Integer, nullable=False) + + # Todo: Use constants and some utilities for conversion. + #: The orientation of the transcript on the chromosome. + orientation = Column(Enum('forward', 'reverse', name='orentation'), + nullable=False) + + #: The start position of the transcript on the chromosome. + start = Column(Integer, nullable=False) + + #: The stop position of the transcript on the chromosome. + stop = Column(Integer, nullable=False) + + #: The CDS start position of the transcript on the chromosome. + cds_start = Column(Integer) + + #: The CDS stop position of the transcript on the chromosome. + cds_stop = Column(Integer) + + #: The exon start positions of the transcript on the chromosome. + exon_starts = Column(Positions, nullable=False) + + #: The exon start positions of the transcript on the chromosome. + exon_stops = Column(Positions, nullable=False) + + #: If `False`, variant descriptions can use just the accession number + #: without gene and transcript selector (e.g., ``NM_000020:c.1del``, + #: ``LRG_1:c.1del``). If `True`, gene and transcript selection is + #: necessary (e.g., ``NC_012920(TRNI_v001):c.1del``, ``LRG_1_t1:c.1del``). + select_transcript = Column(Boolean, nullable=False) + + #: Source of the mapping. + source = Column(Enum('ucsc', 'ncbi', 'reference', name='source'), + nullable=False) + + #: The :class:`Assembly` this chromosome is in. + chromosome = relationship( + Chromosome, + lazy='joined', + innerjoin=True, + backref=backref('transcript_mappings', lazy='dynamic', + cascade='all, delete-orphan', + passive_deletes=True)) + + def __init__(self, chromosome, reference_type, accession, gene, + orientation, start, stop, exon_starts, exon_stops, source, + transcript=1, cds=None, select_transcript=False, + version=None): + self.chromosome = chromosome + self.reference_type = reference_type + self.accession = accession + self.gene = gene + self.orientation = orientation + self.start = start + self.stop = stop + self.exon_starts = exon_starts + self.exon_stops = exon_stops + self.source = source + self.transcript = transcript + self.cds = cds + self.select_transcript = select_transcript + self.version = version + + def __repr__(self): + return ('<TranscriptMapping accession=%r version=%r gene=%r ' + 'transcript=%r>' + % (self.accession, self.version, self.gene, self.transcript)) + + @classmethod + def create_or_update(cls, chromosome, reference_type, accession, gene, + orientation, start, stop, exon_starts, exon_stops, + source, transcript=1, cds=None, + select_transcript=False, version=None): + """ + Returns an existing :class:`TranscriptMapping` instance with the given + values for `chromosome`, `accession`, `version`, `gene`, and + `transcript` if it exists, or a new instance otherwise. + + .. note:: Unfortunately, SQLAlchemy does not have `ON DUPLICATE KEY + UPDATE` functionality, which would performance-wise be the best + way to update transcript mappings. This class method implements an + alternative albeit at the cost of querying the table for an + existing entry on each update. + """ + instance = cls.query.filter_by( + chromosome=chromosome, accession=accession, version=version, + gene=gene, transcript=transcript).first() + if instance is None: + instance = cls(chromosome, reference_type, accession, gene, + orientation, start, stop, exon_starts, exon_stops, + source, transcript=transcript, cds=cds, + select_transcript=select_transcript, + version=version) + else: + instance.reference_type = reference_type + instance.orientation = orientation + instance.start = start + instance.stop = stop + instance.exon_starts = exon_starts + instance.exon_stops = exon_stops + instance.source = source + instance.cds = cds + instance.select_transcript = select_transcript + return instance + + @property + def coding(self): + """ + Set to `True` iff the transcript is coding. + """ + return self.cds_start is not None and self.cds_stop is not None + + @property + def cds(self): + """ + Tuple of CDS start and stop positions on the chromosome, or `None` if + the transcript is non-coding. + """ + if self.coding: + return self.cds_start, self.cds_stop + return None + + @cds.setter + def cds(self, cds): + self.cds_start, self.cds_stop = cds or (None, None) + + +Index('transcript_mapping_transcript', + TranscriptMapping.accession, TranscriptMapping.version, + TranscriptMapping.gene, TranscriptMapping.transcript, + TranscriptMapping.chromosome_id, + unique=True) + + def create_all(): db.Base.metadata.drop_all(db.session.get_bind()) db.Base.metadata.create_all(db.session.get_bind()) + assembly = Assembly('GRCh37', 9606, 'Homo sapiens', alias='hg19') + db.session.add(assembly) + + for accession, name, organelle_type in [ + ('NC_000001.10', 'chr1', 'chromosome'), + ('NC_000002.11', 'chr2', 'chromosome'), + ('NC_000003.11', 'chr3', 'chromosome'), + ('NC_000004.11', 'chr4', 'chromosome'), + ('NC_000005.9', 'chr5', 'chromosome'), + ('NC_000006.11', 'chr6', 'chromosome'), + ('NC_000007.13', 'chr7', 'chromosome'), + ('NC_000008.10', 'chr8', 'chromosome'), + ('NC_000009.11', 'chr9', 'chromosome'), + ('NC_000010.10', 'chr10', 'chromosome'), + ('NC_000011.9', 'chr11', 'chromosome'), + ('NC_000012.11', 'chr12', 'chromosome'), + ('NC_000013.10', 'chr13', 'chromosome'), + ('NC_000014.8', 'chr14', 'chromosome'), + ('NC_000015.9', 'chr15', 'chromosome'), + ('NC_000016.9', 'chr16', 'chromosome'), + ('NC_000017.10', 'chr17', 'chromosome'), + ('NC_000018.9', 'chr18', 'chromosome'), + ('NC_000019.9', 'chr19', 'chromosome'), + ('NC_000020.10', 'chr20', 'chromosome'), + ('NC_000021.8', 'chr21', 'chromosome'), + ('NC_000022.10', 'chr22', 'chromosome'), + ('NC_000023.10', 'chrX', 'chromosome'), + ('NC_000024.9', 'chrY', 'chromosome'), + ('NT_167244.1', 'chr6_apd_hap1', 'chromosome'), + ('NT_113891.2', 'chr6_cox_hap2', 'chromosome'), + ('NT_167245.1', 'chr6_dbb_hap3', 'chromosome'), + ('NT_167246.1', 'chr6_mann_hap4', 'chromosome'), + ('NT_167247.1', 'chr6_mcf_hap5', 'chromosome'), + ('NT_167248.1', 'chr6_qbl_hap6', 'chromosome'), + ('NT_167249.1', 'chr6_ssto_hap7', 'chromosome'), + ('NT_167250.1', 'chr4_ctg9_hap1', 'chromosome'), + ('NT_167251.1', 'chr17_ctg5_hap1', 'chromosome'), + ('NC_012920.1', 'chrM', 'mitochondrion')]: + chromosome = Chromosome(assembly, name, accession, organelle_type) + db.session.add(chromosome) + + assembly = Assembly('GRCh36', 9606, 'Homo sapiens', alias='hg18') + db.session.add(assembly) + + for accession, name, organelle_type in [ + ('NC_000001.9', 'chr1', 'chromosome'), + ('NC_000002.10', 'chr2', 'chromosome'), + ('NC_000003.10', 'chr3', 'chromosome'), + ('NC_000004.10', 'chr4', 'chromosome'), + ('NC_000005.8', 'chr5', 'chromosome'), + ('NC_000006.10', 'chr6', 'chromosome'), + ('NC_000007.12', 'chr7', 'chromosome'), + ('NC_000008.9', 'chr8', 'chromosome'), + ('NC_000009.10', 'chr9', 'chromosome'), + ('NC_000010.9', 'chr10', 'chromosome'), + ('NC_000011.8', 'chr11', 'chromosome'), + ('NC_000012.10', 'chr12', 'chromosome'), + ('NC_000013.9', 'chr13', 'chromosome'), + ('NC_000014.7', 'chr14', 'chromosome'), + ('NC_000015.8', 'chr15', 'chromosome'), + ('NC_000016.8', 'chr16', 'chromosome'), + ('NC_000017.9', 'chr17', 'chromosome'), + ('NC_000018.8', 'chr18', 'chromosome'), + ('NC_000019.8', 'chr19', 'chromosome'), + ('NC_000020.9', 'chr20', 'chromosome'), + ('NC_000021.7', 'chr21', 'chromosome'), + ('NC_000022.9', 'chr22', 'chromosome'), + ('NC_000023.9', 'chrX', 'chromosome'), + ('NC_000024.8', 'chrY', 'chromosome'), + ('NT_113891.1', 'chr6_cox_hap1', 'chromosome'), + ('NT_113959.1', 'chr22_h2_hap1', 'chromosome'), + ('NC_001807.4', 'chrM', 'mitochondrion')]: + chromosome = Chromosome(assembly, name, accession, organelle_type) + db.session.add(chromosome) + + assembly = Assembly('GRCm38', 10090, 'Mus musculus', alias='mm10') + db.session.add(assembly) + + for accession, name, organelle_type in [ + ('NC_000067.65', 'chr1', 'chromosome'), + ('NC_000068.70', 'chr2', 'chromosome'), + ('NC_000069.60', 'chr3', 'chromosome'), + ('NC_000070.66', 'chr4', 'chromosome'), + ('NC_000071.65', 'chr5', 'chromosome'), + ('NC_000072.60', 'chr6', 'chromosome'), + ('NC_000073.61', 'chr7', 'chromosome'), + ('NC_000074.60', 'chr8', 'chromosome'), + ('NC_000075.60', 'chr9', 'chromosome'), + ('NC_000076.60', 'chr10', 'chromosome'), + ('NC_000077.60', 'chr11', 'chromosome'), + ('NC_000078.60', 'chr12', 'chromosome'), + ('NC_000079.60', 'chr13', 'chromosome'), + ('NC_000080.60', 'chr14', 'chromosome'), + ('NC_000081.60', 'chr15', 'chromosome'), + ('NC_000082.60', 'chr16', 'chromosome'), + ('NC_000083.60', 'chr17', 'chromosome'), + ('NC_000084.60', 'chr18', 'chromosome'), + ('NC_000085.60', 'chr19', 'chromosome'), + ('NC_000086.71', 'chrX', 'chromosome'), + ('NC_000087.74', 'chrY', 'chromosome'), + ('NC_005089.1', 'chrM', 'mitochondrion')]: + chromosome = Chromosome(assembly, name, accession, organelle_type) + db.session.add(chromosome) + + db.session.commit() + # Todo: Use alembic. # if using alembic: diff --git a/mutalyzer/entrypoints/mapping_import.py b/mutalyzer/entrypoints/mapping_import.py index 4982ad1a..1d72523e 100644 --- a/mutalyzer/entrypoints/mapping_import.py +++ b/mutalyzer/entrypoints/mapping_import.py @@ -5,30 +5,69 @@ reference. import argparse +from operator import attrgetter +import sys +from sqlalchemy import or_ + +from ..db import session +from ..db.models import Assembly, TranscriptMapping from .. import output -from .. import mapping +from .. import Retriever -def import_gene(database, gene): +def import_gene(assembly, gene): """ Update the database with information from the UCSC. - .. todo: Also report how much was added/updated. + .. todo: This is not really tested. """ o = output.Output(__file__) o.addMessage(__file__, -1, 'INFO', 'Starting UCSC mapping data update (gene: %s)' % gene) - updater = mapping.UCSCUpdater(database) - updater.load(gene) - updater.merge() + connection = MySQLdb.connect(user='genome', + host='genome-mysql.cse.ucsc.edu', + db=assembly.alias) + + query = """ + SELECT DISTINCT + acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, + exonEnds, name2 AS geneName, chrom, strand, protAcc + FROM gbStatus, refGene, refLink + WHERE type = "mRNA" + AND refGene.name = acc + AND acc = mrnaAcc + AND name2 = %s + """ + parameters = gene + + cursor = connection.cursor() + cursor.execute(query, parameters) + result = cursor.fetchall() + cursor.close() + + for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, + geneName, chrom, strand, protAcc) in result: + orientation = 'reverse' if strand == '-' else 'forward' + exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i] + exon_stops = [int(i) for i in exonEnds.split(',') if i] + if cdsStart and cdsEnd: + cds = cdsStart + 1, cdsEnd + else: + cds = None + mapping = TranscriptMapping.create_or_update( + chrom, 'refseq', acc, gene.name, orientation, txStart + 1, txEnd, + exon_starts, exon_stops, 'ucsc', cds=cds, version=int(version)) + session.add(mapping) + + session.commit() o.addMessage(__file__, -1, 'INFO', 'UCSC mapping data update end (gene: %s)' % gene) -def import_reference(database, reference): +def import_reference(assembly, reference): """ Update the database with information from the given reference. @@ -41,9 +80,40 @@ def import_reference(database, reference): o.addMessage(__file__, -1, 'INFO', 'Starting reference mapping data update (reference: %s)' % reference) - updater = mapping.ReferenceUpdater(database) - updater.load(reference, o) - updater.merge() + chromosome = assembly.chromosomes.filter_by(name='chrM').one() + + retriever = Retriever.GenBankRetriever(o) + record = retriever.loadrecord(reference) + + select_transcript = len(record.geneList) > 1 + + for gene in record.geneList: + # We support exactly one transcript per gene. + try: + transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0] + except IndexError: + continue + + # We use gene.location for now, it is always present and the same + # for our purposes. + #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1] + start, stop = gene.location + + orientation = 'reverse' if gene.orientation == -1 else 'forward' + + try: + cds = transcript.CDS.location + except AttributeError: + cds = None + + mapping = TranscriptMapping.create_or_update( + chromosome, 'refseq', record.source_accession, gene.name, + orientation, start, stop, [start], [stop], 'reference', cds=cds, + select_transcript=select_transcript, + version=int(record.source_version)) + session.add(mapping) + + session.commit() o.addMessage(__file__, -1, 'INFO', 'Reference mapping data update end (reference: %s)' % reference) @@ -55,8 +125,8 @@ def main(): """ database_parser = argparse.ArgumentParser(add_help=False) database_parser.add_argument( - '-d', '--database', metavar='DATABASE', dest='database', - default='hg19', help='database to import to (default: hg19)') + '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias', + default='hg19', help='assembly to import to (default: hg19)') parser = argparse.ArgumentParser( description='Mutalyzer mapping importer.', @@ -87,10 +157,18 @@ def main(): 'NC_012920.1)') args = parser.parse_args() + + assembly = Assembly.query \ + .filter(or_(Assembly.name == args.assembly_name_or_alias, + Assembly.alias == args.assembly_name_or_alias)).first() + if not assembly: + sys.stderr.write('Invalid assembly: %s\n' % (assembly_name_or_alias,)) + sys.exit(1) + if args.subcommand == 'gene': - import_gene(args.database, args.gene) + import_gene(assembly, args.gene) if args.subcommand == 'reference': - import_reference(args.database, args.reference) + import_reference(assembly, args.reference) if __name__ == '__main__': diff --git a/mutalyzer/entrypoints/mapping_update.py b/mutalyzer/entrypoints/mapping_update.py index d63fbe7b..a7848bda 100644 --- a/mutalyzer/entrypoints/mapping_update.py +++ b/mutalyzer/entrypoints/mapping_update.py @@ -7,27 +7,160 @@ This program is intended to be run daily from cron. Example: """ +# Todo: Merge this script with mapping_import, the difference between the two +# makes no sense. + + import argparse +from collections import defaultdict +from itertools import groupby +from operator import itemgetter +import sys -from .. import output -from .. import mapping +from sqlalchemy import or_ +from ..db import session +from ..db.models import Assembly, Chromosome, TranscriptMapping -def update_mapping(database, mapping_file, assembly): - """ - Update the database with information from the NCBI. - .. todo: Also report how much was added/updated. +COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig', + 'ctg_start', 'ctg_stop', 'ctg_orientation', 'feature_name', + 'feature_id', 'feature_type', 'group_label', 'transcript', + 'evidence_code'] + + +def update_mapping(mappings_path, group_label, assembly_name_or_alias): """ - o = output.Output(__file__) - o.addMessage(__file__, -1, 'INFO', - 'Starting NCBI mapping data update') + Update the database with information from a NCBI `seq_gene.md` file. + + We require that this file is first sorted on the `feature_id` column + (#11), which always contains the gene identifier, and then on the + `chromosome` column (#2). + + sort -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md + + Load NCBI mapping information from {mapping_file} into the database. - updater = mapping.NCBIUpdater(database) - updater.load(mapping_file, assembly) - updater.merge() + The NCBI mapping file consists of entries, one per line, in order of + their location in the genome (more specifically by start location). + Every entry has a 'group_label' column, denoting the assembly it is + from. We only use entries where this value is `group_label`. - o.addMessage(__file__, -1, 'INFO', 'NCBI mapping data update end') + There are four types of entries (for our purposes): + - Gene: Name, identifier, and location of a gene. + - Transcript: Name, gene id, and location of a transcript. + - UTR: Location and transcript of a non-coding exon (or part of it). + - CDS: Location and transcript of a coding exon (or part of it). + + A bit troublesome for us is that exons are split in UTR exons and CDS + exons, with exons overlapping the UTR/CDS border defined as two + separate entries (one of type UTR and one of type CDS). + + Another minor annoyance is that some transcripts (~ 15) are split over + two contigs (NT_*). In that case, they are defined by two entries in + the file, where we should merge them by taking the start position of + the first and the stop position of the second. + + To complicate this annoyance, some genes (e.g. in the PAR) are mapped + on both the X and Y chromosomes, but stored in the file just like the + transcripts split over two contigs. However, these ones should of + course not be merged. + + Our strategy is too sort by gene and chromosome and process the file + grouped by these two fields. + """ + assembly = Assembly.query \ + .filter(or_(Assembly.name == assembly_name_or_alias, + Assembly.alias == assembly_name_or_alias)).first() + if not assembly: + sys.stderr.write('Invalid assembly: %s\n' % (assembly_name_or_alias,)) + sys.exit(1) + + chromosomes = assembly.chromosomes.all() + + def read_records(mappings_file): + for line in mappings_file: + if line.startswith('#'): + continue + record = dict(zip(COLUMNS, line.rstrip().split('\t'))) + + # Only use records from the given assembly. + if record['group_label'] != group_label: + continue + + # Only use records on chromosomes we know. + try: + record['chromosome'] = next(c for c in chromosomes if + c.name == 'chr' + record['chromosome']) + except StopIteration: + continue + + record['start'] = int(record['start']) + record['stop'] = int(record['stop']) + + yield record + + def build_mappings(records): + # We structure the records per transcript and per record type. This is + # generalized to a list of records for each type, but we expect only + # one GENE record (with `-` as transcript value). + # Note that there can be more than one RNA record per transcript if it + # is split over different reference contigs. + by_transcript = defaultdict(lambda: defaultdict(list)) + for r in records: + by_transcript[r['transcript']][r['feature_type']].append(r) + + gene = by_transcript['-']['GENE'][0]['feature_name'] + + for transcript, by_type in by_transcript.items(): + if transcript == '-': + continue + accession, version = transcript.split('.') + version = int(version) + chromosome = by_type['RNA'][0]['chromosome'] + orientation = 'reverse' if by_type['RNA'][0]['orientation'] == '-' else 'forward' + start = min(t['start'] for t in by_type['RNA']) + stop = max(t['stop'] for t in by_type['RNA']) + + exon_starts = [] + exon_stops = [] + cds_positions = [] + for exon in sorted(by_type['UTR'] + by_type['CDS'], + key=itemgetter('start')): + if exon_stops and exon_stops[-1] > exon['start'] - 1: + # This exon starts before the end of the previous exon. We + # have no idea what to do in this case, so we ignore it. + # The number of transcripts affected is very small (e.g., + # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly). + continue + if exon['feature_type'] == 'CDS': + cds_positions.extend([exon['start'], exon['stop']]) + if exon_stops and exon_stops[-1] == exon['start'] - 1: + # This exon must be merged with the previous one because + # it is split over two entries (a CDS part and a UTR part + # or split over different reference contigs). + exon_stops[-1] = exon['stop'] + else: + exon_starts.append(exon['start']) + exon_stops.append(exon['stop']) + + if cds_positions: + cds = min(cds_positions), max(cds_positions) + else: + cds = None + + yield TranscriptMapping.create_or_update( + chromosome, 'refseq', accession, gene, orientation, start, + stop, exon_starts, exon_stops, 'ncbi', cds=cds, + version=version) + + with open(mappings_path) as mappings_file: + for _, records in groupby(read_records(mappings_file), + itemgetter('feature_id', 'chromosome')): + for mapping in build_mappings(records): + session.add(mapping) + + session.commit() def main(): @@ -38,18 +171,17 @@ def main(): description='Mutalyzer mapping updater.') parser.add_argument( - 'mapping', metavar='FILE', + 'mappings_path', metavar='FILE', help='Path to the NCBI mapping information (example: seq_gene.md)') parser.add_argument( - 'assembly', metavar='ASSEMBLY', - help='use only entries from this assembly (this is the group_name ' - 'column in the NCBI mapping file)') + 'group_label', metavar='GROUP_LABEL', + help='use only entries with this group label') parser.add_argument( - '-d', '--database', metavar='DATABASE', dest='database', - default='hg19', help='database to update (default: hg19)') + 'assembly', metavar='ASSEMBLY', + help='import mappings into this assembly (example: hg19)') args = parser.parse_args() - update_mapping(args.database, args.mapping, args.assembly) + update_mapping(args.mappings_path, args.group_label, args.assembly) if __name__ == '__main__': diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 217b925e..c10c1545 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -16,6 +16,7 @@ from operator import attrgetter from Bio.Seq import reverse_complement import MySQLdb +from mutalyzer.db.models import Chromosome, TranscriptMapping from mutalyzer.grammar import Grammar from mutalyzer import Db from mutalyzer import Crossmap @@ -40,11 +41,11 @@ def _construct_change(var, reverse=False): try: arg1 = str(int(var.Arg1)) except ValueError: - arg1 = reverse_complement(var.Arg1 or '') + arg1 = reverse_complement(str(var.Arg1) or '') try: arg2 = str(int(var.Arg2)) except ValueError: - arg2 = reverse_complement(var.Arg2 or '') + arg2 = reverse_complement(str(var.Arg2) or '') else: arg1 = var.Arg1 arg2 = var.Arg2 @@ -82,29 +83,28 @@ class Converter(object) : @todo: Refactor anything using {mutalyzer.models} into the {webservice} module. """ - def __init__(self, build, O) : + def __init__(self, assembly, O) : """ Initialise the class. - @arg build: the genome build version of the organism (e.g. hg19 for + @arg assembly: the genome build version of the organism (e.g. hg19 for human genome build version 19) - @type build: string + @type assembly: string @arg O: output object @type O: object """ - self.build = None + self.assembly = assembly self.__output = O - self.__database = Db.Mapping(build) # Populated arguments self.parseTree = None self.crossmap = None - self.dbFields = {} + self.mapping = None #__init__ def _reset(self) : self.crossmap = None - self.dbFields = {} + self.mapping = None #_reset def _parseInput(self, variant) : @@ -133,61 +133,9 @@ class Converter(object) : return parseTree #_parseInput - def _populateFields(self, Fields) : - """ - Create a Mutalyzer compatible exon list. - - @todo: ADD Error Messages, unlikely that CDS info is missing. - - @arg Fields: dictionary with exon start and end positions taken from the - MySQL database - @type Fields: dictionary - - @return: Exon list - @rtype: list - """ - Fields["exon_starts"] = map(int, Fields["exon_starts"].split(',')) - Fields["exon_stops"] = map(int, Fields["exon_stops"].split(',')) - assert(len(Fields["exon_starts"]) == len(Fields["exon_stops"])) - - if Fields['cds_start'] and Fields['cds_stop']: - Fields["cds_start"] = int(Fields["cds_start"]) - Fields["cds_stop"] = int(Fields["cds_stop"]) - - # Create Mutalyzer compatible exon list - Fields["exons"] = [] - for exon in zip(Fields["exon_starts"], Fields["exon_stops"]) : - Fields["exons"].extend(exon) - - self.dbFields = Fields - return Fields - #_populateFields - - def _FieldsFromValues(self, values) : + def _get_mapping(self, acc, version=None, selector=None, selector_version=None) : """ - Combines labels with the given values to a dictionary. - (zip returns a list of tuples, where the i-th tuple contains the i-th - element from each of the argument sequences or iterables. - dict(arg) creates a new data dictionary, with items taken from arg.) - - @arg values: list of values take from the MySQL database - @type values: list - - @return: dictionary with values taken from the MySQL database - @rtype: dictionary - """ - - Fields = dict(zip( - ("transcript", "selector", "selector_version", "start", "stop", - "cds_start", "cds_stop", "exon_starts", "exon_stops", "gene", - "chromosome", "orientation", "protein", "version"), - values)) - return self._populateFields(Fields) - #_FieldsFromValues - - def _FieldsFromDb(self, acc, version, selector=None, selector_version=None) : - """ - Get data from database and populate dbFields dict. + Get data from database. @arg acc: NM_ accession number (without version) @type acc: string @@ -198,56 +146,68 @@ class Converter(object) : @kwarg selector_version: Optional transcript version selector. @type selector_version: int """ + versions = [m.version for m in TranscriptMapping.query.filter( + TranscriptMapping.accession == acc, + TranscriptMapping.version != None, + TranscriptMapping.chromosome.has(assembly=self.assembly))] - if not version : - version = 0 - version = int(version) - versions = self.__database.get_NM_version(acc) - if not versions : + if not versions: self.__output.addMessage(__file__, 4, "EACCNOTINDB", "The accession number %s could not be " "found in our database (or is not a transcript)." % acc) self.__output.addOutput("LOVDERR", "Reference sequence not found.") - return None #Explicit return of None in case of error - #if - else : - if version in versions : - Values = self.__database.getAllFields(acc, version, selector, selector_version) - if not Values: - self.__output.addMessage(__file__, 4, "EACCNOTINDB", - "The accession number %s version %s " - "with transcript %s version %s could not be found " - "in our database." % - (acc, version, selector, selector_version)) - return None - return self._FieldsFromValues(Values) - #if - if not version : - self.__output.addMessage(__file__, 4, "ENOVERSION", - "Version number missing for %s" % acc) - else : + return + + if version in versions: + mappings = TranscriptMapping.query.join(Chromosome).filter( + TranscriptMapping.accession == acc, TranscriptMapping.version == version, + Chromosome.assembly == self.assembly) + if selector: + mappings = mappings.filter_by(gene=selector) + if selector_version: + mappings = mappings.filter_by(transcript=selector_version) + + # Todo: The 'order by chrom asc' is a quick hack to make sure we + # first get a primary assembly mapping instead of some haplotype + # mapping for genes in the HLA cluster. + # A better fix is to return the entire list of mappings, and/or + # remove all secondary mappings for the HLA cluster. + # See also test_converter.test_hla_cluster and bug #58. + self.mapping = mappings.order_by(TranscriptMapping.version.desc(), + Chromosome.name.asc()).first() + + if not self.mapping: self.__output.addMessage(__file__, 4, "EACCNOTINDB", - "The accession number %s version %s " - "could not be found in our database (or is not a transcript)." % - (acc, version)) - self.__output.addMessage(__file__, 2, "WDIFFFOUND", - "We found these versions: %s" % - (", ".join("%s.%s" % (acc, i) for i in sorted(versions)))) - - #LOVD list of versions available - self.__output.addOutput("LOVDERR", - "Reference sequence version not found. " - "Available: %s" % - (", ".join("%s.%s" % (acc, i) for i in sorted(versions)))) - - #LOVD Only newest - #self.__output.addOutput("LOVDERR", - # "Reference sequence version not found. " - # "Available: %s.%s" % (acc, sorted(versions)[-1])) - return None - #else - #_FieldsFromDb + "The accession number %s version %s " + "with transcript %s version %s could not be found " + "in our database." % + (acc, version, selector, selector_version)) + return + + if not version: + self.__output.addMessage(__file__, 4, "ENOVERSION", + "Version number missing for %s" % acc) + else : + self.__output.addMessage(__file__, 4, "EACCNOTINDB", + "The accession number %s version %s " + "could not be found in our database (or is not a transcript)." % + (acc, version)) + self.__output.addMessage(__file__, 2, "WDIFFFOUND", + "We found these versions: %s" % + (", ".join("%s.%s" % (acc, i) for i in sorted(versions)))) + + #LOVD list of versions available + self.__output.addOutput("LOVDERR", + "Reference sequence version not found. " + "Available: %s" % + (", ".join("%s.%s" % (acc, i) for i in sorted(versions)))) + + #LOVD Only newest + #self.__output.addOutput("LOVDERR", + # "Reference sequence version not found. " + # "Available: %s.%s" % (acc, sorted(versions)[-1])) + #_get_mapping def makeCrossmap(self) : """ @@ -258,23 +218,19 @@ class Converter(object) : @return: Cross ; A Crossmap object @rtype: object """ - #TODO: ADD Error Messages - if not self.dbFields: return None - - CDS = [] - if self.dbFields["cds_start"] and self.dbFields["cds_stop"]: - CDS = [self.dbFields["cds_start"], self.dbFields["cds_stop"]] + if not self.mapping: + return None - mRNA = self.dbFields["exons"] + # Create Mutalyzer compatible exon list. + mrna = [] + for exon in zip(self.mapping.exon_starts, self.mapping.exon_stops): + mrna.extend(exon) - # Convert the strand information to orientation. - orientation = 1 - if self.dbFields["orientation"] == '-' : - orientation = -1 + cds = self.mapping.cds or [] + orientation = 1 if self.mapping.orientation == 'forward' else -1 - # Build the crossmapper. - self.crossmap = Crossmap.Crossmap(mRNA, CDS, orientation) + self.crossmap = Crossmap.Crossmap(mrna, cds, orientation) return self.crossmap #makeCrossmap @@ -389,7 +345,8 @@ class Converter(object) : acc, ver = accNo, None else : acc, ver = accNo.split('.') - self._FieldsFromDb(acc, ver) + ver = int(ver) + self._get_mapping(acc, ver) CM = self.makeCrossmap() if CM : return CM.info() @@ -433,8 +390,11 @@ class Converter(object) : variant = "%s:%s" % (accNo, mutation) if self._parseInput(variant) : acc = self.parseTree.RefSeqAcc - version = self.parseTree.Version - self._FieldsFromDb(acc, version) + try: + version = int(self.parseTree.Version) + except ValueError: + version = None + self._get_mapping(acc, version) mappings = self._coreMapping() @@ -462,7 +422,7 @@ class Converter(object) : # Now we have to do some tricks to combine the information from # possibly multiple variants into one Mapping object. - # Todo: Maybe it is better to use self.dbFields['orientation'] here. + # Todo: Maybe it is better to use self.mapping.orientation here. min_g = 9000000000 max_g = 0 forward = True @@ -516,13 +476,16 @@ class Converter(object) : """ if self._parseInput(variant): acc = self.parseTree.RefSeqAcc - version = self.parseTree.Version + try: + version = int(self.parseTree.Version) + except ValueError: + version = None if self.parseTree.Gene: selector = self.parseTree.Gene.GeneSymbol selector_version = int(self.parseTree.Gene.TransVar or 1) else: selector = selector_version = None - self._FieldsFromDb(acc, version, selector, selector_version) + self._get_mapping(acc, version, selector, selector_version) mappings = self._coreMapping() if not mappings: @@ -533,23 +496,18 @@ class Converter(object) : else: variants = [self.parseTree.RawVar] - try: - chromAcc, organelle_type = self.__database.chromAcc(self.dbFields["chromosome"]) - except TypeError: - return None - # Construct the variant descriptions descriptions = [] for variant, mapping in zip(variants, mappings): f_change = _construct_change(variant) r_change = _construct_change(variant, reverse=True) - if self.dbFields["orientation"] == "+" : + if self.mapping.orientation == 'forward': change = f_change else : change = r_change if mapping.start_g != mapping.end_g: - if self.dbFields["orientation"] == '-': + if self.mapping.orientation == 'reverse': last_g, first_g = mapping.start_g, mapping.end_g else: first_g, last_g = mapping.start_g, mapping.end_g @@ -567,10 +525,10 @@ class Converter(object) : else: description = '[' + ';'.join(descriptions) + ']' - if organelle_type == 'mitochondrion': - return "%s:m.%s" % (chromAcc, description) + if self.mapping.chromosome.organelle_type == 'mitochondrion': + return "%s:m.%s" % (self.mapping.chromosome.accession, description) else: - return "%s:g.%s" % (chromAcc, description) + return "%s:g.%s" % (self.mapping.chromosome.accession, description) #c2chrom def chromosomal_positions(self, positions, reference, version=None): @@ -591,19 +549,24 @@ class Converter(object) : This only works for positions on transcript references in c. notation. """ - if not version: - version = 0 - version = int(version) - - versions = self.__database.get_NM_version(reference) + versions = [m.version for m in TranscriptMapping.query.filter( + TranscriptMapping.accession == reference, + TranscriptMapping.version != None, + TranscriptMapping.chromosome.has(assembly=self.assembly))] if version not in versions: return None - values = self.__database.getAllFields(reference, version) - if not values: - return None - self._FieldsFromValues(values) + self.mapping = TranscriptMapping.query \ + .join(Chromosome) \ + .filter(TranscriptMapping.accession == reference, + TranscriptMapping.version == version, + Chromosome.assembly == self.assembly) \ + .order_by(TranscriptMapping.version.desc(), + Chromosome.name.asc()).first() + + if not self.mapping: + return mapper = self.makeCrossmap() if not mapper: @@ -616,7 +579,9 @@ class Converter(object) : offset = mapper.offset2int(position.OffSgn + position.Offset) chromosomal_positions.append(mapper.x2g(main, offset)) - return self.dbFields['chromosome'], self.dbFields['orientation'], chromosomal_positions + orientation = '+' if self.mapping.orientation == 'forward' else '-' + + return self.mapping.chromosome.name, orientation, chromosomal_positions #chromosomal_positions def correctChrVariant(self, variant) : @@ -639,16 +604,16 @@ class Converter(object) : if variant.startswith('chr') and ':' in variant: preco, postco = variant.split(':', 1) - try: - chrom, _ = self.__database.chromAcc(preco) - except TypeError: + + chromosome = Chromosome.query.filter_by(assembly=self.assembly, + name=preco).first() + if not chromosome: self.__output.addMessage(__file__, 4, "ENOTINDB", "The accession number %s could not be found in our database (or is not a chromosome)." % preco) return None - #if - else : - variant = "%s:%s" % (chrom, postco) + + variant = "%s:%s" % (chromosome.accession, postco) #if return variant #correctChrVariant @@ -672,8 +637,11 @@ class Converter(object) : acc = self.parseTree.RefSeqAcc version = self.parseTree.Version - chrom = self.__database.chromName("%s.%s" % (acc, version)) - if not chrom : + + chromosome = Chromosome.query \ + .filter_by(assembly=self.assembly, + accession='%s.%s' % (acc, version)).first() + if not chromosome : self.__output.addMessage(__file__, 4, "ENOTINDB", "The Accession number %s could not be found in our database (or is not a chromosome)." % acc) @@ -704,32 +672,32 @@ class Converter(object) : max_loc = max(max_loc, loc2) if gene: - transcripts = self.__database.get_TranscriptsByGeneName(gene) + mappings = chromosome.transcript_mappings.filter_by(gene=gene) else: - transcripts = self.__database.get_Transcripts(chrom, min_loc - 5000, max_loc + 5000, 1) + mappings = chromosome.transcript_mappings.filter( + TranscriptMapping.start <= max_loc + 5000, + TranscriptMapping.stop >= min_loc - 5000) HGVS_notatations = defaultdict(list) NM_list = [] - for transcript in transcripts : + for mapping in mappings: self._reset() - self._FieldsFromValues(transcript) - if self.dbFields['chromosome'] != chrom: - # Could be the case if we got transcripts by gene name - continue - mappings = self._coreMapping() - if not mappings: + self.mapping = mapping + core_mapping = self._coreMapping() + if not core_mapping: #balen continue # construct the variant description - accNo = "%s.%s" % (self.dbFields["transcript"],self.dbFields["version"]) - if self.dbFields['selector'] and self.dbFields['selector_version']: - selector = '(%s_v%.3i)' % (self.dbFields['selector'], self.dbFields['selector_version']) - elif self.dbFields['selector']: - selector = '(%s)' % self.dbFields['selector'] + accNo = "%s.%s" % (self.mapping.accession, self.mapping.version) + if self.mapping.select_transcript: + if self.mapping.transcript: + selector = '(%s_v%.3i)' % (self.mapping.gene, self.mapping.transcript) + else: + selector = '(%s)' % self.mapping.gene else: selector = '' - geneName = self.dbFields["gene"] or "" - strand = self.dbFields["orientation"] == '+' + geneName = self.mapping.gene + strand = self.mapping.orientation == 'forward' # Check if n or c type # Note: Originally, the below check using crossmap.info() was @@ -747,12 +715,12 @@ class Converter(object) : mtype = 'n' mutations = [] - for variant, mapping in zip(variants, mappings): + for variant, cmap in zip(variants, core_mapping): f_change = _construct_change(variant) r_change = _construct_change(variant, reverse=True) - startp = self.crossmap.tuple2string((mapping.startmain, mapping.startoffset)) - endp = self.crossmap.tuple2string((mapping.endmain, mapping.endoffset)) + startp = self.crossmap.tuple2string((cmap.startmain, cmap.startoffset)) + endp = self.crossmap.tuple2string((cmap.endmain, cmap.endoffset)) if strand : change = f_change @@ -760,7 +728,7 @@ class Converter(object) : change = r_change startp, endp = endp, startp - if mapping.start_g != mapping.end_g : + if cmap.start_g != cmap.end_g : loca = "%s_%s" % (startp, endp) else : loca = "%s" % startp @@ -781,415 +749,3 @@ class Converter(object) : return HGVS_notatations #chrom2c #Converter - - -class Updater(object): - """ - Abstract base class for updating the mapping information in the database. - - Subclasses should implement the {load} method, loading new mapping - information into the 'MappingTemp' table. The {merge} method merges this - table into the real 'Mapping' table. - """ - def __init__(self, build): - """ - @arg build: Human genome build (or database name), i.e. 'hg18' or - 'hg19'. - @type build: string - """ - self.build = build - self.db = Db.Mapping(build) - #__init__ - - def load(self, *args, **kwargs): - """ - The implementation of this method in subclasses should load mapping - information in the 'MappingTemp' table. - """ - raise NotImplementedError('Implement this method in subclasses') - #load - - def merge(self): - """ - Merge the 'Mapping' and 'MappingTemp' tables. The result is stored in - the 'Mapping' table, of which a backup is created as 'MappingBackup'. - - @todo: Report how much was updated/added. - """ - self.db.merge_update() - #merge -#Updater - - -class NCBIUpdater(Updater): - """ - Update the mapping information in the database with mapping information - from the NCBI. - - Example usage: - - >>> updater = NCBIUpdater('hg19') - >>> updater.load('/tmp/seq_gene.md', 'GRCh37.p2-Primary Assembly') - >>> updater.merge() - - """ - COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', - 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', - 'feature_name', 'feature_id', 'feature_type', 'group_label', - 'transcript', 'evidence_code'] - - def __init__(self, build): - """ - @arg build: Human genome build (or database name), i.e. 'hg18' or - 'hg19'. - @type build: string - """ - self.exon_backlog = {} - super(NCBIUpdater, self).__init__(build) - #__init__ - - def load(self, mapping_file, assembly): - """ - Load NCBI mapping information from {mapping_file} into the database. - - The NCBI mapping file consists of entries, one per line, in order of - their location in the genome (more specifically by start location). - Every entry has a 'group_name' column, denoting the assembly it is - from. We only use entries where this value is {assembly}. - - There are four types of entries (for our purposes): - - Gene: Name, identifier, and location of a gene. - - Transcript: Name, gene id, and location of a transcript. - - UTR: Location and transcript of a non-coding exon (or part of it). - - CDS: Location and transcript of a coding exon (or part of it). - - A bit troublesome for us is that exons are split in UTR exons and CDS - exons, with exons overlapping the UTR/CDS border defined as two - separate entries (one of type UTR and one of type CDS). - - Another minor annoyance is that some transcripts (~ 15) are split over - two contigs (NT_*). In that case, they are defined by two entries in - the file, where we should merge them by taking the start position of - the first and the stop position of the second. - - To complicate this annoyance, some genes (e.g. in the PAR) are mapped - on both the X and Y chromosomes, but stored in the file just like the - transcripts split over two contigs. However, these ones should of - course not be merged. - - Our strategy is to loop over all entries and store them in three - temporary tables (for genes, transcripts, exons). The entries of type - UTR and CDS are merged to correct exon entries by keeping a backlog - of these entries that can still be modified before storing them in the - database. - - The values from the three temporary tables are aggregated into the - 'MappingTemp' table. - - @arg mapping_file: Path to NCBI mapping information. - @type mapping_file: string - @arg assembly: Use only entries from this assembly (this is the - 'group_name' column in the NCBI mapping file). - @type assembly: string - """ - self._create_temporary_tables() - self._import_mapping(mapping_file, assembly) - self._aggregate_mapping() - self._drop_temporary_tables() - #load - - def _import_mapping(self, mapping_file, assembly): - """ - Import mapping information from {mapping_file} into three temporary - tables. - - @note: We issue a separate INSERT statement to the database for every - entry. An alternative is to write everything to tab-separated - files and load those into the database with LOAD DATA LOCAL INFILE - statements. This alternative seems to be about twice as fast, but - for now we stick with the simpler solution. - """ - self.exon_backlog = {} - - with open(mapping_file, 'r') as mapping: - for line in mapping: - if line.startswith('#'): - continue - entry = dict(zip(self.COLUMNS, line.rstrip().split('\t'))) - - # Only use entries from the given assembly. - if entry['group_label'] != assembly: - continue - - # Only use entries on the normal chromosomes. - try: - int(entry['chromosome']) - except ValueError: - if entry['chromosome'] not in 'XY': - continue - - if entry['feature_type'] == 'GENE': - self._import_gene(entry) - elif entry['feature_type'] == 'RNA': - self._import_transcript(entry) - elif entry['feature_type'] in ('UTR', 'CDS'): - self._import_exon(entry) - - self._import_exon_backlog() - #_import_mapping - - def _import_gene(self, entry): - """ - Insert a gene in the database. - """ - self.db.ncbi_import_gene(entry['feature_id'], entry['feature_name']) - #_import_gene - - def _import_transcript(self, entry): - """ - Insert a transcript in the database. - """ - self.db.ncbi_import_transcript( - entry['feature_name'], entry['feature_id'], entry['chromosome'], - int(entry['start']), int(entry['stop']), entry['orientation']) - #_import_transcript - - def _import_exon(self, entry): - """ - Instead of directly inserting each exon in the database, we keep them - in a backlog of at most one exon per transcript. Exons are taken from - the backlog and inserted in the database only when we passed their - genomic stop location by more than one position. - - This way, exons in the backlog can be merged when they are on a - UTR/CDS boundary. - """ - cds = entry['feature_type'] == 'CDS' - entry['start'] = int(entry['start']) - entry['stop'] = int(entry['stop']) - entry['protein'] = entry['feature_name'] if cds else None - entry['cds_start'] = entry['start'] if cds else None - entry['cds_stop'] = entry['stop'] if cds else None - entry['cds'] = cds - - self._import_exon_backlog(entry['start'] - 1) - - try: - previous = self.exon_backlog[entry['transcript']] - if previous['cds'] != entry['cds'] \ - and previous['stop'] == entry['start'] - 1: - if previous['cds']: - entry['cds_start'] = previous['cds_start'] - entry['cds_stop'] = previous['cds_stop'] - entry['protein'] = previous['protein'] - entry['start'] = previous['start'] - except KeyError: - pass - - self.exon_backlog[entry['transcript']] = entry - #_import_exon - - def _import_exon_backlog(self, up_to_position=None): - """ - Import exons from the backlog in the database. If the optional - argument {up_to_position} is set, only import exons with a stop - position before this value. - - We explicitely remove imported exons from the backlog, because it - might be suboptimal to keep more than 30,000 exons in there. - """ - for transcript, exon in self.exon_backlog.items(): - if not up_to_position or exon['stop'] < up_to_position: - del self.exon_backlog[transcript] - del exon['cds'] - self.db.ncbi_import_exon( - exon['transcript'], exon['chromosome'], exon['start'], exon['stop'], - exon['cds_start'], exon['cds_stop'], exon['protein'] or None) - #_import_exon_backlog - - def _aggregate_mapping(self): - """ - Aggregate the genes, transcripts and exons from their temporary - tables into the 'MappingTemp' table. - """ - self.db.ncbi_aggregate_mapping() - #_aggregate_mapping - - def _create_temporary_tables(self): - """ - Create temporary tables needed for loading the NCBI mapping data. - """ - self.db.ncbi_create_temporary_tables() - #_create_temporary_tables - - def _drop_temporary_tables(self): - """ - Drop temporary tables needed for loading the NCBI mapping data. - """ - self.db.ncbi_drop_temporary_tables() - #_drop_temporary_tables -#NCBIUpdater - - -class UCSCUpdater(Updater): - """ - Update the mapping information in the database with mapping information - from the UCSC. - - For now, we only load info from the UCSC per gene. By default, we don't - overwrite any existing transcript/version entries. This is because we - prefer the NCBI mappings but want to be able to manually load info for - specific genes that is not provided by the NCBI (yet). - - Example usage: - - >>> updater = UCSCUpdater('hg19') - >>> updater.load('SDHD') - >>> updater.merge() - - """ - def __init__(self, build): - """ - @arg build: Human genome build (or database name), i.e. 'hg18' or - 'hg19'. - @type build: string - """ - self._ucsc_connection = MySQLdb.connect( - user='genome', - host='genome-mysql.cse.ucsc.edu', - db=build) - super(UCSCUpdater, self).__init__(build) - #__init__ - - def load(self, gene, overwrite=False): - """ - Load UCSC mapping information for given gene into the database. By - default, don't load transcript/version entries we already have. - - The resulting transcript mappings are inserted into the - 'MappingTemp' table. - - @arg gene: Gene symbol to load mappings for. - @type gene: str - @arg overwrite: Include already known transcript/version entries - (default: False). - @type overwrite: bool - """ - query = """ - SELECT DISTINCT - acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, name2 AS geneName, chrom, strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc - AND name2 = %s - """ - parameters = gene - - cursor = self._ucsc_connection.cursor() - cursor.execute(query, parameters) - result = cursor.fetchall() - cursor.close() - - transcripts = [] - for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, - geneName, chrom, strand, protAcc) in result: - transcripts.append( - (geneName, acc, version, chrom, strand, - txStart + 1, txEnd, - cdsStart + 1, cdsEnd, - [int(i) + 1 for i in exonStarts.split(',') if i], - [int(i) for i in exonEnds.split(',') if i], - protAcc)) - - self.db.ucsc_load_mapping(transcripts, overwrite=overwrite) - #load -#UCSCUpdater - - -class ReferenceUpdater(Updater): - """ - Update the mapping information in the database with mapping information - from a genomic reference. - - For now, we don't handle exon locations (this is only used for mtDNA - genomic references). By default, we don't overwrite any existing - transcript/version entries. This is because we prefer the NCBI mappings - but want to be able to manually load info for specific genes that is not - provided by the NCBI (yet). - - Example usage: - - >>> updater = ReferenceUpdater('hg19') - >>> updater.load('NC_012920.1') - >>> updater.merge() - - """ - def __init__(self, build): - """ - @arg build: Human genome build (or database name), i.e. 'hg18' or - 'hg19'. - @type build: string - """ - super(ReferenceUpdater, self).__init__(build) - #__init__ - - def load(self, reference, output, overwrite=False): - """ - Load genomic reference mapping information into the database. By - default, don't load transcript/version entries we already have. - - The resulting transcript mappings are inserted into the - 'MappingTemp' table. - - @arg reference: Reference to load mappings from. - @type reference: str - @arg output: An output object. - @type output: mutalyzer.output.Output - @arg overwrite: Include already known transcript/version entries - (default: False). - @type overwrite: bool - """ - retriever = Retriever.GenBankRetriever(output) - record = retriever.loadrecord(reference) - - transcripts = [] - - for gene in record.geneList: - # We support exactly one transcript per gene. - try: - transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0] - except IndexError: - continue - - # We use gene.location for now, it is always present and the same - # for our purposes. - #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1] - start, stop = gene.location - - orientation = '-' if gene.orientation == -1 else '+' - - try: - cds_start, cds_stop = transcript.CDS.location - except AttributeError: - cds_start = cds_stop = None - - transcripts.append({'gene': gene.name, - 'transcript': record.source_accession, - 'version': int(record.source_version), - 'selector': gene.name, - 'selector_version': int(transcript.name), - 'chromosome': 'chrM', - 'orientation': orientation, - 'start': start, - 'stop': stop, - 'cds_start': cds_start, - 'cds_stop': cds_stop, - 'exon_starts': [start], - 'exon_stops': [stop], - 'protein': transcript.proteinID}) - - self.db.reference_load_mapping(transcripts, overwrite=overwrite) - #load -#ReferenceUpdater diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 32686750..6cb3277f 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -256,7 +256,6 @@ class TranscriptMappingInfo(ComplexModel): name = Mandatory.String version = Mandatory.Integer gene = Mandatory.String - protein = Mandatory.String orientation = Mandatory.String start = Mandatory.Integer diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 66482261..fd14d390 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -3,8 +3,7 @@ Mutalyzer RPC services. @todo: More thourough input checking. The @soap decorator does not do any kind of strictness checks on the input. For example, in - transcriptInfo, the build argument must really be present. (Hint: - use __checkBuild.) + transcriptInfo, the build argument must really be present. We should use the built-in validator functionality from Spyne for this. """ @@ -19,11 +18,13 @@ import os import socket import tempfile from operator import itemgetter, attrgetter +from sqlalchemy import and_, or_ import mutalyzer from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import BatchJob, BatchQueueItem +from mutalyzer.db.models import (Assembly, Chromosome, BatchJob, + BatchQueueItem, TranscriptMapping) from mutalyzer.output import Output from mutalyzer.grammar import Grammar from mutalyzer.sync import CacheSync @@ -38,91 +39,6 @@ from mutalyzer.models import * from mutalyzer import describe -def _checkBuild(L, build) : - """ - Check if the build is supported (hg18, hg19, or mm10). - - Returns: - - Nothing (but raises an EARG exception). - - @arg L: An output object for logging. - @type L: object - @arg build: The human genome build name that needs to be checked. - @type build: string - """ - - if not build in settings.DB_NAMES: - L.addMessage(__file__, 4, "EARG", "EARG %s" % build) - raise Fault("EARG", - "The build argument (%s) was not a valid " \ - "build name." % build) - #if -#_checkBuild - - -def _checkChrom(L, D, chrom) : - """ - Check if the chromosome is in our database. - - Returns: - - Nothing (but raises an EARG exception). - - @arg L: An output object for logging. - @type L: object - @arg D: A handle to the database. - @type D: object - @arg chrom: The name of the chromosome. - @type chrom: string - """ - - if not D.isChrom(chrom) : - L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) - raise Fault("EARG", "The chrom argument (%s) was not a valid " \ - "chromosome name." % chrom) - #if -#_checkChrom - - -def _checkPos(L, pos) : - """ - Check if the position is valid. - - Returns: - - Nothing (but raises an ERANGE exception). - - @arg L: An output object for logging. - @type L: object - @arg pos: The position. - @type pos: integer - """ - - if pos < 1 : - L.addMessage(__file__, 4, "ERANGE", "ERANGE %i" % pos) - raise Fault("ERANGE", "The pos argument (%i) is out of range." % pos) - #if -#_checkPos - - -def _checkVariant(L, variant) : - """ - Check if a variant is provided. - - Returns: - - Nothing (but raises an EARG exception). - - @arg L: An output object for logging. - @type L: object - @arg variant: The variant. - @type variant: string - """ - - if not variant : - L.addMessage(__file__, 4, "EARG", "EARG no variant") - raise Fault("EARG", "The variant argument is not provided.") - #if -#_checkVariant - - class MutalyzerService(ServiceBase): """ Mutalyzer web services. @@ -276,28 +192,34 @@ class MutalyzerService(ServiceBase): "Received request getTranscripts(%s %s %s %s)" % (build, chrom, pos, versions)) - _checkBuild(L, build) - D = Db.Mapping(build) - - _checkChrom(L, D, chrom) - _checkPos(L, pos) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + chromosome = Chromosome.query.filter_by(assembly=assembly, + name=chrom).first() + if not chromosome: + L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) + raise Fault("EARG", "The chrom argument (%s) was not a valid " \ + "chromosome name." % chrom) + + mappings = chromosome.transcript_mappings.filter( + TranscriptMapping.start <= pos, + TranscriptMapping.stop >= pos) - ret = D.get_Transcripts(chrom, pos, pos, True) + L.addMessage(__file__, -1, "INFO", + "Finished processing getTranscripts(%s %s %s %s)" + % (build, chrom, pos, versions)) #filter out the accNo if versions: - ret = [r[0] + '.' + str(r[-1]) for r in ret] + return ['%s.%s' % (m.accession, m.version) for m in mappings] else: - ret = [r[0] for r in ret] - - L.addMessage(__file__, -1, "INFO", - "Finished processing getTranscripts(%s %s %s %s)" % (build, chrom, - pos, versions)) - - L.addMessage(__file__, -1, "INFO", "We return %s" % ret) - - del D, L - return ret + return [m.accession for m in mappings] #getTranscripts @srpc(Mandatory.String, Mandatory.String, _returns=Array(Mandatory.String)) @@ -310,21 +232,22 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request getTranscriptsByGene(%s %s)" % (build, name)) - _checkBuild(L, build) - D = Db.Mapping(build) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) - ret = D.get_TranscriptsByGeneName(name) + mappings = TranscriptMapping.query \ + .filter(TranscriptMapping.chromosome.has(assembly=assembly), + TranscriptMapping.gene == name) L.addMessage(__file__, -1, "INFO", "Finished processing getTranscriptsByGene(%s %s)" % (build, name)) - if ret : - l = [] - for i in ret : - l.append(i[0] + '.' + str(i[13])) - return l - - return [] + return ['%s.%s' % (m.accession, m.version) for m in mappings] #getTranscriptsByGene @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, @@ -355,20 +278,35 @@ class MutalyzerService(ServiceBase): "Received request getTranscriptsRange(%s %s %s %s %s)" % (build, chrom, pos1, pos2, method)) - D = Db.Mapping(build) - _checkBuild(L, build) - - ret = D.get_Transcripts(chrom, pos1, pos2, method) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + chromosome = Chromosome.query.filter_by(assembly=assembly, + name=chrom).first() + if not chromosome: + L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) + raise Fault("EARG", "The chrom argument (%s) was not a valid " \ + "chromosome name." % chrom) + + if method: + range_filter = (TranscriptMapping.start <= pos2, + TranscriptMapping.stop >= pos1) + else: + range_filter = (TranscriptMapping.start >= pos1, + TranscriptMapping.stop <= pos2) - #filter out the accNo - ret = [r[0] for r in ret] + mappings = chromosome.transcript_mappings.filter(*range_filter) L.addMessage(__file__, -1, "INFO", "Finished processing getTranscriptsRange(%s %s %s %s %s)" % ( build, chrom, pos1, pos2, method)) - del D, L - return ret + return [m.accession for m in mappings] #getTranscriptsRange @srpc(Mandatory.String, Mandatory.String, Mandatory.Integer, @@ -396,7 +334,6 @@ class MutalyzerService(ServiceBase): - name - version - gene - - protein - orientation - start - stop @@ -405,36 +342,53 @@ class MutalyzerService(ServiceBase): """ output = Output(__file__) output.addMessage(__file__, -1, 'INFO', 'Received request ' \ - 'getTranscriptsRange(%s %s %s %s %s)' % (build, chrom, pos1, pos2, + 'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2, method)) - _checkBuild(output, build) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + chromosome = Chromosome.query.filter_by(assembly=assembly, + name=chrom).first() + if not chromosome: + L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) + raise Fault("EARG", "The chrom argument (%s) was not a valid " \ + "chromosome name." % chrom) + + if method: + range_filter = (TranscriptMapping.start <= pos2, + TranscriptMapping.stop >= pos1) + else: + range_filter = (TranscriptMapping.start >= pos1, + TranscriptMapping.stop <= pos2) + + mappings = chromosome.transcript_mappings.filter(*range_filter) - database = Db.Mapping(build) transcripts = [] - for transcript in database.get_Transcripts(chrom, pos1, pos2, method): + for mapping in mappings: t = TranscriptMappingInfo() - d = dict(zip(('transcript', 'selector', 'selector_version', - 'start', 'stop', 'cds_start', 'cds_stop', 'exon_starts', - 'exon_stops', 'gene', 'chromosome', 'orientation', 'protein', - 'version'), transcript)) - if d['orientation'] == '-': - d['start'], d['stop'] = d['stop'], d['start'] - d['cds_start'], d['cds_stop'] = d['cds_stop'], d['cds_start'] - t.name = d['transcript'] - t.version = d['version'] - t.gene = d['gene'] - t.protein = d['protein'] - t.orientation = d['orientation'] - t.start = d['start'] - t.stop = d['stop'] - t.cds_start = d['cds_start'] - t.cds_stop = d['cds_stop'] + t.name = mapping.accession + t.version = mapping.version + t.gene = mapping.gene + t.orientation = '-' if mapping.orientation == 'reverse' else '+' + if mapping.orientation == 'reverse': + t.start, t.stop = mapping.stop, mapping.start + else: + t.start, t.stop = mapping.start, mapping.stop + if mapping.orientation == 'reverse': + t.cds_start, t.cds_stop = mapping.cds_stop, mapping.cds_start + else: + t.cds_start, t.cds_stop = mapping.cds_start, mapping.cds_stop transcripts.append(t) output.addMessage(__file__, -1, 'INFO', 'Finished processing ' \ - 'getTranscriptsRange(%s %s %s %s %s)' % (build, chrom, pos1, pos2, + 'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2, method)) return transcripts @@ -458,19 +412,25 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request getGeneName(%s %s)" % (build, accno)) - D = Db.Mapping(build) - _checkBuild(L, build) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) - ret = D.get_GeneName(accno.split('.')[0]) + mapping = TranscriptMapping.query \ + .filter(TranscriptMapping.chromosome.has(assembly=assembly), + TranscriptMapping.accession == accno.split('.')[0]) \ + .first() L.addMessage(__file__, -1, "INFO", "Finished processing getGeneName(%s %s)" % (build, accno)) - del D, L - return ret + return mapping.gene #getGeneName - @srpc(Mandatory.String, Mandatory.String, Mandatory.String, Mandatory.String, _returns=Mapping) def mappingInfo(LOVD_ver, build, accNo, variant) : @@ -520,7 +480,15 @@ class MutalyzerService(ServiceBase): "Reveived request mappingInfo(%s %s %s %s)" % (LOVD_ver, build, accNo, variant)) - conv = Converter(build, L) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + conv = Converter(assembly, L) result = conv.mainMapping(accNo, variant) L.addMessage(__file__, -1, "INFO", @@ -558,7 +526,15 @@ class MutalyzerService(ServiceBase): "Received request transcriptInfo(%s %s %s)" % (LOVD_ver, build, accNo)) - converter = Converter(build, O) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + O.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + converter = Converter(assembly, O) T = converter.mainTranscript(accNo) O.addMessage(__file__, -1, "INFO", @@ -580,22 +556,29 @@ class MutalyzerService(ServiceBase): @return: The accession number of a chromosome. @rtype: string """ - D = Db.Mapping(build) L = Output(__file__) - L.addMessage(__file__, -1, "INFO", "Received request chromAccession(%s %s)" % (build, name)) - _checkBuild(L, build) - _checkChrom(L, D, name) - - result = D.chromAcc(name) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + chromosome = Chromosome.query.filter_by(assembly=assembly, + name=name).first() + if not chromosome: + L.addMessage(__file__, 4, "EARG", "EARG %s" % name) + raise Fault("EARG", "The name argument (%s) was not a valid " \ + "chromosome name." % name) L.addMessage(__file__, -1, "INFO", "Finished processing chromAccession(%s %s)" % (build, name)) - del D,L - return result[0] + return chromosome.accession #chromAccession @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) @@ -611,22 +594,29 @@ class MutalyzerService(ServiceBase): @return: The name of a chromosome. @rtype: string """ - D = Db.Mapping(build) L = Output(__file__) - L.addMessage(__file__, -1, "INFO", "Received request chromName(%s %s)" % (build, accNo)) - _checkBuild(L, build) -# self._checkChrom(L, D, name) - - result = D.chromName(accNo) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + chromosome = Chromosome.query.filter_by(assembly=assembly, + accession=accNo).first() + if not chromosome: + L.addMessage(__file__, 4, "EARG", "EARG %s" % accNo) + raise Fault("EARG", "The accNo argument (%s) was not a valid " \ + "chromosome accession." % accNo) L.addMessage(__file__, -1, "INFO", "Finished processing chromName(%s %s)" % (build, accNo)) - del D,L - return result + return chromosome.name #chromosomeName @srpc(Mandatory.String, Mandatory.String, _returns=Mandatory.String) @@ -642,26 +632,32 @@ class MutalyzerService(ServiceBase): @return: The name of a chromosome. @rtype: string """ - D = Db.Mapping(build) L = Output(__file__) L.addMessage(__file__, -1, "INFO", "Received request getchromName(%s %s)" % (build, acc)) - _checkBuild(L, build) -# self._checkChrom(L, D, name) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) - result = D.get_chromName(acc) + mapping = TranscriptMapping.query \ + .filter(TranscriptMapping.chromosome.has(assembly=assembly), + TranscriptMapping.accession == acc) \ + .first() L.addMessage(__file__, -1, "INFO", "Finished processing getchromName(%s %s)" % (build, acc)) - del D,L - return result + return mapping.chromosome.name #chromosomeName @srpc(Mandatory.String, Mandatory.String, String, - _returns=Array(Mandatory.String)) + _returns=Array(Mandatory.String)) def numberConversion(build, variant, gene=None): """ Converts I{c.} to I{g.} notation or vice versa @@ -678,7 +674,6 @@ class MutalyzerService(ServiceBase): @return: The variant(s) in either I{g.} or I{c.} notation. @rtype: list """ - D = Db.Mapping(build) O = Output(__file__) O.addMessage(__file__, -1, "INFO", "Received request cTogConversion(%s %s)" % (build, variant)) @@ -686,7 +681,15 @@ class MutalyzerService(ServiceBase): counter = Db.Counter() counter.increment('positionconvert', 'webservice') - converter = Converter(build, O) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + O.addMessage(__file__, 4, "EARG", "EARG %s" % build) + raise Fault("EARG", + "The build argument (%s) was not a valid " \ + "build name." % build) + + converter = Converter(assembly, O) variant = converter.correctChrVariant(variant) if "c." in variant or "n." in variant: @@ -722,9 +725,11 @@ class MutalyzerService(ServiceBase): counter = Db.Counter() counter.increment('checksyntax', 'webservice') - result = CheckSyntaxOutput() + if not variant : + output.addMessage(__file__, 4, "EARG", "EARG no variant") + raise Fault("EARG", "The variant argument is not provided.") - _checkVariant(output, variant) + result = CheckSyntaxOutput() grammar = Grammar(output) parsetree = grammar.parse(variant) diff --git a/mutalyzer/templates/batch_jobs.html b/mutalyzer/templates/batch_jobs.html index 1f6fd9cb..848b1d95 100644 --- a/mutalyzer/templates/batch_jobs.html +++ b/mutalyzer/templates/batch_jobs.html @@ -84,10 +84,8 @@ <td><b>Build</b></td> <td> <select name="arg1"> - <option selected="selected" value="{{ selected_build|e - }}">{{ selected_build|e }}</option> - {% for i in unselected_builds %} - <option value="{{ i|e }}">{{ i|e }}</option> + {% for assembly in assemblies %} +<option value="{{ assembly.name|e }}"{% if default_assembly in (assembly.name, assembly.alias) %} selected="selected"{% endif %}>{{ assembly.taxonomy_common_name|e }} — {{ assembly.name|e }}{% if assembly.alias %} ({{ assembly.alias|e }}){% endif %}</option> {% endfor %} </select> </td> diff --git a/mutalyzer/templates/position_converter.html b/mutalyzer/templates/position_converter.html index d8485012..1a1f45c1 100644 --- a/mutalyzer/templates/position_converter.html +++ b/mutalyzer/templates/position_converter.html @@ -7,9 +7,8 @@ <div style="border: 1px solid grey; background-color: aliceblue; padding: 20px;"> <p> - Please supply the build which you want to use to convert your - position, available builds at the moment are: hg18 (NCBI36), hg19 - (GRCh37) and mm10 (GRCm38). + Please supply the genome assembly which you want to use to convert your + position. </p> <p> <b>Note:</b> The Position Converter does NOT check the description or @@ -23,13 +22,11 @@ <table id="inputform"> <form action="" method="post" enctype="multipart/form-data"> <tr> - <td><b>Build</b></td> - <div>{{ build|e }}</div> + <td><b>Assembly</b></td> <td> - <select name="build"> - <option selected="selected" value="{{ selected_build|e }}">{{ selected_build|e }}</option> - {% for i in unselected_builds %} - <option value="{{ i|e }}">{{ i|e }}</option> + <select name="assembly_name_or_alias"> + {% for assembly in assemblies %} +<option value="{{ assembly.name|e }}"{% if assembly_name_or_alias in (assembly.name, assembly.alias) %} selected="selected"{% endif %}>{{ assembly.taxonomy_common_name|e }} — {{ assembly.name|e }}{% if assembly.alias %} ({{ assembly.alias|e }}){% endif %}</option> {% endfor %} </select> </td> @@ -50,7 +47,7 @@ </table> <!-- inputform --> </div> -{% if posted %} +{% if variant %} <h3>Results:</h3> <div class="messages"> diff --git a/mutalyzer/templates/reference_loader.html b/mutalyzer/templates/reference_loader.html index ab32e135..ddf05ab2 100644 --- a/mutalyzer/templates/reference_loader.html +++ b/mutalyzer/templates/reference_loader.html @@ -115,10 +115,9 @@ sequence (maximum size is {{ maxSize|e }} megabytes): <td>Assembly</td> <td> <select name="chrnameassembly"> - <option selected="selected" value="{{ selected_assembly|e }}">{{ selected_assembly|e }}</option> - {% for i in unselected_assemblies %} - <option value="{{ i|e }}">{{ i|e }}</option> - {% endfor %} + {% for assembly in assemblies %} +<option value="{{ assembly.name|e }}"{% if assembly_name_or_alias in (assembly.name, assembly.alias) %} selected="selected"{% endif %}>{{ assembly.taxonomy_common_name|e }} — {{ assembly.name|e }}{% if assembly.alias %} ({{assembly.alias|e }}){% endif %}</option> + {% endfor %} </select> </td> </tr> diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 8623c824..f565d809 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -22,6 +22,7 @@ from Bio.Alphabet import DNAAlphabet from Bio.Alphabet import ProteinAlphabet from mutalyzer import util +from mutalyzer.db.models import Assembly from mutalyzer.grammar import Grammar from mutalyzer.mutator import Mutator from mutalyzer.mapping import Converter @@ -1694,15 +1695,17 @@ def check_variant(description, output): for descr, first, last in raw_variants for pos in (first, last)] # Todo: This is hard-coded to hg19... - converter = Converter('hg19', output) - chromosomal_positions = converter.chromosomal_positions( - locations, parsed_description.RefSeqAcc, parsed_description.Version) - if chromosomal_positions: - output.addOutput('rawVariantsChromosomal', - (chromosomal_positions[0], chromosomal_positions[1], - zip([descr for descr, first, last in raw_variants], - util.grouper(chromosomal_positions[2])))) - # Example value: ('chr12', [('29+4T>C', (2323, 2323)), ('230_233del', (5342, 5345))]) + assembly = Assembly.query.filter_by(alias='hg19').first() + if assembly: + converter = Converter(assembly, output) + chromosomal_positions = converter.chromosomal_positions( + locations, parsed_description.RefSeqAcc, parsed_description.Version or None) + if chromosomal_positions: + output.addOutput('rawVariantsChromosomal', + (chromosomal_positions[0], chromosomal_positions[1], + zip([descr for descr, first, last in raw_variants], + util.grouper(chromosomal_positions[2])))) + # Example value: ('chr12', [('29+4T>C', (2323, 2323)), ('230_233del', (5342, 5345))]) # Protein. for gene in record.record.geneList: diff --git a/mutalyzer/website.py b/mutalyzer/website.py index 5ea2dd41..13825cd2 100644 --- a/mutalyzer/website.py +++ b/mutalyzer/website.py @@ -30,12 +30,13 @@ from lxml import etree import pkg_resources from cStringIO import StringIO from spyne.server.http import HttpBase +from sqlalchemy import and_, or_ import mutalyzer from mutalyzer import util from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import BatchJob, BatchQueueItem +from mutalyzer.db.models import Assembly, BatchJob, BatchQueueItem from mutalyzer.grammar import Grammar from mutalyzer.services import soap from mutalyzer import variantchecker @@ -464,62 +465,60 @@ class PositionConverter: def POST(self): """ Convert a variant and render position converter HTML form. - - Parameters: - - build: Human genome build (currently 'hg18' or 'hg19'). - - variant: Variant to convert. """ - i = web.input(build='', variant='') + i = web.input(assembly_name_or_alias=None, variant='') # Todo: The following is probably a problem elsewhere too. # We stringify the variant, because a unicode string crashes # Bio.Seq.reverse_complement in mapping.py:607. - return self.position_converter(i.build, str(i.variant)) + return self.position_converter(i.assembly_name_or_alias, + str(i.variant)) #POST - def position_converter(self, build='', variant=''): + def position_converter(self, assembly_name_or_alias=None, variant=None): """ Convert a variant and render position converter HTML form. - @kwarg build: Human genome build (currently 'hg18' or 'hg19'). - @kwarg variant: Variant to convert. + :arg assembly_name_or_alias: Genome assembly. + :type assembly_name_or_alias: string + :arg variant: Variant to convert. + :type variant: string """ output = Output(__file__) IP = web.ctx["ip"] - avail_builds = settings.DB_NAMES + assemblies = Assembly.query \ + .order_by(Assembly.taxonomy_common_name.asc(), + Assembly.name.asc()) \ + .all() - # We have to put up with this crap just to get a certain <option> - # selected in our TAL template. - # Todo: Now we switched to Jinja2, we can make this sane. - if build in avail_builds: - selected_build = build - else: - selected_build = settings.DEFAULT_DB - unselected_builds = sorted(b for b in avail_builds - if b != selected_build) + assembly_name_or_alias = (assembly_name_or_alias or + settings.DEFAULT_ASSEMBLY) attr = { - "avail_builds" : avail_builds, - "unselected_builds": unselected_builds, - "selected_build" : selected_build, - "variant" : variant, - "gName" : "", - "cNames" : [], - "messages" : [], - "posted" : build and variant + "assemblies" : assemblies, + "assembly_name_or_alias" : assembly_name_or_alias, + "variant" : variant or '', + "gName" : "", + "cNames" : [], + "messages" : [], } - if build and variant: - + if variant: output.addMessage(__file__, -1, 'INFO', 'Received request positionConverter(%s, %s) from %s' % ( - build, variant, IP)) + assembly_name_or_alias, variant, IP)) counter = Db.Counter() counter.increment('positionconvert', 'website') - # Todo: check for correct build. - converter = Converter(build, output) + assembly = Assembly.query.filter( + or_(Assembly.name == assembly_name_or_alias, + Assembly.alias == assembly_name_or_alias)).first() + if not assembly: + output.addMessage(__file__, 3, "ENOASSEMBLY", + "Not a valid assembly.") + + converter = Converter(assembly, output) #Convert chr accNo to NC number variant = converter.correctChrVariant(variant) @@ -549,8 +548,8 @@ class PositionConverter: attr['messages'] = map(util.message_info, output.getMessages()) output.addMessage(__file__, -1, 'INFO', - 'Finished request positionConverter(%s, %s)' % (build, - variant)) + 'Finished request positionConverter(%s, %s)' + % (assembly_name_or_alias, variant)) return render.position_converter(attr) #position_converter @@ -616,7 +615,12 @@ class VariantInfo: 'Received request variantInfo(%s:%s (LOVD_ver %s, build %s))' ' from %s' % (acc, var, LOVD_ver, build, IP)) - converter = Converter(build, output) + assembly = Assembly.query.filter(or_(Assembly.name == build, + Assembly.alias == build)).first() + if not assembly: + return 'invalid build' + + converter = Converter(assembly, output) result = '' @@ -1062,31 +1066,23 @@ class BatchChecker: maxUploadSize = settings.MAX_FILE_SIZE - avail_builds = settings.DB_NAMES - - # We have to put up with this crap just to get a certain <option> - # selected in our TAL template. - # Todo: Now we switched to Jinja2, we can make this sane. - if arg1 in avail_builds: - selected_build = arg1 - else: - selected_build = settings.DEFAULT_DB - unselected_builds = sorted(b for b in avail_builds - if b != selected_build) + assemblies = Assembly.query \ + .order_by(Assembly.taxonomy_common_name.asc(), + Assembly.name.asc()) \ + .all() attr = { - "messages" : [], - "errors" : [], - "debug" : [], - "maxSize" : float(maxUploadSize) / 1048576, - "hideTypes" : batchType and 'none' or '', - "selected" : "0", - "batchType" : batchType or "", - "avail_builds" : avail_builds, - "unselected_builds": unselected_builds, - "selected_build": selected_build, - "jobID" : None, - "totalJobs" : None + "messages" : [], + "errors" : [], + "debug" : [], + "maxSize" : float(maxUploadSize) / 1048576, + "hideTypes" : batchType and 'none' or '', + "selected" : "0", + "batchType" : batchType or "", + "assemblies" : assemblies, + "default_assembly": settings.DEFAULT_ASSEMBLY, + "jobID" : None, + "totalJobs" : None } batch_types = ['NameChecker', 'SyntaxChecker', 'PositionConverter', @@ -1125,25 +1121,32 @@ class BatchChecker: return 'Sorry, only files up to %s megabytes are accepted.' % ( float(maxUploadSize) / 1048576) - S = Scheduler.Scheduler() - FileInstance = File.File(O) - - # Generate the fromhost URL from which the results can be fetched - fromHost = web.ctx.homedomain + web.ctx.homepath + '/' - #fromHost = "http://%s%s" % ( - # req.hostname, req.uri.rsplit("/", 1)[0]+"/") - - job, columns = FileInstance.parseBatchFile(inFile.file) - if job is None: - O.addMessage(__file__, 4, "PRSERR", "Could not parse input" - " file, please check your file format.") + if batchType == 'PositionConverter': + if not Assembly.query.filter( + or_(Assembly.name == arg1, + Assembly.alias == arg1)).first(): + output.addMessage(__file__, 4, "ENOASSEMBLY", + "Not a valid assembly.") else: - attr["resultID"] = S.addJob(email, job, columns, fromHost, - batchType, arg1) - attr["totalJobs"] = len(job) or 1 - attr["messages"].append("Your file has been parsed and the job" - " is scheduled, you will receive an email when the job is" - " finished.") + S = Scheduler.Scheduler() + FileInstance = File.File(O) + + # Generate the fromhost URL from which the results can be fetched + fromHost = web.ctx.homedomain + web.ctx.homepath + '/' + #fromHost = "http://%s%s" % ( + # req.hostname, req.uri.rsplit("/", 1)[0]+"/") + + job, columns = FileInstance.parseBatchFile(inFile.file) + if job is None: + O.addMessage(__file__, 4, "PRSERR", "Could not parse input" + " file, please check your file format.") + else: + attr["resultID"] = S.addJob(email, job, columns, fromHost, + batchType, arg1) + attr["totalJobs"] = len(job) or 1 + attr["messages"].append("Your file has been parsed and the job" + " is scheduled, you will receive an email when the job is" + " finished.") attr["errors"].extend(map(util.message_info, O.getMessages())) @@ -1226,23 +1229,21 @@ class Uploader: Render reference sequence uploader form. """ maxUploadSize = settings.MAX_FILE_SIZE - available_assemblies = settings.DB_NAMES - # We have to put up with this crap just to get a certain <option> - # selected in our TAL template. - # Todo: Now we switched to Jinja2, we can make this sane. - selected_assembly = settings.DEFAULT_DB - unselected_assemblies = sorted(b for b in available_assemblies - if b != selected_assembly) + assemblies = Assembly.query \ + .order_by(Assembly.taxonomy_common_name.asc(), + Assembly.name.asc()) \ + .all() + + assembly_name_or_alias = settings.DEFAULT_ASSEMBLY UD, errors = "", [] args = { - 'UD' : UD, - 'available_assemblies' : available_assemblies, - 'unselected_assemblies': unselected_assemblies, - 'selected_assembly' : selected_assembly, - 'maxSize' : float(maxUploadSize) / 1048576, - 'errors' : errors + 'UD' : UD, + 'assemblies' : assemblies, + 'assembly_name_or_alias' : assembly_name_or_alias, + 'maxSize' : float(maxUploadSize) / 1048576, + 'errors' : errors } return render.reference_loader(args) #GET @@ -1292,7 +1293,10 @@ class Uploader: """ maxUploadSize = settings.MAX_FILE_SIZE - available_assemblies = settings.DB_NAMES + assemblies = Assembly.query \ + .order_by(Assembly.taxonomy_common_name.asc(), + Assembly.name.asc()) \ + .all() O = Output(__file__) IP = web.ctx["ip"] @@ -1306,15 +1310,7 @@ class Uploader: chrnameassembly='', chrname='', chrnamestart='', chrnamestop='', chrnameorientation='') - # We have to put up with this crap just to get a certain <option> - # selected in our TAL template. - # Todo: Now we switched to Jinja2, we can make this sane. - if i.chrnameassembly in available_assemblies: - selected_assembly = i.chrnameassembly - else: - selected_assembly = settings.DEFAULT_DB - unselected_assemblies = sorted(b for b in available_assemblies - if b != selected_assembly) + assembly_name_or_alias = i.chrnameassembly or settings.DEFAULT_ASSEMBLY O.addMessage(__file__, -1, 'INFO', 'Received request' @@ -1363,26 +1359,27 @@ class Uploader: orientation = int(i.orientation) UD = R.retrieveslice(accNo, start, stop, orientation) elif i.invoermethode == "chrname": - build = i.chrnameassembly name = i.chrname start = _checkInt(i.chrnamestart, "Start position") stop = _checkInt(i.chrnamestop, "Stop position") orientation = int(i.chrnameorientation) - if build not in available_assemblies: - raise InputException('Assembly not available: %s' % build) + assembly = Assembly.query.filter( + or_(Assembly.name == assembly_name_or_alias, + Assembly.alias == assembly_name_or_alias)).first() + if not assembly: + raise InputException('Invalid assembly') if not name.startswith('chr'): name = 'chr%s' % name - database = Db.Mapping(build) - try: - accession, _ = database.chromAcc(name) - except TypeError: - raise InputException('Chromosome not available for build %s: %s' % - (build, name)) + chromosome = Chromosome.query.filter_by(assembly=assembly, + name=name).first() + if not chromosome: + raise InputException('Chromosome not available for assembly %s: %s' % + (assembly.name, name)) - UD = R.retrieveslice(accession, start, stop, orientation) + UD = R.retrieveslice(chromosome.accession, start, stop, orientation) else: #unknown "invoermethode" raise InputException("Wrong method selected") @@ -1397,12 +1394,11 @@ class Uploader: errors.extend(map(lambda m: str(m), O.getMessages())) args = { - "UD" : UD, - 'available_assemblies' : available_assemblies, - 'unselected_assemblies': unselected_assemblies, - 'selected_assembly' : selected_assembly, - "maxSize" : float(maxUploadSize) / 1048576, - "errors" : errors + "UD" : UD, + 'assemblies' : assemblies, + 'assembly_name_or_alias' : assembly_name_or_alias, + "maxSize" : float(maxUploadSize) / 1048576, + "errors" : errors } O.addMessage(__file__, -1, 'INFO', diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 91b9ff8b..4287d2f4 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -6,6 +6,7 @@ Tests for the mapping module. #import logging; logging.basicConfig() from nose.tools import * +from mutalyzer.db.models import Assembly from mutalyzer.output import Output from mutalyzer.mapping import Converter @@ -30,7 +31,8 @@ class TestConverter(): """ Create a Converter instance for a given build. """ - return Converter(build, self.output) + assembly = Assembly.query.first() + return Converter(assembly, self.output) def test_converter(self): """ diff --git a/tests/test_website.py b/tests/test_website.py index 1d3816fb..797ef63d 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -347,7 +347,7 @@ class TestWSGI(): """ r = self.app.get('/positionConverter') form = r.forms[0] - form['build'] = 'hg19' + form['assembly_name_or_alias'] = 'hg19' form['variant'] = 'NM_003002.2:c.204C>T' r = form.submit() r.mustcontain('NC_000011.9:g.111959625C>T') @@ -358,7 +358,7 @@ class TestWSGI(): """ r = self.app.get('/positionConverter') form = r.forms[0] - form['build'] = 'hg19' + form['assembly_name_or_alias'] = 'hg19' form['variant'] = 'NC_000011.9:g.111959625C>T' r = form.submit() r.mustcontain('NM_003002.2:c.204C>T') diff --git a/tests/utils.py b/tests/utils.py index aefac256..7e84ce19 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,7 +17,7 @@ def create_test_environment(database=False): os.close(log_handle) settings.configure(dict( - DEBUG = True, + DEBUG = False, TESTING = True, CACHE_DIR = tempfile.mkdtemp(), DATABASE_URI = 'sqlite://', -- GitLab