diff --git a/migrations/versions/1bf1b52f057_transcript_mappings_binning.py b/migrations/versions/1bf1b52f057_transcript_mappings_binning.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd9c78e8c8c5a78163e36f9dfff0f047a6be124 --- /dev/null +++ b/migrations/versions/1bf1b52f057_transcript_mappings_binning.py @@ -0,0 +1,71 @@ +"""Transcript mappings binning + +Revision ID: 1bf1b52f057 +Revises: 225a8b4c3902 +Create Date: 2015-10-29 10:02:15.286139 + +""" + +from __future__ import unicode_literals + +# revision identifiers, used by Alembic. +revision = '1bf1b52f057' +down_revision = u'225a8b4c3902' + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import sql +import binning + + +def upgrade(): + # We want to add a NOT NULL column without default value. So we first add + # the column without the constraint, then populate it, then add the + # constraint. + # Unfortunately, SQLite doesn't support adding the constraint on an + # existing column. We use batch_alter_table to workaround this. Of course + # this makes the entire migration horribly awkward on SQLite, but I can't + # really be bothered to improve it. This works. + # Also, the downgrade will fail on SQLite, but we don't support downgrades + # anyway, so I'm not fixing it. + connection = op.get_bind() + + op.add_column('transcript_mappings', sa.Column('bin', sa.Integer(), nullable=True)) + + transcript_mappings = sql.table('transcript_mappings', + sql.column('id', sa.Integer()), + sql.column('start', sa.Integer()), + sql.column('stop', sa.Integer()), + sql.column('bin', sa.Integer())) + + result = connection.execute( + transcript_mappings.select().with_only_columns([ + transcript_mappings.c.id, + transcript_mappings.c.start, + transcript_mappings.c.stop])) + + while True: + chunk = result.fetchmany(1000) + if not chunk: + break + + statement = transcript_mappings.update().where( + transcript_mappings.c.id == sql.bindparam('m_id') + ).values({'bin': sql.bindparam('m_bin')}) + + connection.execute(statement, [ + {'m_id': m.id, 'm_bin': binning.assign_bin(m.start - 1, m.stop)} + for m in chunk]) + + # See note above. + with op.batch_alter_table('transcript_mappings') as batch_op: + batch_op.alter_column('bin', nullable=False, existing_type=sa.Integer()) + + op.create_index(op.f('ix_transcript_mappings_bin'), 'transcript_mappings', ['bin'], unique=False) + + +def downgrade(): + ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_transcript_mappings_bin'), table_name='transcript_mappings') + op.drop_column('transcript_mappings', 'bin') + ### end Alembic commands ### diff --git a/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py b/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py index 607953558f4c5293a0a5118ebf78908956be8053..36db81f4cc7f7a17d89bf51448ae150e63c92301 100644 --- a/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py +++ b/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py @@ -31,6 +31,9 @@ import sqlalchemy as sa # and create statements for the indices. # # http://alembic.readthedocs.org/en/latest/batch.html +# +# Update: This has since been fixed with Alembic release 0.8.3 +# http://alembic.readthedocs.org/en/latest/changelog.html#change-9ef6be6709e71f3800b6e451ae75c5f8 def upgrade(): diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 62877968e7fbdd1b5fb2d457f590efc44d01fe10..17d34186fd95e9981a56d5012f610f37e073c9e9 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -9,6 +9,7 @@ from datetime import datetime import sqlite3 import uuid +import binning from sqlalchemy import event, or_ from sqlalchemy import (Boolean, Column, DateTime, Enum, ForeignKey, Index, Integer, String, Text, TypeDecorator) @@ -393,6 +394,11 @@ class TranscriptMapping(db.Base): #: inclusive, in chromosomal orientation). stop = Column(Integer, nullable=False) + #: Bin index that can be used for faster range-based queries. See the + #: `interval binning documentation <http://interval-binning.readthedocs.org/>`_ + #: for more information. + bin = Column(Integer, nullable=False, index=True) + #: The CDS start position of the transcript on the chromosome (one-based, #: inclusive, in chromosomal orientation). cds_start = Column(Integer) @@ -446,6 +452,7 @@ class TranscriptMapping(db.Base): self.cds = cds self.select_transcript = select_transcript self.version = version + self.bin = binning.assign_bin(self.start - 1, self.stop) def __repr__(self): return ('<TranscriptMapping accession=%r version=%r gene=%r ' @@ -482,6 +489,7 @@ class TranscriptMapping(db.Base): instance.orientation = orientation instance.start = start instance.stop = stop + instance.bin = binning.assign_bin(start - 1, stop) instance.exon_starts = exon_starts instance.exon_stops = exon_stops instance.source = source diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index df2fb1dfc630bd02e6a132e94aeb29e6d7871531..b9a55afefdec70ecddbb1dd98eede58420ec95f1 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -16,6 +16,7 @@ from collections import defaultdict from itertools import groupby from operator import attrgetter, itemgetter +import binning import MySQLdb from mutalyzer.db import session @@ -707,9 +708,20 @@ class Converter(object) : if gene: mappings = chromosome.transcript_mappings.filter_by(gene=gene) else: + start = max(min_loc - 5000, 1) + stop = min(max_loc + 5000, binning.MAX_POSITION + 1) + bins = binning.overlapping_bins(start - 1, stop) mappings = chromosome.transcript_mappings.filter( - TranscriptMapping.start <= max_loc + 5000, - TranscriptMapping.stop >= min_loc - 5000) + TranscriptMapping.bin.in_(bins), + TranscriptMapping.start <= stop, + TranscriptMapping.stop >= start + ).order_by( + TranscriptMapping.start, + TranscriptMapping.stop, + TranscriptMapping.gene, + TranscriptMapping.accession, + TranscriptMapping.version, + TranscriptMapping.transcript) HGVS_notatations = defaultdict(list) NM_list = [] diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 5326f2ff499cfa9a0c2303ed86e7d04642086648..a256831f1de687878e26fa1ecad06f23e256f94c 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -11,6 +11,7 @@ Mutalyzer RPC services. from __future__ import unicode_literals +import binning from spyne.decorator import srpc from spyne.service import ServiceBase from spyne.model.primitive import Integer, Boolean, DateTime, Unicode @@ -227,9 +228,19 @@ class MutalyzerService(ServiceBase): raise Fault("EARG", "The chrom argument (%s) was not a valid " \ "chromosome name." % chrom) + pos = max(min(pos, binning.MAX_POSITION + 1), 1) + bins = binning.overlapping_bins(pos - 1, pos) mappings = chromosome.transcript_mappings.filter( + TranscriptMapping.bin.in_(bins), TranscriptMapping.start <= pos, - TranscriptMapping.stop >= pos) + TranscriptMapping.stop >= pos + ).order_by( + TranscriptMapping.start, + TranscriptMapping.stop, + TranscriptMapping.gene, + TranscriptMapping.accession, + TranscriptMapping.version, + TranscriptMapping.transcript) L.addMessage(__file__, -1, "INFO", "Finished processing getTranscripts(%s %s %s %s)" @@ -307,6 +318,13 @@ class MutalyzerService(ServiceBase): 'Invalid range (%d-%d) with start position greater ' 'than stop position.' % (pos1, pos2)) + if pos1 < 1 or pos2 > binning.MAX_POSITION + 1: + L.addMessage(__file__, 4, 'EARG', + 'Invalid range: %d-%d' % (pos1, pos2)) + raise Fault('EARG', + 'Invalid range (%d-%d) exceeding chromosome bounds.' + % (pos1, pos2)) + try: assembly = Assembly.by_name_or_alias(build) except NoResultFound: @@ -323,13 +341,23 @@ class MutalyzerService(ServiceBase): "chromosome name." % chrom) if method: - range_filter = (TranscriptMapping.start <= pos2, + bins = binning.overlapping_bins(pos1 - 1, pos2) + range_filter = (TranscriptMapping.bin.in_(bins), + TranscriptMapping.start <= pos2, TranscriptMapping.stop >= pos1) else: - range_filter = (TranscriptMapping.start >= pos1, + bins = binning.contained_bins(pos1 - 1, pos2) + range_filter = (TranscriptMapping.bin.in_(bins), + TranscriptMapping.start >= pos1, TranscriptMapping.stop <= pos2) - mappings = chromosome.transcript_mappings.filter(*range_filter) + mappings = chromosome.transcript_mappings.filter(*range_filter).order_by( + TranscriptMapping.start, + TranscriptMapping.stop, + TranscriptMapping.gene, + TranscriptMapping.accession, + TranscriptMapping.version, + TranscriptMapping.transcript) L.addMessage(__file__, -1, "INFO", "Finished processing getTranscriptsRange(%s %s %s %s %s)" % ( @@ -385,6 +413,13 @@ class MutalyzerService(ServiceBase): 'Invalid range (%d-%d) with start position greater ' 'than stop position.' % (pos1, pos2)) + if pos1 < 1 or pos2 > binning.MAX_POSITION + 1: + L.addMessage(__file__, 4, 'EARG', + 'Invalid range: %d-%d' % (pos1, pos2)) + raise Fault('EARG', + 'Invalid range (%d-%d) exceeding chromosome bounds.' + % (pos1, pos2)) + try: assembly = Assembly.by_name_or_alias(build) except NoResultFound: @@ -401,13 +436,23 @@ class MutalyzerService(ServiceBase): "chromosome name." % chrom) if method: - range_filter = (TranscriptMapping.start <= pos2, + bins = binning.overlapping_bins(pos1 - 1, pos2) + range_filter = (TranscriptMapping.bin.in_(bins), + TranscriptMapping.start <= pos2, TranscriptMapping.stop >= pos1) else: - range_filter = (TranscriptMapping.start >= pos1, + bins = binning.contained_bins(pos1 - 1, pos2) + range_filter = (TranscriptMapping.bin.in_(bins), + TranscriptMapping.start >= pos1, TranscriptMapping.stop <= pos2) - mappings = chromosome.transcript_mappings.filter(*range_filter) + mappings = chromosome.transcript_mappings.filter(*range_filter).order_by( + TranscriptMapping.start, + TranscriptMapping.stop, + TranscriptMapping.gene, + TranscriptMapping.accession, + TranscriptMapping.version, + TranscriptMapping.transcript) transcripts = [] diff --git a/requirements.txt b/requirements.txt index ecd4026e23e4263b2403d11c8be21f35425bce63..75bbd5820b0c54a107839a898b33882bfd71182c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,13 @@ PyYAML==3.11 SQLAlchemy==0.9.8 Sphinx==1.2.3 Werkzeug==0.9.6 -alembic==0.8.2 +alembic==0.8.3 backtranslate==0.0.5 biopython==1.64 chardet==2.3.0 cssselect==0.9.1 description-extractor==2.2.1 +interval-binning==1.0.0 lxml==3.4.0 mock==1.0.1 mockredispy==2.9.0.9 diff --git a/tests/test_services_json.py b/tests/test_services_json.py index 0e27947c745af113f4314b773d911936c0218592..e47e306e30fe8fef28ecc84f13493efa03e4536e 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -218,15 +218,7 @@ def test_get_transcripts_mapping(api): Test output of getTranscriptsMapping. """ r = api('getTranscriptsMapping', 'hg19', 'chr11', 111955524, 111966518) - assert r == [{'cds_start': 111957632, - 'cds_stop': 111965694, - 'name': 'NM_003002', - 'stop': 111966518, - 'start': 111957571, - 'version': 2, - 'gene': 'SDHD', - 'orientation': '+'}, - {'cds_start': 111957492, + assert r == [{'cds_start': 111957492, 'cds_stop': 111956019, 'name': 'NM_012459', 'stop': 111955524, @@ -241,7 +233,15 @@ def test_get_transcripts_mapping(api): 'start': 111957522, 'version': 1, 'gene': 'TIMM8B', - 'orientation': '-'}] + 'orientation': '-'}, + {'cds_start': 111957632, + 'cds_stop': 111965694, + 'name': 'NM_003002', + 'stop': 111966518, + 'start': 111957571, + 'version': 2, + 'gene': 'SDHD', + 'orientation': '+'}] def test_description_extract(api): diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index 5c100eb1e83318a7448cbfe9b75329b9f93488ee..37a5799fba0a02f0a26aeeabbe906e4936250b3c 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -734,15 +734,7 @@ def test_get_transcripts_mapping(api): assert len(r.TranscriptMappingInfo) == 3 assert all(all(t_real[k] == t_expected[k] for k in t_expected) for t_real, t_expected in - zip(r.TranscriptMappingInfo, [{'cds_start': 111957632, - 'cds_stop': 111965694, - 'name': 'NM_003002', - 'stop': 111966518, - 'start': 111957571, - 'version': 2, - 'gene': 'SDHD', - 'orientation': '+'}, - {'cds_start': 111957492, + zip(r.TranscriptMappingInfo, [{'cds_start': 111957492, 'cds_stop': 111956019, 'name': 'NM_012459', 'stop': 111955524, @@ -757,7 +749,15 @@ def test_get_transcripts_mapping(api): 'start': 111957522, 'version': 1, 'gene': 'TIMM8B', - 'orientation': '-'}])) + 'orientation': '-'}, + {'cds_start': 111957632, + 'cds_stop': 111965694, + 'name': 'NM_003002', + 'stop': 111966518, + 'start': 111957571, + 'version': 2, + 'gene': 'SDHD', + 'orientation': '+'}])) def test_description_extract(api):