Skip to content
Snippets Groups Projects
Commit 201d7deb authored by Vermaat's avatar Vermaat
Browse files

Merge pull request #100 from mutalyzer/interval-binning

Use interval binning scheme on transcript mappings
parents e661b72a e0a127cf
No related branches found
No related tags found
No related merge requests found
"""Transcript mappings binning
Revision ID: 1bf1b52f057
Revises: 225a8b4c3902
Create Date: 2015-10-29 10:02:15.286139
"""
from __future__ import unicode_literals
# revision identifiers, used by Alembic.
revision = '1bf1b52f057'
down_revision = u'225a8b4c3902'
from alembic import op
import sqlalchemy as sa
from sqlalchemy import sql
import binning
def upgrade():
# We want to add a NOT NULL column without default value. So we first add
# the column without the constraint, then populate it, then add the
# constraint.
# Unfortunately, SQLite doesn't support adding the constraint on an
# existing column. We use batch_alter_table to workaround this. Of course
# this makes the entire migration horribly awkward on SQLite, but I can't
# really be bothered to improve it. This works.
# Also, the downgrade will fail on SQLite, but we don't support downgrades
# anyway, so I'm not fixing it.
connection = op.get_bind()
op.add_column('transcript_mappings', sa.Column('bin', sa.Integer(), nullable=True))
transcript_mappings = sql.table('transcript_mappings',
sql.column('id', sa.Integer()),
sql.column('start', sa.Integer()),
sql.column('stop', sa.Integer()),
sql.column('bin', sa.Integer()))
result = connection.execute(
transcript_mappings.select().with_only_columns([
transcript_mappings.c.id,
transcript_mappings.c.start,
transcript_mappings.c.stop]))
while True:
chunk = result.fetchmany(1000)
if not chunk:
break
statement = transcript_mappings.update().where(
transcript_mappings.c.id == sql.bindparam('m_id')
).values({'bin': sql.bindparam('m_bin')})
connection.execute(statement, [
{'m_id': m.id, 'm_bin': binning.assign_bin(m.start - 1, m.stop)}
for m in chunk])
# See note above.
with op.batch_alter_table('transcript_mappings') as batch_op:
batch_op.alter_column('bin', nullable=False, existing_type=sa.Integer())
op.create_index(op.f('ix_transcript_mappings_bin'), 'transcript_mappings', ['bin'], unique=False)
def downgrade():
### commands auto generated by Alembic - please adjust! ###
op.drop_index(op.f('ix_transcript_mappings_bin'), table_name='transcript_mappings')
op.drop_column('transcript_mappings', 'bin')
### end Alembic commands ###
......@@ -31,6 +31,9 @@ import sqlalchemy as sa
# and create statements for the indices.
#
# http://alembic.readthedocs.org/en/latest/batch.html
#
# Update: This has since been fixed with Alembic release 0.8.3
# http://alembic.readthedocs.org/en/latest/changelog.html#change-9ef6be6709e71f3800b6e451ae75c5f8
def upgrade():
......
......@@ -9,6 +9,7 @@ from datetime import datetime
import sqlite3
import uuid
import binning
from sqlalchemy import event, or_
from sqlalchemy import (Boolean, Column, DateTime, Enum, ForeignKey, Index,
Integer, String, Text, TypeDecorator)
......@@ -393,6 +394,11 @@ class TranscriptMapping(db.Base):
#: inclusive, in chromosomal orientation).
stop = Column(Integer, nullable=False)
#: Bin index that can be used for faster range-based queries. See the
#: `interval binning documentation <http://interval-binning.readthedocs.org/>`_
#: for more information.
bin = Column(Integer, nullable=False, index=True)
#: The CDS start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
cds_start = Column(Integer)
......@@ -446,6 +452,7 @@ class TranscriptMapping(db.Base):
self.cds = cds
self.select_transcript = select_transcript
self.version = version
self.bin = binning.assign_bin(self.start - 1, self.stop)
def __repr__(self):
return ('<TranscriptMapping accession=%r version=%r gene=%r '
......@@ -482,6 +489,7 @@ class TranscriptMapping(db.Base):
instance.orientation = orientation
instance.start = start
instance.stop = stop
instance.bin = binning.assign_bin(start - 1, stop)
instance.exon_starts = exon_starts
instance.exon_stops = exon_stops
instance.source = source
......
......@@ -16,6 +16,7 @@ from collections import defaultdict
from itertools import groupby
from operator import attrgetter, itemgetter
import binning
import MySQLdb
from mutalyzer.db import session
......@@ -707,9 +708,20 @@ class Converter(object) :
if gene:
mappings = chromosome.transcript_mappings.filter_by(gene=gene)
else:
start = max(min_loc - 5000, 1)
stop = min(max_loc + 5000, binning.MAX_POSITION + 1)
bins = binning.overlapping_bins(start - 1, stop)
mappings = chromosome.transcript_mappings.filter(
TranscriptMapping.start <= max_loc + 5000,
TranscriptMapping.stop >= min_loc - 5000)
TranscriptMapping.bin.in_(bins),
TranscriptMapping.start <= stop,
TranscriptMapping.stop >= start
).order_by(
TranscriptMapping.start,
TranscriptMapping.stop,
TranscriptMapping.gene,
TranscriptMapping.accession,
TranscriptMapping.version,
TranscriptMapping.transcript)
HGVS_notatations = defaultdict(list)
NM_list = []
......
......@@ -11,6 +11,7 @@ Mutalyzer RPC services.
from __future__ import unicode_literals
import binning
from spyne.decorator import srpc
from spyne.service import ServiceBase
from spyne.model.primitive import Integer, Boolean, DateTime, Unicode
......@@ -227,9 +228,19 @@ class MutalyzerService(ServiceBase):
raise Fault("EARG", "The chrom argument (%s) was not a valid " \
"chromosome name." % chrom)
pos = max(min(pos, binning.MAX_POSITION + 1), 1)
bins = binning.overlapping_bins(pos - 1, pos)
mappings = chromosome.transcript_mappings.filter(
TranscriptMapping.bin.in_(bins),
TranscriptMapping.start <= pos,
TranscriptMapping.stop >= pos)
TranscriptMapping.stop >= pos
).order_by(
TranscriptMapping.start,
TranscriptMapping.stop,
TranscriptMapping.gene,
TranscriptMapping.accession,
TranscriptMapping.version,
TranscriptMapping.transcript)
L.addMessage(__file__, -1, "INFO",
"Finished processing getTranscripts(%s %s %s %s)"
......@@ -307,6 +318,13 @@ class MutalyzerService(ServiceBase):
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
if pos1 < 1 or pos2 > binning.MAX_POSITION + 1:
L.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) exceeding chromosome bounds.'
% (pos1, pos2))
try:
assembly = Assembly.by_name_or_alias(build)
except NoResultFound:
......@@ -323,13 +341,23 @@ class MutalyzerService(ServiceBase):
"chromosome name." % chrom)
if method:
range_filter = (TranscriptMapping.start <= pos2,
bins = binning.overlapping_bins(pos1 - 1, pos2)
range_filter = (TranscriptMapping.bin.in_(bins),
TranscriptMapping.start <= pos2,
TranscriptMapping.stop >= pos1)
else:
range_filter = (TranscriptMapping.start >= pos1,
bins = binning.contained_bins(pos1 - 1, pos2)
range_filter = (TranscriptMapping.bin.in_(bins),
TranscriptMapping.start >= pos1,
TranscriptMapping.stop <= pos2)
mappings = chromosome.transcript_mappings.filter(*range_filter)
mappings = chromosome.transcript_mappings.filter(*range_filter).order_by(
TranscriptMapping.start,
TranscriptMapping.stop,
TranscriptMapping.gene,
TranscriptMapping.accession,
TranscriptMapping.version,
TranscriptMapping.transcript)
L.addMessage(__file__, -1, "INFO",
"Finished processing getTranscriptsRange(%s %s %s %s %s)" % (
......@@ -385,6 +413,13 @@ class MutalyzerService(ServiceBase):
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
if pos1 < 1 or pos2 > binning.MAX_POSITION + 1:
L.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) exceeding chromosome bounds.'
% (pos1, pos2))
try:
assembly = Assembly.by_name_or_alias(build)
except NoResultFound:
......@@ -401,13 +436,23 @@ class MutalyzerService(ServiceBase):
"chromosome name." % chrom)
if method:
range_filter = (TranscriptMapping.start <= pos2,
bins = binning.overlapping_bins(pos1 - 1, pos2)
range_filter = (TranscriptMapping.bin.in_(bins),
TranscriptMapping.start <= pos2,
TranscriptMapping.stop >= pos1)
else:
range_filter = (TranscriptMapping.start >= pos1,
bins = binning.contained_bins(pos1 - 1, pos2)
range_filter = (TranscriptMapping.bin.in_(bins),
TranscriptMapping.start >= pos1,
TranscriptMapping.stop <= pos2)
mappings = chromosome.transcript_mappings.filter(*range_filter)
mappings = chromosome.transcript_mappings.filter(*range_filter).order_by(
TranscriptMapping.start,
TranscriptMapping.stop,
TranscriptMapping.gene,
TranscriptMapping.accession,
TranscriptMapping.version,
TranscriptMapping.transcript)
transcripts = []
......
......@@ -6,12 +6,13 @@ PyYAML==3.11
SQLAlchemy==0.9.8
Sphinx==1.2.3
Werkzeug==0.9.6
alembic==0.8.2
alembic==0.8.3
backtranslate==0.0.5
biopython==1.64
chardet==2.3.0
cssselect==0.9.1
description-extractor==2.2.1
interval-binning==1.0.0
lxml==3.4.0
mock==1.0.1
mockredispy==2.9.0.9
......
......@@ -218,15 +218,7 @@ def test_get_transcripts_mapping(api):
Test output of getTranscriptsMapping.
"""
r = api('getTranscriptsMapping', 'hg19', 'chr11', 111955524, 111966518)
assert r == [{'cds_start': 111957632,
'cds_stop': 111965694,
'name': 'NM_003002',
'stop': 111966518,
'start': 111957571,
'version': 2,
'gene': 'SDHD',
'orientation': '+'},
{'cds_start': 111957492,
assert r == [{'cds_start': 111957492,
'cds_stop': 111956019,
'name': 'NM_012459',
'stop': 111955524,
......@@ -241,7 +233,15 @@ def test_get_transcripts_mapping(api):
'start': 111957522,
'version': 1,
'gene': 'TIMM8B',
'orientation': '-'}]
'orientation': '-'},
{'cds_start': 111957632,
'cds_stop': 111965694,
'name': 'NM_003002',
'stop': 111966518,
'start': 111957571,
'version': 2,
'gene': 'SDHD',
'orientation': '+'}]
def test_description_extract(api):
......
......@@ -734,15 +734,7 @@ def test_get_transcripts_mapping(api):
assert len(r.TranscriptMappingInfo) == 3
assert all(all(t_real[k] == t_expected[k] for k in t_expected)
for t_real, t_expected in
zip(r.TranscriptMappingInfo, [{'cds_start': 111957632,
'cds_stop': 111965694,
'name': 'NM_003002',
'stop': 111966518,
'start': 111957571,
'version': 2,
'gene': 'SDHD',
'orientation': '+'},
{'cds_start': 111957492,
zip(r.TranscriptMappingInfo, [{'cds_start': 111957492,
'cds_stop': 111956019,
'name': 'NM_012459',
'stop': 111955524,
......@@ -757,7 +749,15 @@ def test_get_transcripts_mapping(api):
'start': 111957522,
'version': 1,
'gene': 'TIMM8B',
'orientation': '-'}]))
'orientation': '-'},
{'cds_start': 111957632,
'cds_stop': 111965694,
'name': 'NM_003002',
'stop': 111966518,
'start': 111957571,
'version': 2,
'gene': 'SDHD',
'orientation': '+'}]))
def test_description_extract(api):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment