Skip to content
Snippets Groups Projects
Commit 8bbbc3a8 authored by Vermaat's avatar Vermaat
Browse files

Bi-directional caching of transcript-protein links

Previously transcript-protein links were assumed to always be
indexed by transcript, and cached entries were allowed to have
a `null` protein (meaning caching the knowledge that there is
no link for this transcript).

Now we can cache links in both directions. Both transcript and
protein are allowed to be `null` (but not at the same time),
and the protein column has a new unique constraint.
parent 0b9c7565
No related branches found
No related tags found
No related merge requests found
......@@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY
`Default value:` ``hg19``
PROTEIN_LINK_EXPIRATION
Expiration time for cached transcript->protein links from the NCBI (in
Expiration time for cached transcript<->protein links from the NCBI (in
seconds).
`Default value:` `60 * 60 * 24 * 30` (30 days)
NEGATIVE_PROTEIN_LINK_EXPIRATION
Expiration time for cached negative transcript->protein links from the NCBI
Expiration time for cached negative transcript<->protein links from the NCBI
(in seconds).
`Default value:` `60 * 60 * 24 * 5` (5 days)
......
"""Transcript protein links have nullable transcript and unique protein
Revision ID: 3492d2ee8884
Revises: 4bafcc5086dd
Create Date: 2015-09-25 15:11:45.562392
"""
from __future__ import unicode_literals
# revision identifiers, used by Alembic.
revision = '3492d2ee8884'
down_revision = u'4bafcc5086dd'
from alembic import op
import sqlalchemy as sa
# We are adding a unique constraint here, so actually we should first make
# sure there are no duplicate entries. However, this is highly unlikely, so we
# don't bother prepping the migration for that.
#
# http://skien.cc/blog/2014/01/31/adding-unique-contraints-after-the-fact-in-sqlalchemy/
# We want to be compatible with at least SQLite and PostgreSQL. This means
# using `batch_alter_table` for operations yielding an ALTER TABLE statement.
# However, we also have indices on the table and this causes problems with
# `batch_alter_table` (it seems indices are copied too, under the same name,
# which is invalid). To work around this, we wrap the batch operations in drop
# and create statements for the indices.
#
# http://alembic.readthedocs.org/en/latest/batch.html
def upgrade():
    """
    Make `transcript_accession` nullable and give `protein_accession` a
    unique index.

    Both indices on the table are dropped before the batch operation and
    recreated afterwards, because `batch_alter_table` has problems with
    existing indices on the table.
    """
    table = 'transcript_protein_links'
    transcript_index = 'ix_transcript_protein_links_transcript_accession'
    protein_index = 'ix_transcript_protein_links_protein_accession'

    # Get the indices out of the way of the batch operation.
    for index in (transcript_index, protein_index):
        op.drop_index(index, table_name=table)

    with op.batch_alter_table(table) as batch_op:
        batch_op.alter_column('transcript_accession',
                              existing_type=sa.VARCHAR(length=20),
                              nullable=True)

    # Recreate the indices; the one on the protein column is now unique too.
    op.create_index(op.f(transcript_index), table, ['transcript_accession'],
                    unique=True)
    op.create_index(op.f(protein_index), table, ['protein_accession'],
                    unique=True)
def downgrade():
    """
    Make `transcript_accession` non-nullable again and drop the uniqueness
    of the `protein_accession` index.

    Both indices on the table are dropped before the batch operation and
    recreated afterwards, because `batch_alter_table` has problems with
    existing indices on the table.
    """
    # NOTE(review): rows with a NULL transcript accession presumably have to
    # be removed before this can succeed -- confirm.
    table = 'transcript_protein_links'
    transcript_index = 'ix_transcript_protein_links_transcript_accession'
    protein_index = 'ix_transcript_protein_links_protein_accession'

    # Get the indices out of the way of the batch operation.
    for index in (transcript_index, protein_index):
        op.drop_index(op.f(index), table_name=table)

    with op.batch_alter_table(table) as batch_op:
        batch_op.alter_column('transcript_accession',
                              existing_type=sa.VARCHAR(length=20),
                              nullable=False)

    # Recreate the indices; only the one on the transcript column is unique.
    op.create_index(transcript_index, table, ['transcript_accession'],
                    unique=True)
    op.create_index(protein_index, table, ['protein_accession'],
                    unique=False)
......@@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/'
# Allow for this fraction of errors in batch jobs.
BATCH_JOBS_ERROR_THRESHOLD = 0.05
# Expiration time for transcript->protein links from the NCBI (in seconds).
# Expiration time for transcript<->protein links from the NCBI (in seconds).
PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
# Expiration time for negative transcript->protein links from the NCBI (in
# Expiration time for negative transcript<->protein links from the NCBI (in
# seconds).
NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5
......
......@@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base):
id = Column(Integer, primary_key=True)
#: Accession number for the transcript, not including the version number
#: (e.g., ``NM_018195``, ``XM_005270562``, ``NR_015380``).
transcript_accession = Column(String(20), nullable=False, index=True,
#: (e.g., ``NM_018195``, ``XM_005270562``, ``NR_015380``). If `NULL`, the
#: record states that no transcript is linked to the protein.
transcript_accession = Column(String(20), nullable=True, index=True,
unique=True)
#: Accession number for the protein, not including the version number
#: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states
#: that no protein is linked to the transcript by the NCBI.
protein_accession = Column(String(20), index=True)
#: that no protein is linked to the transcript.
protein_accession = Column(String(20), nullable=True, index=True,
unique=True)
#: Date and time of creation.
added = Column(DateTime)
def __init__(self, transcript_accession, protein_accession=None):
def __init__(self, transcript_accession=None, protein_accession=None):
if transcript_accession is None and protein_accession is None:
raise ValueError('Link must have a transcript or protein')
self.transcript_accession = transcript_accession
self.protein_accession = protein_accession
self.added = datetime.now()
......
......@@ -12,6 +12,7 @@ from __future__ import unicode_literals
from datetime import datetime, timedelta
from sqlalchemy import and_, or_
import sqlalchemy.exc
from mutalyzer.config import settings
from mutalyzer.db import session
......@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job):
return item, flags
def get_transcript_protein_link(transcript_accession):
def get_transcript_protein_link(accession, reverse=False):
"""
Get a cached link between a transcript and a protein that is not expired
according to the configuration settings `PROTEIN_LINK_EXPIRATION` and
`NEGATIVE_PROTEIN_LINK_EXPIRATION`.
:arg str accession: Accession number to lookup link for.
:arg bool reverse: If `True`, `accession` is assumed to be a protein
accession number, otherwise `accession` is assumed to be a transcript
accession number.
Note that the link may be negative, i.e., the knowledge that no link
exists can also be cached. In that case, the `protein_accession` field of
the resulting `TranscriptProteinLink` object is `None`.
......@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession):
negative_link_datetime = datetime.now() - \
timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION)
# Query column must have `accession`, other column has the value we're
# probably interested in.
query_column = TranscriptProteinLink.transcript_accession
other_column = TranscriptProteinLink.protein_accession
if reverse:
# Lookup by protein accession instead of transcript accession.
query_column, other_column = other_column, query_column
return TranscriptProteinLink.query \
.filter_by(transcript_accession=transcript_accession) \
.filter(or_(
and_(TranscriptProteinLink.protein_accession != None,
TranscriptProteinLink.added >= link_datetime),
and_(TranscriptProteinLink.protein_accession == None,
TranscriptProteinLink.added >= negative_link_datetime))) \
.filter_by(transcript_accession=accession) \
.filter(
query_column == accession,
or_(
and_(other_column.isnot(None),
TranscriptProteinLink.added >= link_datetime),
and_(other_column.is_(None),
TranscriptProteinLink.added >= negative_link_datetime))
) \
.first()
def update_transcript_protein_link(transcript_accession=None,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if it
    doesn't exist yet.

    Any existing links for either of the given accession numbers are removed
    before the new link is stored.

    :arg str transcript_accession: Transcript accession number, or `None` to
      cache that there is no transcript for `protein_accession`.
    :arg str protein_accession: Protein accession number, or `None` to cache
      that there is no protein for `transcript_accession`.

    :raises ValueError: If both `transcript_accession` and
      `protein_accession` are `None`.
    """
    if transcript_accession is None and protein_accession is None:
        raise ValueError('Link must have a transcript or protein')

    # Filter clauses to find links for either of the given accession numbers.
    clauses = []
    if transcript_accession is not None:
        clauses.append(TranscriptProteinLink.transcript_accession ==
                       transcript_accession)
    if protein_accession is not None:
        clauses.append(TranscriptProteinLink.protein_accession ==
                       protein_accession)

    # Delete any related existing links. Note that `session.delete` only
    # accepts a single mapped instance and not a query, so we issue a bulk
    # delete on the query itself.
    TranscriptProteinLink.query.filter(or_(*clauses)).delete()
    session.commit()

    # There is a race condition here between deleting old links and adding the
    # new one. It's extremely unlikely to go wrong, and we can safely ignore
    # it anyway.
    link = TranscriptProteinLink(transcript_accession, protein_accession)
    try:
        session.add(link)
        session.commit()
    except sqlalchemy.exc.IntegrityError:
        # Another process stored a conflicting link in the meantime; theirs
        # is kept.
        session.rollback()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment