Skip to content
Snippets Groups Projects
Commit ba859618 authored by Vermaat's avatar Vermaat
Browse files

Merge pull request #80 from mutalyzer/transcript-protein-link

Bi-directional caching of transcript-protein links
parents e003d85d 8bbbc3a8
No related branches found
No related tags found
No related merge requests found
...@@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY ...@@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY
`Default value:` ``hg19`` `Default value:` ``hg19``
PROTEIN_LINK_EXPIRATION PROTEIN_LINK_EXPIRATION
Expiration time for cached transcript->protein links from the NCBI (in Expiration time for cached transcript<->protein links from the NCBI (in
seconds). seconds).
`Default value:` `60 * 60 * 24 * 30` (30 days) `Default value:` `60 * 60 * 24 * 30` (30 days)
NEGATIVE_PROTEIN_LINK_EXPIRATION NEGATIVE_PROTEIN_LINK_EXPIRATION
Expiration time for cached negative transcript->protein links from the NCBI Expiration time for cached negative transcript<->protein links from the NCBI
(in seconds). (in seconds).
`Default value:` `60 * 60 * 24 * 5` (5 days) `Default value:` `60 * 60 * 24 * 5` (5 days)
......
"""Transcript protein links have nullable transcript and unique protein
Revision ID: 3492d2ee8884
Revises: 4bafcc5086dd
Create Date: 2015-09-25 15:11:45.562392
"""
from __future__ import unicode_literals
# revision identifiers, used by Alembic.
revision = '3492d2ee8884'
down_revision = u'4bafcc5086dd'
from alembic import op
import sqlalchemy as sa
# We are adding a unique constraint here, so actually we should first make
# sure there are no duplicate entries. However, this is highly unlikely, so we
# don't bother prepping the migration for that.
#
# http://skien.cc/blog/2014/01/31/adding-unique-contraints-after-the-fact-in-sqlalchemy/
# We want to be compatible with at least SQLite and PostgreSQL. This means
# using `batch_alter_table` for operations yielding an ALTER TABLE statement.
# However, we also have indices on the table and this causes problems with
# `batch_alter_table` (it seems indices are copied too, under the same name,
# which is invalid). To work around this, we wrap the batch operations in drop
# and create statements for the indices.
#
# http://alembic.readthedocs.org/en/latest/batch.html
def upgrade():
    """
    Make `transcript_accession` nullable and the `protein_accession` index
    unique.

    Both indices on the table are dropped before the batch operation and
    recreated afterwards; the module comments explain that
    `batch_alter_table` otherwise has trouble with the existing indices.
    """
    table = 'transcript_protein_links'

    # (index name, indexed column), in the order they are dropped and created.
    indices = [
        ('ix_transcript_protein_links_transcript_accession',
         'transcript_accession'),
        ('ix_transcript_protein_links_protein_accession',
         'protein_accession'),
    ]

    # Get the indices out of the way of the batch operation.
    for index_name, _ in indices:
        op.drop_index(index_name, table_name=table)

    # Allow records that have no transcript.
    with op.batch_alter_table(table) as batch_op:
        batch_op.alter_column(
            'transcript_accession',
            existing_type=sa.VARCHAR(length=20),
            nullable=True)

    # Recreate the indices; both are unique from this revision on.
    for index_name, column in indices:
        op.create_index(op.f(index_name), table, [column], unique=True)
def downgrade():
    """
    Make `transcript_accession` non-nullable and the `protein_accession`
    index non-unique again.

    Records without a transcript can only exist after the upgrade and cannot
    be represented in the downgraded schema, so they are removed before the
    column is made non-nullable (the ALTER would fail otherwise). The table
    holds cached links according to the settings documentation, so removing
    these records should only cost a fresh lookup later on.

    As in `upgrade`, the indices are dropped before the batch operation and
    recreated afterwards.
    """
    op.drop_index(op.f('ix_transcript_protein_links_transcript_accession'),
                  table_name='transcript_protein_links')
    op.drop_index(op.f('ix_transcript_protein_links_protein_accession'),
                  table_name='transcript_protein_links')

    # Rows with a NULL transcript would violate the NOT NULL constraint we
    # are about to restore.
    op.execute('DELETE FROM transcript_protein_links '
               'WHERE transcript_accession IS NULL')

    with op.batch_alter_table('transcript_protein_links') as batch_op:
        batch_op.alter_column('transcript_accession',
                              existing_type=sa.VARCHAR(length=20),
                              nullable=False)

    # Restore the indices as they were before this revision: unique on the
    # transcript, non-unique on the protein.
    op.create_index('ix_transcript_protein_links_transcript_accession',
                    'transcript_protein_links', ['transcript_accession'],
                    unique=True)
    op.create_index('ix_transcript_protein_links_protein_accession',
                    'transcript_protein_links', ['protein_accession'],
                    unique=False)
...@@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/' ...@@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/'
# Allow for this fraction of errors in batch jobs. # Allow for this fraction of errors in batch jobs.
BATCH_JOBS_ERROR_THRESHOLD = 0.05 BATCH_JOBS_ERROR_THRESHOLD = 0.05
# Expiration time for transcript->protein links from the NCBI (in seconds). # Expiration time for transcript<->protein links from the NCBI (in seconds).
PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30 PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
# Expiration time for negative transcript->protein links from the NCBI (in # Expiration time for negative transcript<->protein links from the NCBI (in
# seconds). # seconds).
NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5 NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5
......
...@@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base): ...@@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base):
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
#: Accession number for the transcript, not including the version number #: Accession number for the transcript, not including the version number
#: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``). #: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``). If `NULL`, the
transcript_accession = Column(String(20), nullable=False, index=True, #: record states that no transcript is linked to the protein.
transcript_accession = Column(String(20), nullable=True, index=True,
unique=True) unique=True)
#: Accession number for the protein, not including the version number #: Accession number for the protein, not including the version number
#: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states #: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states
#: that no protein is linked to the transcript by the NCBI. #: that no protein is linked to the transcript.
protein_accession = Column(String(20), index=True) protein_accession = Column(String(20), nullable=True, index=True,
unique=True)
#: Date and time of creation. #: Date and time of creation.
added = Column(DateTime) added = Column(DateTime)
def __init__(self, transcript_accession, protein_accession=None): def __init__(self, transcript_accession=None, protein_accession=None):
if transcript_accession is None and protein_accession is None:
raise ValueError('Link must have a transcript or protein')
self.transcript_accession = transcript_accession self.transcript_accession = transcript_accession
self.protein_accession = protein_accession self.protein_accession = protein_accession
self.added = datetime.now() self.added = datetime.now()
......
...@@ -12,6 +12,7 @@ from __future__ import unicode_literals ...@@ -12,6 +12,7 @@ from __future__ import unicode_literals
from datetime import datetime, timedelta from datetime import datetime, timedelta
from sqlalchemy import and_, or_ from sqlalchemy import and_, or_
import sqlalchemy.exc
from mutalyzer.config import settings from mutalyzer.config import settings
from mutalyzer.db import session from mutalyzer.db import session
...@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job): ...@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job):
return item, flags return item, flags
def get_transcript_protein_link(transcript_accession): def get_transcript_protein_link(accession, reverse=False):
""" """
Get a cached link between a transcript and a protein that is not expired Get a cached link between a transcript and a protein that is not expired
according to the configuration settings `PROTEIN_LINK_EXPIRATION` and according to the configuration settings `PROTEIN_LINK_EXPIRATION` and
`NEGATIVE_PROTEIN_LINK_EXPIRATION`. `NEGATIVE_PROTEIN_LINK_EXPIRATION`.
:arg str accession: Accession number to lookup link for.
:arg bool reverse: If `True`, `accession` is assumed to be a protein
accession number, otherwise `accession` is assumed to be a transcript
accession number.
Note that the link may be negative, i.e., the knowledge that no link Note that the link may be negative, i.e., the knowledge that no link
exists can also be cached. In that case, the `protein_accession` field of exists can also be cached. In that case, the `protein_accession` field of
the resulting `TranscriptProteinLink` object is `None`. the resulting `TranscriptProteinLink` object is `None`.
...@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession): ...@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession):
negative_link_datetime = datetime.now() - \ negative_link_datetime = datetime.now() - \
timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION) timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION)
# Query column must have `accession`, other column has the value we're
# probably interested in.
query_column = TranscriptProteinLink.transcript_accession
other_column = TranscriptProteinLink.protein_accession
if reverse:
# Lookup by protein accession instead of transcript accession.
query_column, other_column = other_column, query_column
return TranscriptProteinLink.query \ return TranscriptProteinLink.query \
.filter_by(transcript_accession=transcript_accession) \ .filter_by(transcript_accession=accession) \
.filter(or_( .filter(
and_(TranscriptProteinLink.protein_accession != None, query_column == accession,
TranscriptProteinLink.added >= link_datetime), or_(
and_(TranscriptProteinLink.protein_accession == None, and_(other_column.isnot(None),
TranscriptProteinLink.added >= negative_link_datetime))) \ TranscriptProteinLink.added >= link_datetime),
and_(other_column.is_(None),
TranscriptProteinLink.added >= negative_link_datetime))
) \
.first() .first()
def update_transcript_protein_link(transcript_accession=None,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if it
    doesn't exist yet.

    Any existing link involving either of the given accession numbers is
    removed and replaced by one new link.

    :arg str transcript_accession: Transcript accession number, or `None` to
      record that no transcript is linked to the protein.
    :arg str protein_accession: Protein accession number, or `None` to record
      that no protein is linked to the transcript.

    :raises ValueError: If both accession numbers are `None`.
    """
    if transcript_accession is None and protein_accession is None:
        raise ValueError('Link must have a transcript or protein')

    # Filter clauses to find links for either of the given accession numbers.
    clauses = []
    if transcript_accession is not None:
        clauses.append(TranscriptProteinLink.transcript_accession ==
                       transcript_accession)
    if protein_accession is not None:
        clauses.append(TranscriptProteinLink.protein_accession ==
                       protein_accession)

    # Delete any related existing links. This must be a bulk delete on the
    # query: `session.delete` only accepts a single mapped instance, not a
    # query. The commit directly below expires the session state, so there is
    # no need to synchronize it here.
    TranscriptProteinLink.query \
        .filter(or_(*clauses)) \
        .delete(synchronize_session=False)
    session.commit()

    # There is a race condition here between deleting old links and adding the
    # new one. It's extremely unlikely to go wrong, and we can safely ignore
    # it anyway.
    link = TranscriptProteinLink(transcript_accession, protein_accession)
    try:
        session.add(link)
        session.commit()
    except sqlalchemy.exc.IntegrityError:
        # Someone else added a conflicting link in the meantime; theirs is
        # just as good as ours.
        session.rollback()
...@@ -6,7 +6,7 @@ MySQL-python==1.2.5 ...@@ -6,7 +6,7 @@ MySQL-python==1.2.5
SQLAlchemy==0.9.8 SQLAlchemy==0.9.8
Sphinx==1.2.3 Sphinx==1.2.3
Werkzeug==0.9.6 Werkzeug==0.9.6
alembic==0.6.7 alembic==0.8.2
biopython==1.64 biopython==1.64
chardet==2.3.0 chardet==2.3.0
cssselect==0.9.1 cssselect==0.9.1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment