Merge pull request #80 from mutalyzer/transcript-protein-link

Bi-directional caching of transcript-protein links

Merge pull request #80 from mutalyzer/transcript-protein-link
ba859618 · Vermaat · e003d85d · 8bbbc3a8 · ba859618 · ba859618
Commit ba859618 authored 9 years ago by Vermaat
--- a/doc/config.rst
+++ b/doc/config.rst
@@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY
  `Default value:` ``hg19``
 PROTEIN_LINK_EXPIRATION
-  Expiration time for cached transcript->protein links from the NCBI (in
+  Expiration time for cached transcript<->protein links from the NCBI (in
  seconds).
  `Default value:` `60 * 60 * 24 * 30` (30 days)
 NEGATIVE_PROTEIN_LINK_EXPIRATION
-  Expiration time for cached negative transcript->protein links from the NCBI
+  Expiration time for cached negative transcript<->protein links from the NCBI
  (in seconds).
  `Default value:` `60 * 60 * 24 * 5` (5 days)

--- a/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py
+++ b/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py
+"""Transcript protein links have nullable transcript and unique protein
+Revision ID: 3492d2ee8884
+Revises: 4bafcc5086dd
+Create Date: 2015-09-25 15:11:45.562392
+"""
+from __future__ import unicode_literals
+# revision identifiers, used by Alembic.
+revision = '3492d2ee8884'
+down_revision = u'4bafcc5086dd'
+from alembic import op
+import sqlalchemy as sa
+# We are adding a unique constraint here, so actually we should first make
+# sure there are no duplicate entries. However, this is highly unlikely, so we
+# don't bother prepping the migration for that.
+#
+# http://skien.cc/blog/2014/01/31/adding-unique-contraints-after-the-fact-in-sqlalchemy/
+# We want to be compatible with at least SQLite and PostgreSQL. This means
+# using `batch_alter_table` for operations yielding an ALTER TABLE statement.
+# However, we also have indices on the table and this causes problems with
+# `batch_alter_table` (it seems indices are copied too, under the same name,
+# which is invalid). To work around this, we wrap the batch operations in drop
+# and create statements for the indices.
+#
+# http://alembic.readthedocs.org/en/latest/batch.html
+def upgrade():
+    """
+    Make `transcript_accession` nullable and make the index on
+    `protein_accession` unique.
+    """
+    ### commands auto generated by Alembic - please adjust! ###
+    # Drop both indices before the batch operation and recreate them
+    # afterwards (see the note on `batch_alter_table` and indices above).
+    op.drop_index('ix_transcript_protein_links_transcript_accession', table_name='transcript_protein_links')
+    op.drop_index('ix_transcript_protein_links_protein_accession', table_name='transcript_protein_links')
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=True)
+    # Both indices are created as unique here; `downgrade` recreates the
+    # protein index as non-unique.
+    op.create_index(op.f('ix_transcript_protein_links_transcript_accession'), 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index(op.f('ix_transcript_protein_links_protein_accession'), 'transcript_protein_links', ['protein_accession'], unique=True)
+    ### end Alembic commands ###
+def downgrade():
+    """
+    Make `transcript_accession` non-nullable again and make the index on
+    `protein_accession` non-unique again.
+    """
+    # Rows without a transcript accession are allowed by the upgraded schema
+    # but violate the NOT NULL constraint restored below, which would make
+    # the downgrade fail. The table only holds cached links, so these rows
+    # are removed first.
+    op.execute('DELETE FROM transcript_protein_links '
+               'WHERE transcript_accession IS NULL')
+    ### commands auto generated by Alembic - please adjust! ###
+    # Drop both indices before the batch operation and recreate them
+    # afterwards (see the note on `batch_alter_table` and indices above).
+    op.drop_index(op.f('ix_transcript_protein_links_transcript_accession'), table_name='transcript_protein_links')
+    op.drop_index(op.f('ix_transcript_protein_links_protein_accession'), table_name='transcript_protein_links')
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=False)
+    op.create_index('ix_transcript_protein_links_transcript_accession', 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index('ix_transcript_protein_links_protein_accession', 'transcript_protein_links', ['protein_accession'], unique=False)
+    ### end Alembic commands ###
--- a/mutalyzer/config/default_settings.py
+++ b/mutalyzer/config/default_settings.py
@@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/'
 # Allow for this fraction of errors in batch jobs.
 BATCH_JOBS_ERROR_THRESHOLD = 0.05
-# Expiration time for transcript->protein links from the NCBI (in seconds).
+# Expiration time for transcript<->protein links from the NCBI (in seconds).
 PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
-# Expiration time for negative transcript->protein links from the NCBI (in
+# Expiration time for negative transcript<->protein links from the NCBI (in
 # seconds).
 NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5

--- a/mutalyzer/db/models.py
+++ b/mutalyzer/db/models.py
@@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base):
    id = Column(Integer, primary_key=True)
    #: Accession number for the transcript, not including the version number
-    #: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``).
+    #: (e.g., ``NM_018195``, ``XM_005270562``, ``NR_015380``). If `NULL`, the
-    transcript_accession = Column(String(20), nullable=False, index=True,
+    #: record states that no transcript is linked to the protein.
+    transcript_accession = Column(String(20), nullable=True, index=True,
                                  unique=True)
    #: Accession number for the protein, not including the version number
    #: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states
-    #: that no protein is linked to the transcript by the NCBI.
+    #: that no protein is linked to the transcript.
-    protein_accession = Column(String(20), index=True)
+    protein_accession = Column(String(20), nullable=True, index=True,
+                               unique=True)
    #: Date and time of creation.
    added = Column(DateTime)
-    def __init__(self, transcript_accession, protein_accession=None):
+    def __init__(self, transcript_accession=None, protein_accession=None):
+        """
+        Create a link and set its `added` field to the current date and time.
+        :arg str transcript_accession: Transcript accession number, or `None`.
+        :arg str protein_accession: Protein accession number, or `None`.
+        :raises ValueError: If both accession numbers are `None`.
+        """
+        if transcript_accession is None and protein_accession is None:
+            raise ValueError('Link must have a transcript or protein')
        self.transcript_accession = transcript_accession
        self.protein_accession = protein_accession
        self.added = datetime.now()

--- a/mutalyzer/db/queries.py
+++ b/mutalyzer/db/queries.py
@@ -12,6 +12,7 @@ from __future__ import unicode_literals
 from datetime import datetime, timedelta
 from sqlalchemy import and_, or_
+import sqlalchemy.exc
 from mutalyzer.config import settings
 from mutalyzer.db import session
@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job):
    return item, flags
-def get_transcript_protein_link(transcript_accession):
+def get_transcript_protein_link(accession, reverse=False):
    """
    Get a cached link between a transcript and a protein that is not expired
    according to the configuration settings `PROTEIN_LINK_EXPIRATION` and
    `NEGATIVE_PROTEIN_LINK_EXPIRATION`.
+    :arg str accession: Accession number to look up the link for.
+    :arg bool reverse: If `True`, `accession` is assumed to be a protein
+      accession number, otherwise `accession` is assumed to be a transcript
+      accession number.
    Note that the link may be negative, i.e., the knowledge that no link
    exists can also be cached. In that case, the `protein_accession` field of
    the resulting `TranscriptProteinLink` object is `None`.
+    If `reverse` is `True`, it is the `transcript_accession` field that is
+    `None` for a negative link.
@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession):
    negative_link_datetime = datetime.now() - \
        timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION)
+    # Query column must have `accession`, other column has the value we're
+    # probably interested in.
+    query_column = TranscriptProteinLink.transcript_accession
+    other_column = TranscriptProteinLink.protein_accession
+    if reverse:
+        # Look up by protein accession instead of transcript accession.
+        query_column, other_column = other_column, query_column
+    # Filter on `query_column` only. An additional fixed filter on
+    # `transcript_accession` would make reverse lookups never match, since
+    # `accession` is a protein accession number in that case.
    return TranscriptProteinLink.query \
-        .filter_by(transcript_accession=transcript_accession) \
-        .filter(or_(
+        .filter(
-          and_(TranscriptProteinLink.protein_accession != None,
+            query_column == accession,
-               TranscriptProteinLink.added >= link_datetime),
+            or_(
-          and_(TranscriptProteinLink.protein_accession == None,
+                and_(other_column.isnot(None),
-               TranscriptProteinLink.added >= negative_link_datetime))) \
+                     TranscriptProteinLink.added >= link_datetime),
+                and_(other_column.is_(None),
+                     TranscriptProteinLink.added >= negative_link_datetime))
+        ) \
        .first()
-def update_transcript_protein_link(transcript_accession,
+def update_transcript_protein_link(transcript_accession=None,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if it
    doesn't exist yet.
+    :arg str transcript_accession: Transcript accession number, or `None` to
+      record that no transcript is linked to `protein_accession`.
+    :arg str protein_accession: Protein accession number, or `None` to
+      record that no protein is linked to `transcript_accession`.
+    :raises ValueError: If both accession numbers are `None`.
    """
-    link = TranscriptProteinLink.query \
+    if transcript_accession is None and protein_accession is None:
-        .filter_by(transcript_accession=transcript_accession) \
+        raise ValueError('Link must have a transcript or protein')
-        .first()
+    # Filter clauses to find links for either of the given accession numbers.
+    clauses = []
+    if transcript_accession is not None:
+        clauses.append(TranscriptProteinLink.transcript_accession ==
+                       transcript_accession)
+    if protein_accession is not None:
+        clauses.append(TranscriptProteinLink.protein_accession ==
+                       protein_accession)
+    # Delete any related existing links. This is a bulk delete through the
+    # query, since `session.delete` only accepts a single mapped instance and
+    # not a query.
+    links = TranscriptProteinLink.query.filter(or_(*clauses))
+    links.delete()
+    session.commit()
-    if link is not None:
+    # There is a race condition here between deleting old links and adding the
-        link.protein_accession = protein_accession
+    # new one. It's extremely unlikely to go wrong, and we can safely ignore
-        link.added = datetime.now()
+    # it anyway.
-    else:
+    link = TranscriptProteinLink(transcript_accession, protein_accession)
-        link = TranscriptProteinLink(transcript_accession, protein_accession)
+    try:
        session.add(link)
+        session.commit()
-    session.commit()
+    except sqlalchemy.exc.IntegrityError:
+        session.rollback()
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ MySQL-python==1.2.5
 SQLAlchemy==0.9.8
 Sphinx==1.2.3
 Werkzeug==0.9.6
-alembic==0.6.7
+alembic==0.8.2
 biopython==1.64
 chardet==2.3.0
 cssselect==0.9.1