@@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY
   `Default value:` ``hg19``
-  Expiration time for cached transcript->protein links from the NCBI (in
+  Expiration time for cached transcript<->protein links from the NCBI (in
   `Default value:` `60 * 60 * 24 * 30` (30 days)
-  Expiration time for cached negative transcript->protein links from the NCBI
+  Expiration time for cached negative transcript<->protein links from the NCBI
   (in seconds).
   `Default value:` `60 * 60 * 24 * 5` (5 days)
+"""Transcript protein links have nullable transcript and unique protein
+Revision ID: 3492d2ee8884
+Revises: 4bafcc5086dd
+Create Date: 2015-09-25 15:11:45.562392
+from __future__ import unicode_literals
+# revision identifiers, used by Alembic.
+revision = '3492d2ee8884'
+down_revision = u'4bafcc5086dd'
+from alembic import op
+import sqlalchemy as sa
+# We are adding a unique constraint here, so actually we should first make
+# sure there are no duplicate entries. However, this is highly unlikely, so we
+# don't bother prepping the migration for that.
+# http://skien.cc/blog/2014/01/31/adding-unique-contraints-after-the-fact-in-sqlalchemy/
+# We want to be compatible with at least SQLite and PostgreSQL. This means
+# using `batch_alter_table` for operations yielding an ALTER TABLE statement.
+# However, we also have indices on the table and this causes problems with
+# `batch_alter_table` (it seems indices are copied too, under the same name,
+# which is invalid). To work around this, we wrap the batch operations in drop
+# and create statements for the indices.
+# http://alembic.readthedocs.org/en/latest/batch.html
+def upgrade():
+    """
+    Make `transcript_accession` nullable and recreate the indices on
+    `transcript_accession` and `protein_accession` as unique indices.
+    """
+    ### commands auto generated by Alembic - please adjust! ###
+    # Drop both indices up front: they get in the way of the table copy done
+    # by `batch_alter_table` (see the module-level comment).
+    op.drop_index('ix_transcript_protein_links_transcript_accession', table_name='transcript_protein_links')
+    op.drop_index('ix_transcript_protein_links_protein_accession', table_name='transcript_protein_links')
+    # Batch mode so the ALTER also works on SQLite.
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=True)
+    # Recreate the indices. Creating the unique index on `protein_accession`
+    # fails if the table contains duplicate protein accessions.
+    op.create_index(op.f('ix_transcript_protein_links_transcript_accession'), 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index(op.f('ix_transcript_protein_links_protein_accession'), 'transcript_protein_links', ['protein_accession'], unique=True)
+    ### end Alembic commands ###
+def downgrade():
+    """
+    Make `transcript_accession` non-nullable again and recreate the index on
+    `protein_accession` as a non-unique index.
+
+    Links without a transcript are deleted, since they cannot be represented
+    in the old schema.
+    """
+    ### commands auto generated by Alembic - please adjust! ###
+    # Rows with a NULL transcript would make restoring the NOT NULL constraint
+    # fail. The table only holds cached links, so we can safely discard them.
+    op.execute('DELETE FROM transcript_protein_links '
+               'WHERE transcript_accession IS NULL')
+    # Drop both indices up front: they get in the way of the table copy done
+    # by `batch_alter_table` (see the module-level comment).
+    op.drop_index(op.f('ix_transcript_protein_links_transcript_accession'), table_name='transcript_protein_links')
+    op.drop_index(op.f('ix_transcript_protein_links_protein_accession'), table_name='transcript_protein_links')
+    # Batch mode so the ALTER also works on SQLite.
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=False)
+    # Recreate the indices; `protein_accession` is no longer unique.
+    op.create_index('ix_transcript_protein_links_transcript_accession', 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index('ix_transcript_protein_links_protein_accession', 'transcript_protein_links', ['protein_accession'], unique=False)
+    ### end Alembic commands ###
@@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/'
 # Allow for this fraction of errors in batch jobs.
-# Expiration time for transcript->protein links from the NCBI (in seconds).
+# Expiration time for transcript<->protein links from the NCBI (in seconds).
 PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
-# Expiration time for negative transcript->protein links from the NCBI (in
+# Expiration time for negative transcript<->protein links from the NCBI (in
 # seconds).
@@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base):
     id = Column(Integer, primary_key=True)
     #: Accession number for the transcript, not including the version number
-    #: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``).
-    transcript_accession = Column(String(20), nullable=False, index=True,
+    #: (e.g., ``NM_018195``, ``XM_005270562``, ``NR_015380``). If `NULL`, the
+    #: record states that no transcript is linked to the protein.
+    transcript_accession = Column(String(20), nullable=True, index=True,
     #: Accession number for the protein, not including the version number
     #: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states
-    #: that no protein is linked to the transcript by the NCBI.
-    protein_accession = Column(String(20), index=True)
+    #: that no protein is linked to the transcript.
+    protein_accession = Column(String(20), nullable=True, index=True,
+                               unique=True)
     #: Date and time of creation.
     added = Column(DateTime)
-    def __init__(self, transcript_accession, protein_accession=None):
+    def __init__(self, transcript_accession=None, protein_accession=None):
+        """
+        Create a link between a transcript and a protein, timestamped with
+        the current date and time.
+
+        One of the two accession numbers may be `None`, but not both.
+
+        :arg str transcript_accession: Accession number for the transcript,
+          or `None`.
+        :arg str protein_accession: Accession number for the protein, or
+          `None`.
+
+        :raises ValueError: If both accession numbers are `None`.
+        """
+        if transcript_accession is None and protein_accession is None:
+            raise ValueError('Link must have a transcript or protein')
         self.transcript_accession = transcript_accession
         self.protein_accession = protein_accession
         self.added = datetime.now()
@@ -12,6 +12,7 @@ from __future__ import unicode_literals
 from datetime import datetime, timedelta
 from sqlalchemy import and_, or_
+import sqlalchemy.exc
 from mutalyzer.config import settings
 from mutalyzer.db import session
@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job):
     return item, flags
-def get_transcript_protein_link(transcript_accession):
+def get_transcript_protein_link(accession, reverse=False):
     Get a cached link between a transcript and a protein that is not expired
     according to the configuration settings `PROTEIN_LINK_EXPIRATION` and
+    :arg str accession: Accession number to lookup link for.
+    :arg bool reverse: If `True`, `accession` is assumed to be a protein
+      accession number, otherwise `accession` is assumed to be a transcript
+      accession number.
     Note that the link may be negative, i.e., the knowledge that no link
     exists can also be cached. In that case, the `protein_accession` field of
     the resulting `TranscriptProteinLink` object is `None`.
@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession):
     negative_link_datetime = datetime.now() - \
+    # Query column must have `accession`, other column has the value we're
+    # probably interested in.
+    query_column = TranscriptProteinLink.transcript_accession
+    other_column = TranscriptProteinLink.protein_accession
+    if reverse:
+        # Lookup by protein accession instead of transcript accession.
+        query_column, other_column = other_column, query_column
     return TranscriptProteinLink.query \
-        .filter_by(transcript_accession=transcript_accession) \
-        .filter(or_(
-          and_(TranscriptProteinLink.protein_accession != None,
-               TranscriptProteinLink.added >= link_datetime),
-          and_(TranscriptProteinLink.protein_accession == None,
-               TranscriptProteinLink.added >= negative_link_datetime))) \
+        .filter(
+            # Match on the lookup column only, so that `reverse=True` works.
+            query_column == accession,
+            or_(
+                and_(other_column.isnot(None),
+                     TranscriptProteinLink.added >= link_datetime),
+                and_(other_column.is_(None),
+                     TranscriptProteinLink.added >= negative_link_datetime))
+        ) \
-def update_transcript_protein_link(transcript_accession,
+def update_transcript_protein_link(transcript_accession=None,
     Update cached link between a transcript and a protein, or create it if it
     doesn't exist yet.
-    link = TranscriptProteinLink.query \
-        .filter_by(transcript_accession=transcript_accession) \
-        .first()
+    if transcript_accession is None and protein_accession is None:
+        raise ValueError('Link must have a transcript or protein')
+    # Filter clauses to find links for either of the given accession numbers.
+    clauses = []
+    if transcript_accession is not None:
+        clauses.append(TranscriptProteinLink.transcript_accession ==
+                       transcript_accession)
+    if protein_accession is not None:
+        clauses.append(TranscriptProteinLink.protein_accession ==
+                       protein_accession)
+    # Delete any related existing links. This must be a bulk `Query.delete()`:
+    # `session.delete()` only accepts a single mapped instance, not a query.
+    TranscriptProteinLink.query.filter(or_(*clauses)).delete()
+    session.commit()
-    if link is not None:
-        link.protein_accession = protein_accession
-        link.added = datetime.now()
-    else:
-        link = TranscriptProteinLink(transcript_accession, protein_accession)
+    # There is a race condition here between deleting old links and adding the
+    # new one. It's extremely unlikely to go wrong, and we can safely ignore
+    # it anyway.
+    link = TranscriptProteinLink(transcript_accession, protein_accession)
+    try:
-    session.commit()
+        session.commit()
+    except sqlalchemy.exc.IntegrityError:
+        session.rollback()