diff --git a/doc/config.rst b/doc/config.rst index bb7d27e4ee76bfdd687c33db0955961cb16ef83f..8d4192f0fd0777d778e37405099adf45336f9464 100644 --- a/doc/config.rst +++ b/doc/config.rst @@ -236,13 +236,13 @@ DEFAULT_ASSEMBLY `Default value:` ``hg19`` PROTEIN_LINK_EXPIRATION - Expiration time for cached transcript->protein links from the NCBI (in + Expiration time for cached transcript<->protein links from the NCBI (in seconds). `Default value:` `60 * 60 * 24 * 30` (30 days) NEGATIVE_PROTEIN_LINK_EXPIRATION - Expiration time for cached negative transcript->protein links from the NCBI + Expiration time for cached negative transcript<->protein links from the NCBI (in seconds). `Default value:` `60 * 60 * 24 * 5` (5 days) diff --git a/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py b/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py new file mode 100644 index 0000000000000000000000000000000000000000..607953558f4c5293a0a5118ebf78908956be8053 --- /dev/null +++ b/migrations/versions/3492d2ee8884_transcript_protein_links_have_nullable_.py @@ -0,0 +1,59 @@ +"""Transcript protein links have nullable transcript and unique protein + +Revision ID: 3492d2ee8884 +Revises: 4bafcc5086dd +Create Date: 2015-09-25 15:11:45.562392 + +""" + +from __future__ import unicode_literals + +# revision identifiers, used by Alembic. +revision = '3492d2ee8884' +down_revision = u'4bafcc5086dd' + +from alembic import op +import sqlalchemy as sa + + +# We are adding a unique constraint here, so actually we should first make +# sure there are no duplicate entries. However, this is highly unlikely, so we +# don't bother prepping the migration for that. +# +# http://skien.cc/blog/2014/01/31/adding-unique-contraints-after-the-fact-in-sqlalchemy/ + + +# We want to be compatible with at least SQLite and PostgreSQL. This means +# using `batch_alter_table` for operations yielding an ALTER TABLE statement. 
+# However, we also have indices on the table and this causes problems with
+# `batch_alter_table` (it seems indices are copied too, under the same name,
+# which is invalid). To work around this, we wrap the batch operations in drop
+# and create statements for the indices.
+#
+# http://alembic.readthedocs.org/en/latest/batch.html
+
+
+def upgrade():
+    # Make `transcript_accession` nullable and `protein_accession` unique, so
+    # links can also be cached by protein (including negative results).
+    op.drop_index('ix_transcript_protein_links_transcript_accession', table_name='transcript_protein_links')
+    op.drop_index('ix_transcript_protein_links_protein_accession', table_name='transcript_protein_links')
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=True)
+    op.create_index(op.f('ix_transcript_protein_links_transcript_accession'), 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index(op.f('ix_transcript_protein_links_protein_accession'), 'transcript_protein_links', ['protein_accession'], unique=True)
+
+
+def downgrade():
+    # Protein-only cache rows (NULL transcript) would violate NOT NULL below.
+    op.execute('DELETE FROM transcript_protein_links WHERE transcript_accession IS NULL')
+    op.drop_index(op.f('ix_transcript_protein_links_transcript_accession'), table_name='transcript_protein_links')
+    op.drop_index(op.f('ix_transcript_protein_links_protein_accession'), table_name='transcript_protein_links')
+    with op.batch_alter_table('transcript_protein_links') as batch_op:
+        batch_op.alter_column('transcript_accession',
+                              existing_type=sa.VARCHAR(length=20),
+                              nullable=False)
+    op.create_index('ix_transcript_protein_links_transcript_accession', 'transcript_protein_links', ['transcript_accession'], unique=True)
+    op.create_index('ix_transcript_protein_links_protein_accession', 'transcript_protein_links', ['protein_accession'], unique=False)
diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index f985e8ce1179987f07f9d63a71b13ef707090a64..be79de5c8f37a38f8b87c9e268a2bfb8d98d023a 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -65,10 +65,10 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/SCHEMA_1_7_ARCHIVE/' # Allow for this fraction of errors in batch jobs. BATCH_JOBS_ERROR_THRESHOLD = 0.05 -# Expiration time for transcript->protein links from the NCBI (in seconds). +# Expiration time for transcript<->protein links from the NCBI (in seconds). PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30 -# Expiration time for negative transcript->protein links from the NCBI (in +# Expiration time for negative transcript<->protein links from the NCBI (in # seconds). 
NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5 diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 87a87c88d31faa169a5a2d45df9e6bc8d52970a4..62877968e7fbdd1b5fb2d457f590efc44d01fe10 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -224,19 +224,23 @@ class TranscriptProteinLink(db.Base): id = Column(Integer, primary_key=True) #: Accession number for the transcript, not including the version number - #: (e.g., ``NM_018195`, ``XM_005270562``, ``NR_015380``). - transcript_accession = Column(String(20), nullable=False, index=True, + #: (e.g., ``NM_018195``, ``XM_005270562``, ``NR_015380``). If `NULL`, the + #: record states that no transcript is linked to the protein. + transcript_accession = Column(String(20), nullable=True, index=True, unique=True) #: Accession number for the protein, not including the version number #: (e.g., ``NP_060665``, ``XP_005258635``). If `NULL`, the record states - #: that no protein is linked to the transcript by the NCBI. - protein_accession = Column(String(20), index=True) + #: that no protein is linked to the transcript. + protein_accession = Column(String(20), nullable=True, index=True, + unique=True) #: Date and time of creation. 
added = Column(DateTime) - def __init__(self, transcript_accession, protein_accession=None): + def __init__(self, transcript_accession=None, protein_accession=None): + if transcript_accession is None and protein_accession is None: + raise ValueError('Link must have a transcript or protein') self.transcript_accession = transcript_accession self.protein_accession = protein_accession self.added = datetime.now()
diff --git a/mutalyzer/db/queries.py b/mutalyzer/db/queries.py
index 7c54d137fa19e5ff0b8459a3df305ec4241c9d2e..ab4e814e5718737c46fac971d066b65e6c5206cc 100644
--- a/mutalyzer/db/queries.py
+++ b/mutalyzer/db/queries.py
@@ -12,6 +12,7 @@ from __future__ import unicode_literals
 from datetime import datetime, timedelta
 
 from sqlalchemy import and_, or_
+import sqlalchemy.exc
 
 from mutalyzer.config import settings
 from mutalyzer.db import session
@@ -55,12 +56,17 @@ def pop_batch_queue_item(batch_job):
     return item, flags
 
 
-def get_transcript_protein_link(transcript_accession):
+def get_transcript_protein_link(accession, reverse=False):
     """
     Get a cached link between a transcript and a protein that is not expired
     according to the configuration settings `PROTEIN_LINK_EXPIRATION` and
     `NEGATIVE_PROTEIN_LINK_EXPIRATION`.
 
+    :arg str accession: Accession number to lookup link for.
+    :arg bool reverse: If `True`, `accession` is assumed to be a protein
+      accession number, otherwise `accession` is assumed to be a transcript
+      accession number.
+
     Note that the link may be negative, i.e., the knowledge that no link
     exists can also be cached. In that case, the `protein_accession` field of
     the resulting `TranscriptProteinLink` object is `None`.
@@ -72,31 +78,57 @@ def get_transcript_protein_link(transcript_accession):
     negative_link_datetime = datetime.now() - \
         timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION)
 
+    # Query column must have `accession`, other column has the value we're
+    # probably interested in.
+    query_column = TranscriptProteinLink.transcript_accession
+    other_column = TranscriptProteinLink.protein_accession
+
+    if reverse:
+        # Lookup by protein accession instead of transcript accession.
+        query_column, other_column = other_column, query_column
+
     return TranscriptProteinLink.query \
-        .filter_by(transcript_accession=transcript_accession) \
-        .filter(or_(
-          and_(TranscriptProteinLink.protein_accession != None,
-               TranscriptProteinLink.added >= link_datetime),
-          and_(TranscriptProteinLink.protein_accession == None,
-               TranscriptProteinLink.added >= negative_link_datetime))) \
+        .filter(
+            query_column == accession,
+            or_(
+                and_(other_column.isnot(None),
+                     TranscriptProteinLink.added >= link_datetime),
+                and_(other_column.is_(None),
+                     TranscriptProteinLink.added >= negative_link_datetime))
+        ) \
         .first()
 
 
-def update_transcript_protein_link(transcript_accession,
+def update_transcript_protein_link(transcript_accession=None,
                                    protein_accession=None):
     """
     Update cached link between a transcript and a protein, or create it if it
     doesn't exist yet.
     """
-    link = TranscriptProteinLink.query \
-        .filter_by(transcript_accession=transcript_accession) \
-        .first()
+    if transcript_accession is None and protein_accession is None:
+        raise ValueError('Link must have a transcript or protein')
+
+    # Filter clauses to find links for either of the given accession numbers.
+    clauses = []
+    if transcript_accession is not None:
+        clauses.append(TranscriptProteinLink.transcript_accession ==
+                       transcript_accession)
+    if protein_accession is not None:
+        clauses.append(TranscriptProteinLink.protein_accession ==
+                       protein_accession)
+
+    # Delete any related existing links.
+    TranscriptProteinLink.query.filter(or_(*clauses)) \
+        .delete(synchronize_session=False)
+    session.commit()
 
-    if link is not None:
-        link.protein_accession = protein_accession
-        link.added = datetime.now()
-    else:
-        link = TranscriptProteinLink(transcript_accession, protein_accession)
+    # There is a race condition here between deleting old links and adding the
+    # new one. It's extremely unlikely to go wrong, and we can safely ignore
+    # it anyway.
+    link = TranscriptProteinLink(transcript_accession, protein_accession)
+    try:
         session.add(link)
-
-    session.commit()
+        session.commit()
+    except sqlalchemy.exc.IntegrityError:
+        # A concurrent writer stored a conflicting link first; keep theirs.
+        session.rollback()