Skip to content
Snippets Groups Projects
  • Vermaat's avatar
    473c732c
    Cache transcript protein links in Redis · 473c732c
    Vermaat authored
    Caching of transcript protein links received from the NCBI Entrez
    service is a typical use case for Redis. This implements this cache
    in Redis and removes all use of our original database table.
    
    An Alembic migration copies all existing links from the database to
    Redis. The original `TranscriptProteinLink` database table is not
    dropped. This will be done in a future migration to ensure running
    processes don't error and to provide a rollback scenario.
    
    We also remove the expiration of links (originally defaulting to 30
    days), since we don't expect them to ever change. Negative links
    (caching a 'not found' result from Entrez) *are* still expiring,
    but with a longer default of 30 days (was 5 days).
    
    The configuration setting for the latter was renamed, yielding the
    following changes in the default configuration settings.
    
    Removed default settings:
    
        # Expiration time for transcript<->protein links from the NCBI (in seconds).
        PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
    
        # Expiration time for negative transcript<->protein links from the NCBI (in
        # seconds).
        NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5
    
    Added default setting:
    
        # Cache expiration time for negative transcript<->protein links from the NCBI
        # (in seconds).
        NEGATIVE_LINK_CACHE_EXPIRATION = 60 * 60 * 24 * 30
    473c732c
    History
    Cache transcript protein links in Redis
    Vermaat authored
    Caching of transcript protein links received from the NCBI Entrez
    service is a typical use case for Redis. This implements this cache
    in Redis and removes all use of our original database table.
    
    An Alembic migration copies all existing links from the database to
    Redis. The original `TranscriptProteinLink` database table is not
    dropped. This will be done in a future migration to ensure running
    processes don't error and to provide a rollback scenario.
    
    We also remove the expiration of links (originally defaulting to 30
    days), since we don't expect them to ever change. Negative links
    (caching a 'not found' result from Entrez) *are* still expiring,
    but with a longer default of 30 days (was 5 days).
    
    The configuration setting for the latter was renamed, yielding the
    following changes in the default configuration settings.
    
    Removed default settings:
    
        # Expiration time for transcript<->protein links from the NCBI (in seconds).
        PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30
    
        # Expiration time for negative transcript<->protein links from the NCBI (in
        # seconds).
        NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5
    
    Added default setting:
    
        # Cache expiration time for negative transcript<->protein links from the NCBI
        # (in seconds).
        NEGATIVE_LINK_CACHE_EXPIRATION = 60 * 60 * 24 * 30
test_migrations.py 5.70 KiB
"""
Test database migrations.
"""


from __future__ import unicode_literals

from datetime import datetime

import alembic.autogenerate
import alembic.command
import alembic.config
from alembic.migration import MigrationContext
import sqlalchemy as sa
from sqlalchemy import create_engine, sql

from mutalyzer import db


def test_migrations(database_uri):
    """
    Run all migrations and assert the result is up to date with the model
    definitions.

    The first migration is run on its own to create the initial schema. Some
    content is then added (see `add_database_content`) so the later
    migrations have data to operate on, after which the remaining migrations
    are run up to `head`. Finally, the resulting schema is compared with the
    model metadata in `db.Base.metadata`; any reported difference fails the
    test.

    `database_uri` is the SQLAlchemy URI of the database to run against.
    """
    alembic_config = alembic.config.Config('migrations/alembic.ini')
    engine = create_engine(database_uri)

    # Dispose of the engine even when a migration or the final assertion
    # fails, so a failing run does not leave pooled connections behind.
    try:
        with engine.begin() as connection:
            # Share this connection (and its transaction) with the Alembic
            # commands below, see:
            # http://alembic.readthedocs.org/en/latest/cookbook.html#sharing-a-connection-with-a-series-of-migration-commands-and-environments
            alembic_config.attributes['connection'] = connection

            # Presumably 'sqlite://' is a fresh in-memory database with
            # nothing to drop; any other database gets its model tables
            # dropped so the migrations start from scratch.
            if database_uri != 'sqlite://':
                db.Base.metadata.drop_all(connection)

            # Create initial schema by running the first migration.
            alembic.command.upgrade(alembic_config, 'ea660b66f26')

            # Add some database content to run the migrations on.
            add_database_content(connection)

            # Run the remaining migrations.
            alembic.command.upgrade(alembic_config, 'head')

            # The migrated schema must not differ from the model definitions.
            context = MigrationContext.configure(connection)
            assert not alembic.autogenerate.compare_metadata(
                context, db.Base.metadata)
    finally:
        engine.dispose()


def add_database_content(connection):
    """
    Populate the database with a handful of rows for the migrations to act
    on.

    All statements are executed on `connection`. Rows are inserted into the
    `assemblies`, `chromosomes`, `transcript_mappings` and
    `transcript_protein_links` tables.
    """
    def insert_row(table, **values):
        # Insert a single row with the given column values.
        connection.execute(table.insert(), **values)

    def lookup_id(table, criterion):
        # Value of the `id` column of the first row matching `criterion`.
        query = table.select(criterion).with_only_columns([table.c.id])
        return connection.execute(query).fetchone()[0]

    # Lightweight table definitions follow. Only the tables and columns
    # needed below are declared, so this is not a complete mapping of the
    # schema.
    organelle_type = sa.Enum('nucleus', 'mitochondrion', name='organelle')
    reference_type = sa.Enum('refseq', 'lrg', name='reference_type')
    # NOTE(review): 'orentation' looks like a typo for 'orientation'. It is
    # kept as is, since the right name depends on the schema -- confirm.
    orientation_type = sa.Enum('forward', 'reverse', name='orentation')
    source_type = sa.Enum('ucsc', 'ncbi', 'reference', name='source')

    assembly_table = sql.table(
        'assemblies',
        sql.column('id', sa.Integer),
        sql.column('name', sa.String(30)),
        sql.column('alias', sa.String(10)),
        sql.column('taxonomy_id', sa.Integer),
        sql.column('taxonomy_common_name', sa.String(50)))

    chromosome_table = sql.table(
        'chromosomes',
        sql.column('id', sa.Integer),
        sql.column('assembly_id', sa.Integer),
        sql.column('name', sa.String(30)),
        sql.column('accession', sa.String(30)),
        sql.column('organelle', organelle_type))

    mapping_table = sql.table(
        'transcript_mappings',
        sql.column('chromosome_id', sa.Integer),
        sql.column('reference_type', reference_type),
        sql.column('accession', sa.String(20)),
        sql.column('gene', sa.String(30)),
        sql.column('transcript', sa.Integer),
        sql.column('orientation', orientation_type),
        sql.column('start', sa.Integer),
        sql.column('stop', sa.Integer),
        sql.column('exon_starts', sa.Text),
        sql.column('exon_stops', sa.Text),
        sql.column('select_transcript', sa.Boolean),
        sql.column('source', source_type))

    link_table = sql.table(
        'transcript_protein_links',
        sql.column('transcript_accession', sa.String(30)),
        sql.column('protein_accession', sa.String(30)),
        sql.column('added', sa.DateTime))

    # Common data: one assembly with one chromosome. Their ids are looked up
    # after insertion because later rows refer to them.
    insert_row(
        assembly_table,
        name='GRCh37',
        alias='hg19',
        taxonomy_id=9606,
        taxonomy_common_name='Homo sapiens')
    grch37_pk = lookup_id(assembly_table, assembly_table.c.alias == 'hg19')

    insert_row(
        chromosome_table,
        assembly_id=grch37_pk,
        name='chr1',
        accession='NC_000001.10',
        organelle='nucleus')
    chr1_pk = lookup_id(chromosome_table, chromosome_table.c.name == 'chr1')

    # Data for migration 402ff01b0d5d:
    # Fix GRCm38 chromosome accession number versions.
    insert_row(
        chromosome_table,
        assembly_id=grch37_pk,
        name='chr11',
        accession='NC_000077.60',
        organelle='nucleus')

    # Data for migration 2e062969eb54:
    # Rename GRCh36 assembly to NCBI36.
    insert_row(
        assembly_table,
        name='GRCh36',
        alias='hg18',
        taxonomy_id=9606,
        taxonomy_common_name='Homo sapiens')

    # Data for migration 4bafcc5086dd:
    # Fix zero-exon transcript mappings.
    # Two mappings that differ only in gene, position and exons; the second
    # one has empty exon lists.
    shared_mapping_values = dict(
        chromosome_id=chr1_pk,
        reference_type='refseq',
        accession='NC_001807',
        transcript=1,
        orientation='forward',
        select_transcript=True,
        source='ncbi')
    mapping_rows = [
        ('ATP6', 8528, 9208, '8528', '9208'),
        ('ATP8', 8367, 8573, '', ''),
    ]
    for gene, start, stop, exon_starts, exon_stops in mapping_rows:
        insert_row(
            mapping_table,
            gene=gene,
            start=start,
            stop=stop,
            exon_starts=exon_starts,
            exon_stops=exon_stops,
            **shared_mapping_values)

    # Data for migration 3492d2ee8884:
    # Transcript protein links have nullable transcript and unique protein.
    link_rows = [
        ('NM_052818', 'NP_438169'),
        ('NM_001079691', None),
    ]
    for transcript_accession, protein_accession in link_rows:
        insert_row(
            link_table,
            transcript_accession=transcript_accession,
            protein_accession=protein_accession,
            added=datetime.now())