Merge pull request #96 from mutalyzer/link-versions

Optionally include versions in transcript-protein links

Merge pull request #96 from mutalyzer/link-versions
03bc99a3 · Vermaat · b80ac8ec · 2d1771a5 · 03bc99a3 · 03bc99a3
Commit 03bc99a3 authored 9 years ago by Vermaat
--- a/mutalyzer/ncbi.py
+++ b/mutalyzer/ncbi.py
@@ -3,15 +3,14 @@ Communication with the NCBI.
 """


-import functools
-
 from Bio import Entrez

 from .config import settings
 from .redisclient import client as redis


-def _get_link(source_accession, source_db, target_db, match_link_name):
+def _get_link(source_accession, source_db, target_db, match_link_name,
+              source_version=None, match_version=True):
    """
    Retrieve a linked accession number from the NCBI.

@@ -22,88 +21,142 @@ def _get_link(source_accession, source_db, target_db, match_link_name):
    :arg function match_link_name: For each link found, this function is
      called with the link name (`str`) and it should return `True` iff the
      link is to be used.
-
-    :returns: Linked accession number (without version number) or `None` if no
-      link can be found.
-    :rtype: str
+    :arg int source_version: Optional version number for `source_accession`.
+    :arg bool match_version: If `False`, the link does not have to match
+      `source_version`.
+
+    :returns: Tuple of `(target_accession, target_version)` representing the
+      link target, or `None` if no link can be found. If `source_version` is
+      not specified or `match_version` is `False`, `target_version` can be
+      `None`.
+    :rtype: tuple(str, int)
    """
    Entrez.email = settings.EMAIL
-    handle = Entrez.esearch(db=source_db, term=source_accession)
+
+    # If we searched with a version but do not require a version match, we
+    # can retry without version if no result is found. Otherwise, we fail.
+    def fail_or_retry():
+        if source_version is None or match_version:
+            return None
+        return _get_link(source_accession, source_db, target_db,
+                         match_link_name, source_version=None,
+                         match_version=False)
+
+    if source_version is None:
+        source = source_accession
+    else:
+        source = '%s.%d' % (source_accession, source_version)
+
+    # Find source record.
+    handle = Entrez.esearch(db=source_db, term=source)
    try:
        result = Entrez.read(handle)
    except Entrez.Parser.ValidationError:
-        return None
+        return fail_or_retry()
    finally:
        handle.close()

    try:
        source_gi = unicode(result['IdList'][0])
    except IndexError:
-        return None
+        return fail_or_retry()

+    # Find link from source record to target record.
    handle = Entrez.elink(dbfrom=source_db, db=target_db, id=source_gi)
    try:
        result = Entrez.read(handle)
    except Entrez.Parser.ValidationError:
-        return None
+        return fail_or_retry()
    finally:
        handle.close()

    if not result[0]['LinkSetDb']:
-        return None
+        return fail_or_retry()

    for link in result[0]['LinkSetDb']:
        if match_link_name(unicode(link['LinkName'])):
            target_gi = unicode(link['Link'][0]['Id'])
            break
    else:
-        return None
+        return fail_or_retry()

+    # Get target record.
    handle = Entrez.efetch(
        db=target_db, id=target_gi, rettype='acc', retmode='text')
-    target_accession = unicode(handle.read()).split('.')[0]
+    target = unicode(handle.read()).strip().split('.')
    handle.close()
-    return target_accession

+    target_accession = target[0]
+    target_version = int(target[1]) if source_version is not None else None
+    return target_accession, target_version

-def cache_link(source, target):
-    """
-    Decorator to add caching to link retrieval.

-    :arg str source: Source database (used to construct cache key).
-    :arg str target: Target database (used to construct cache key).
+def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
+                     target_db, match_link_name, source_version=None,
+                     match_version=True):
    """
-    forward_key = 'ncbi:%s-to-%s:%%s' % (source, target)
-    reverse_key = 'ncbi:%s-to-%s:%%s' % (target, source)
+    Version of :func:`_get_link` with caching.

-    def cache_source_to_target(f):
-        @functools.wraps(f)
-        def cached_f(accession):
-            result = redis.get(forward_key % accession)
-            if result is not None:
-                # The empty string is a cached negative result, which we return as
-                # `None`.
-                return result or None
+    :arg str forward_key: Cache key format string for the forward direction.
+      The source term will be substituted in this template.
+    :arg str reverse_key: Cache key format string for the reverse direction.
+      The target term will be substituted in this template.

-            result = f(accession)
-
-            if result is None:
-                redis.setex(forward_key % accession,
-                            settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')
-                return None
+    The cache value for a negative result (no link found) is the empty string
+    and expires in `NEGATIVE_LINK_CACHE_EXPIRATION` seconds.
+    """
+    if source_version is not None:
+        # Query cache for link with version.
+        target = redis.get(forward_key %
+                           ('%s.%d' % (source_accession, source_version)))
+        if target == '':
+            return None
+        if target:
+            target_accession, target_version = target.split('.')
+            return target_accession, int(target_version)
+
+    if source_version is None or not match_version:
+        # Query cache for link without version.
+        target = redis.get(forward_key % source_accession)
+        if target == '':
+            return None
+        if target is not None:
+            return target, None
+
+    # Query NCBI service.
+    try:
+        target_accession, target_version = _get_link(
+            source_accession, source_db, target_db, match_link_name,
+            source_version=source_version, match_version=match_version)
+    except TypeError:
+        # No link was found.
+        if source_version is not None:
+            # Store a negative forward link with version.
+            redis.setex(forward_key %
+                        ('%s.%d' % (source_accession, source_version)),
+                        settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')
+        if source_version is None or not match_version:
+            # Store a negative forward link without version.
+            redis.setex(forward_key % source_accession,
+                        settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')
+        return None

-            # We store the resulting link in both directions.
-            redis.set(forward_key % accession, result)
-            redis.set(reverse_key % result, accession)
-            return result
+    # Store the link without version in both directions.
+    redis.set(forward_key % source_accession, target_accession)
+    redis.set(reverse_key % target_accession, source_accession)

-        return cached_f
+    if source_version is not None and target_version is not None:
+        # Store the link with version in both directions.
+        redis.set(forward_key % ('%s.%d' % (source_accession, source_version)),
+                  '%s.%d' % (target_accession, target_version))
+        redis.set(reverse_key % ('%s.%d' % (target_accession, target_version)),
+                  '%s.%d' % (source_accession, source_version))

-    return cache_source_to_target
+    return target_accession, target_version


-@cache_link('transcript', 'protein')
-def transcript_to_protein(transcript_accession):
+def transcript_to_protein(transcript_accession, transcript_version=None,
+                          match_version=True):
    """
    Try to find the protein linked to a transcript.

@@ -113,18 +166,26 @@ def transcript_to_protein(transcript_accession):

    :arg str transcript_accession: Accession number of the transcript for
      which we want to find the protein (without version number).
-
-    :returns: Accession number of a protein (without version number) or `None`
-      if no link can be found.
-    :rtype: str
+    :arg int transcript_version: Transcript version number. Please provide
+      this if available, also if it does not need to match. This will enrich
+      the cache.
+    :arg bool match_version: If `False`, the link does not have to match
+      `transcript_version`.
+
+    :returns: Tuple of `(protein_accession, protein_version)` representing the
+      linked protein, or `None` if no link can be found. If `match_version` is
+      `False` or no version is given, `protein_version` can be `None`.
+    :rtype: tuple(str, int)
    """
-    return _get_link(
+    return _get_link_cached(
+        'ncbi:transcript-to-protein:%s', 'ncbi:protein-to-transcript:%s',
        transcript_accession, 'nucleotide', 'protein',
-        lambda link: link in ('nuccore_protein', 'nuccore_protein_cds'))
+        lambda link: link in ('nuccore_protein', 'nuccore_protein_cds'),
+        source_version=transcript_version, match_version=match_version)


-@cache_link('protein', 'transcript')
-def protein_to_transcript(protein_accession):
+def protein_to_transcript(protein_accession, protein_version=None,
+                          match_version=True):
    """
    Try to find the transcript linked to a protein.

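@@ -134,11 +195,20 @@ def protein_to_transcript(protein_accession):

    :arg str protein_accession: Accession number of the protein for which we
      want to find the transcript (without version number).
+    :arg int protein_version: Protein version number. Please provide this if
+      available, also if it does not need to match. This will enrich the
+      cache.
+    :arg bool match_version: If `False`, the link does not have to match
+      `protein_version`.

-    :returns: Accession number of a transcript (without version number) or
-      `None` if no link can be found.
-    :rtype: str
+    :returns: Tuple of `(transcript_accession, transcript_version)`
+      representing the linked transcript, or `None` if no link can be found.
+      If `match_version` is `False` or no version is given,
+      `transcript_version` can be `None`.
+    :rtype: tuple(str, int)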
    """
-    return _get_link(
+    return _get_link_cached(
+        'ncbi:protein-to-transcript:%s', 'ncbi:transcript-to-protein:%s',
        protein_accession, 'protein', 'nucleotide',
-        lambda link: link == 'protein_nuccore_mrna')
+        lambda link: link == 'protein_nuccore_mrna',
+        source_version=protein_version, match_version=match_version)
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -220,8 +220,14 @@ class GBparser():
                    i.proteinLink = i.protein_id.split('.')[0]
            #if
            else :                # Tag an mRNA with the protein id too.
-                i.proteinLink = \
-                    ncbi.transcript_to_protein(i.transcript_id.split('.')[0])
+                accession, version = i.transcript_id.split('.')
+                protein = ncbi.transcript_to_protein(
+                    accession, int(version), match_version=False)
+                if protein is None:
+                    i.proteinLink = None
+                else:
+                    # We ignore the version.
+                    i.proteinLink = protein[0]
            i.positionList = self.__locationList2posList(i)
            i.location = self.__location2pos(i.location) #FIXME
            #if not i.positionList : # FIXME ???

--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -73,6 +73,35 @@ def available_references():
        return yaml.safe_load(f)


+def _add_links(settings, links):
+    """
+    Add transcript-protein links to the cache.
+    """
+    for transcript, protein in links:
+        if transcript is not None:
+            key = 'ncbi:transcript-to-protein:%s' % transcript
+            if protein is not None:
+                redis.set(key, protein)
+                if '.' in transcript:
+                    key = key.rsplit('.', 1)[0]
+                    redis.set(key, protein.rsplit('.', 1)[0])
+            else:
+                redis.setex(key,
+                            settings.NEGATIVE_LINK_CACHE_EXPIRATION,
+                            '')
+        if protein is not None:
+            key = 'ncbi:protein-to-transcript:%s' % protein
+            if transcript is not None:
+                redis.set(key, transcript)
+                if '.' in protein:
+                    key = key.rsplit('.', 1)[0]
+                    redis.set(key, transcript.rsplit('.', 1)[0])
+            else:
+                redis.setex(key,
+                            settings.NEGATIVE_LINK_CACHE_EXPIRATION,
+                            '')
+
+
 @pytest.fixture
 def references(request, settings, db, available_references):
    try:
@@ -99,23 +128,7 @@ def references(request, settings, db, available_references):
        references.append(Reference(
            accession, entry['checksum'], geninfo_identifier=geninfo_id))

-        for transcript_accession, protein_accession in entry.get('links', []):
-            if transcript_accession is not None:
-                key = 'ncbi:transcript-to-protein:%s' % transcript_accession
-                if protein_accession is not None:
-                    redis.set(key, protein_accession)
-                else:
-                    redis.setex(key,
-                                settings.NEGATIVE_LINK_CACHE_EXPIRATION,
-                                '')
-            if protein_accession is not None:
-                key = 'ncbi:protein-to-transcript:%s' % protein_accession
-                if transcript_accession is not None:
-                    redis.set(key, transcript_accession)
-                else:
-                    redis.setex(key,
-                                settings.NEGATIVE_LINK_CACHE_EXPIRATION,
-                                '')
+        _add_links(settings, entry.get('links', []))

    db.session.add_all(references)
    db.session.commit()
@@ -123,6 +136,17 @@ def references(request, settings, db, available_references):
    return references


+@pytest.fixture
+def links(request, settings, db, available_references):
+    try:
+        links = request.param
+    except AttributeError:
+        return []
+
+    _add_links(settings, links)
+    return links
+
+
 @pytest.fixture
 def hg19(db):
    """
@@ -573,3 +597,63 @@ def hg19_transcript_mappings(db, hg19):
        version=3))

    db.session.commit()
+
+
+def with_references(*references):
+    """
+    Convenience decorator for parameterizing tests with reference fixtures.
+
+    Allows us to write:
+
+        @with_references('NM_004006.1', 'NM_004006.2')
+        def test_references():
+            pass
+
+    Instead of:
+
+        @pytest.mark.usefixtures('references')
+        @pytest.mark.parametrize('references',
+                                 [['NM_004006.1', 'NM_004006.2']],
+                                 ids=['NM_004006.1,NM_004006.2'],
+                                 indirect=True)
+        def test_references():
+            pass
+
+    """
+    def test_with_references(test):
+        return pytest.mark.usefixtures('references')(
+            pytest.mark.parametrize('references', [references], indirect=True,
+                                    ids=[','.join(references)])(test))
+    return test_with_references
+
+
+def with_links(*links):
+    """
+    Convenience decorator for parameterizing tests with transcript-protein
+    link fixtures.
+
+    Allows us to write:
+
+        @with_links(('NM_018650', 'NP_061120'), ('NM_027221', None))
+        def test_links():
+            pass
+
+    Instead of:
+
+        @pytest.mark.usefixtures('links')
+        @pytest.mark.parametrize('links',
+                                 [[('NM_018650', 'NP_061120'),
+                                   ('NM_027221', None)]],
+                                 ids=['NM_018650/NP_061120,NM_027221/*'],
+                                 indirect=True)
+        def test_links():
+            pass
+
+    """
+    def test_with_links(test):
+        return pytest.mark.usefixtures('links')(
+            pytest.mark.parametrize(
+                'links', [links], indirect=True,
+                ids=[','.join('/'.join(a or '*' for a in l)
+                              for l in links)])(test))
+    return test_with_links
--- a/tests/test_ncbi.py
+++ b/tests/test_ncbi.py
@@ -5,33 +5,394 @@ Tests for the mutalyzer.ncbi module.

 from __future__ import unicode_literals

+import Bio.Entrez
 import pytest

 from mutalyzer import ncbi
+from mutalyzer.redisclient import client as redis

+from fixtures import with_links

-pytestmark = [
-    pytest.mark.usefixtures('references'),
-    pytest.mark.parametrize('references', [['MARK1']], indirect=True)
-]

+@pytest.fixture
+def entrez(request, monkeypatch):
+    """
+    Fixture monkey-patching the NCBI Entrez API to return transcript-protein
+    links defined in the fixture parameter.

-def test_transcript_to_protein():
+    The fixture is similar to the :func:`fixtures.links` fixture, but instead
+    of storing the links in the cache, the API is monkey-patched.
    """
-    Get protein for transcript.
+    try:
+        links = request.param
+    except AttributeError:
+        return []
+
+    # We need two-way lookup.
+    transcript_to_protein = dict(links)
+    protein_to_transcript = dict((p, t) for t, p in links)
+
+    # Store original methods which should be called as a fallback.
+    esearch = Bio.Entrez.esearch
+    elink = Bio.Entrez.elink
+    efetch = Bio.Entrez.efetch
+
+    # Intermediate Entrez result object which can be parsed with Entrez.read.
+    class EntrezResult(object):
+        def __init__(self, result):
+            self.result = result
+
+        def read(self):
+            return self.result
+
+        def close(self):
+            pass
+
+    def mock_esearch(db=None, term=None):
+        if ((db == 'nucleotide' and term in transcript_to_protein)
+                or (db == 'protein' and term in protein_to_transcript)):
+            return EntrezResult({
+                'Count': '1',
+                'RetMax': '1',
+                'IdList': [term],
+                'TranslationSet': [],
+                'RetStart': '0',
+                'QueryTranslation': ''
+            })
+        return esearch(db=db, term=term)
+
+    def mock_elink(dbfrom=None, db=None, id=None):
+        if dbfrom == 'nucleotide' and id in transcript_to_protein:
+            if transcript_to_protein[id] is None:
+                linkset = []
+            else:
+                linkset = [{'DbTo': 'protein',
+                            'Link': [{'Id': transcript_to_protein[id]}],
+                            'LinkName': 'nuccore_protein'}]
+            return EntrezResult([{
+                'LinkSetDb': linkset,
+                'DbFrom': 'nuccore',
+                'IdList': [id],
+                'LinkSetDbHistory': [],
+                'ERROR': []
+            }])
+        if dbfrom == 'protein' and id in protein_to_transcript:
+            if protein_to_transcript[id] is None:
+                linkset = []
+            else:
+                linkset = [{'DbTo': 'nuccore',
+                            'Link': [{'Id': '568815587'},
+                                     {'Id': '528476600'},
+                                     {'Id': '568815270'},
+                                     {'Id': '528474155'},
+                                     {'Id': '452415518'},
+                                     {'Id': '452405284'},
+                                     {'Id': '383209650'}],
+                            'LinkName': 'protein_nuccore'},
+                           {'DbTo': 'nuccore',
+                            'Link': [{'Id': '4506864'}],
+                            'LinkName': 'protein_nuccore_cds'},
+                           {'DbTo': 'nuccore',
+                            'Link': [{'Id': '48735311'},
+                                     {'Id': '48734961'},
+                                     {'Id': '47682402'},
+                                     {'Id': '18490203'},
+                                     {'Id': '16359050'},
+                                     {'Id': '16306997'},
+                                     {'Id': '15929518'},
+                                     {'Id': '15214938'},
+                                     {'Id': '13528941'}],
+                            'LinkName': 'protein_nuccore_mgc_refseq'},
+                           {'DbTo': 'nuccore',
+                            'Link': [{'Id': protein_to_transcript[id]}],
+                            'LinkName': 'protein_nuccore_mrna'}]
+            return EntrezResult([{
+                'LinkSetDb': linkset,
+                'DbFrom': 'protein',
+                'IdList': [id],
+                'LinkSetDbHistory': [],
+                'ERROR': []
+            }])
+        return elink(dbfrom=dbfrom, db=db, id=id)
+
+    def mock_efetch(db=None, id=None, rettype=None, retmode=None):
+        if ((db == 'nucleotide' and id in transcript_to_protein)
+                or (db == 'protein' and id in protein_to_transcript)):
+            if '.' not in id:
+                id += '.9999'
+            return EntrezResult(id + '\n')
+        return efetch(db=db, id=id, rettype=rettype, retmode=retmode)
+
+    def mock_read(result):
+        return result.read()
+
+    monkeypatch.setattr(Bio.Entrez, 'esearch', mock_esearch)
+    monkeypatch.setattr(Bio.Entrez, 'elink', mock_elink)
+    monkeypatch.setattr(Bio.Entrez, 'efetch', mock_efetch)
+    monkeypatch.setattr(Bio.Entrez, 'read', mock_read)
+    return links
+
+
+def with_entrez(*links):
+    """
+    Convenience decorator for parameterizing tests with transcript-protein
+    link fixtures in the Entrez API.
+
+    Similar to :func:`fixtures.with_links`.
    """
-    assert ncbi.transcript_to_protein('NM_018650') == 'NP_061120'
+    def test_with_entrez(test):
+        return pytest.mark.usefixtures('entrez')(
+            pytest.mark.parametrize(
+                'entrez', [links], indirect=True,
+                ids=[','.join('/'.join(a or '*' for a in l)
+                              for l in links)])(test))
+    return test_with_entrez


-def test_transcript_to_protein_negative():
+@with_entrez(('NM_11111.1', None),
+             ('NM_11111.2', 'NP_11111.2'),
+             ('NM_22222.2', None),
+             ('NM_22222.3', 'NP_22222.3'),
+             ('NM_33333.4', None),
+             ('NM_33333.5', 'NP_33333.5'),
+             ('NM_44444', None),
+             ('NM_44444.5', None),
+             ('NM_44444.6', None),
+             ('NM_55555', 'NP_55555'),
+             ('NM_55555.6', None),
+             ('NM_66666', 'NP_66666'),
+             ('NM_66666.6', 'NP_66666.6'),
+             ('NM_66666.7', 'NP_66666.7'),
+             ('NM_66666.8', None),
+             ('NM_77777', 'NP_77777'),
+             ('NM_77777.7', 'NP_77777.7'),
+             ('NM_77777.8', None),
+             ('NM_88888', None),
+             ('NM_88888.8', None),
+             ('NM_88888.9', 'NP_88888.9'))
+@with_links(('NM_11111', 'NP_11111'),
+            ('NM_22222', None),
+            ('NM_33333.3', 'NP_33333.3'),
+            ('NM_44444.4', None),
+            ('NM_55555.5', None),
+            ('NM_66666.6', None))
+@pytest.mark.parametrize('accession,version,match_version,expected', [
+    ('NM_11111', None, False, ('NP_11111', None)),
+    ('NM_11111', 1, False, ('NP_11111', None)),
+    ('NM_11111', 1, True, None),
+    ('NM_11111', 2, False, ('NP_11111', None)),
+    ('NM_11111', 2, True, ('NP_11111', 2)),
+    ('NM_22222', None, False, None),
+    ('NM_22222', 2, False, None),
+    ('NM_22222', 2, True, None),
+    ('NM_22222', 3, False, None),
+    ('NM_22222', 3, True, ('NP_22222', 3)),
+    ('NM_33333', None, False, ('NP_33333', None)),
+    ('NM_33333', 3, True, ('NP_33333', 3)),
+    ('NM_33333', 3, False, ('NP_33333', 3)),
+    ('NM_33333', 4, True, None),
+    ('NM_33333', 4, False, ('NP_33333', None)),
+    ('NM_33333', 5, True, ('NP_33333', 5)),
+    ('NM_33333', 5, False, ('NP_33333', None)),
+    ('NM_44444', None, False, None),
+    ('NM_44444', 4, True, None),
+    ('NM_44444', 4, False, None),
+    ('NM_44444', 5, True, None),
+    ('NM_44444', 5, False, None),
+    ('NM_44444', 6, True, None),
+    ('NM_44444', 6, False, None),
+    ('NM_55555', None, False, ('NP_55555', None)),
+    ('NM_55555', 5, True, None),
+    ('NM_55555', 5, False, None),
+    ('NM_55555', 6, True, None),
+    ('NM_55555', 6, False, ('NP_55555', None)),
+    ('NM_66666', None, False, ('NP_66666', None)),
+    ('NM_66666', 6, True, None),
+    ('NM_66666', 6, False, None),
+    ('NM_66666', 7, True, ('NP_66666', 7)),
+    ('NM_66666', 7, False, ('NP_66666', 7)),
+    ('NM_66666', 8, True, None),
+    ('NM_66666', 8, False, ('NP_66666', None)),
+    ('NM_77777', None, False, ('NP_77777', None)),
+    ('NM_77777', 7, False, ('NP_77777', 7)),
+    ('NM_77777', 7, True, ('NP_77777', 7)),
+    ('NM_77777', 8, False, ('NP_77777', None)),
+    ('NM_77777', 8, True, None),
+    ('NM_88888', None, False, None),
+    ('NM_88888', 8, False, None),
+    ('NM_88888', 8, True, None),
+    ('NM_88888', 9, False, ('NP_88888', 9)),
+    ('NM_88888', 9, True, ('NP_88888', 9))])
+def test_transcript_to_protein(accession, version, match_version, expected):
    """
-    Get no protein for transcript.
+    Get protein for transcript.
+
+    Both the Entrez API and our cache are fixed with a set of
+    transcript-protein links. This test is parametrized with a list of
+    arguments for the :func:`ncbi.transcript_to_protein` function and the
+    corresponding expected result.
    """
-    assert ncbi.transcript_to_protein('XM_005273133') is None
+    assert ncbi.transcript_to_protein(
+        accession, version, match_version) == expected


-def test_protein_to_transcript():
+@with_entrez((None, 'NP_11111.1'),
+             ('NM_11111.2', 'NP_11111.2'),
+             (None, 'NP_22222.2'),
+             ('NM_22222.3', 'NP_22222.3'),
+             (None, 'NP_33333.4'),
+             ('NM_33333.5', 'NP_33333.5'),
+             (None, 'NP_44444'),
+             (None, 'NP_44444.5'),
+             (None, 'NP_44444.6'),
+             ('NM_55555', 'NP_55555'),
+             (None, 'NP_55555.6'),
+             ('NM_66666', 'NP_66666'),
+             ('NM_66666.6', 'NP_66666.6'),
+             ('NM_66666.7', 'NP_66666.7'),
+             (None, 'NP_66666.8'),
+             ('NM_77777', 'NP_77777'),
+             ('NM_77777.7', 'NP_77777.7'),
+             (None, 'NP_77777.8'),
+             (None, 'NP_88888'),
+             (None, 'NP_88888.8'),
+             ('NM_88888.9', 'NP_88888.9'))
+@with_links(('NM_11111', 'NP_11111'),
+            (None, 'NP_22222'),
+            ('NM_33333.3', 'NP_33333.3'),
+            (None, 'NP_44444.4'),
+            (None, 'NP_55555.5'),
+            (None, 'NP_66666.6'))
+@pytest.mark.parametrize('accession,version,match_version,expected', [
+    ('NP_11111', None, False, ('NM_11111', None)),
+    ('NP_11111', 1, False, ('NM_11111', None)),
+    ('NP_11111', 1, True, None),
+    ('NP_11111', 2, False, ('NM_11111', None)),
+    ('NP_11111', 2, True, ('NM_11111', 2)),
+    ('NP_22222', None, False, None),
+    ('NP_22222', 2, False, None),
+    ('NP_22222', 2, True, None),
+    ('NP_22222', 3, False, None),
+    ('NP_22222', 3, True, ('NM_22222', 3)),
+    ('NP_33333', None, False, ('NM_33333', None)),
+    ('NP_33333', 3, True, ('NM_33333', 3)),
+    ('NP_33333', 3, False, ('NM_33333', 3)),
+    ('NP_33333', 4, True, None),
+    ('NP_33333', 4, False, ('NM_33333', None)),
+    ('NP_33333', 5, True, ('NM_33333', 5)),
+    ('NP_33333', 5, False, ('NM_33333', None)),
+    ('NP_44444', None, False, None),
+    ('NP_44444', 4, True, None),
+    ('NP_44444', 4, False, None),
+    ('NP_44444', 5, True, None),
+    ('NP_44444', 5, False, None),
+    ('NP_44444', 6, True, None),
+    ('NP_44444', 6, False, None),
+    ('NP_55555', None, False, ('NM_55555', None)),
+    ('NP_55555', 5, True, None),
+    ('NP_55555', 5, False, None),
+    ('NP_55555', 6, True, None),
+    ('NP_55555', 6, False, ('NM_55555', None)),
+    ('NP_66666', None, False, ('NM_66666', None)),
+    ('NP_66666', 6, True, None),
+    ('NP_66666', 6, False, None),
+    ('NP_66666', 7, True, ('NM_66666', 7)),
+    ('NP_66666', 7, False, ('NM_66666', 7)),
+    ('NP_66666', 8, True, None),
+    ('NP_66666', 8, False, ('NM_66666', None)),
+    ('NP_77777', None, False, ('NM_77777', None)),
+    ('NP_77777', 7, False, ('NM_77777', 7)),
+    ('NP_77777', 7, True, ('NM_77777', 7)),
+    ('NP_77777', 8, False, ('NM_77777', None)),
+    ('NP_77777', 8, True, None),
+    ('NP_88888', None, False, None),
+    ('NP_88888', 8, False, None),
+    ('NP_88888', 8, True, None),
+    ('NP_88888', 9, False, ('NM_88888', 9)),
+    ('NP_88888', 9, True, ('NM_88888', 9))])
+def test_protein_to_transcript(accession, version, match_version, expected):
    """
    Get transcript for protein.
+
+    Both the Entrez API and our cache are fixed with a set of
+    transcript-protein links. This test is parametrized with a list of
+    arguments for the :func:`ncbi.protein_to_transcript` function and the
+    corresponding expected result.
+
+    Fixtures and parameters of this test mirror those of the
+    `test_transcript_to_protein` test.
+    """
+    assert ncbi.protein_to_transcript(
+        accession, version, match_version) == expected
+
+
+@with_entrez(('NM_11111', None),
+             ('NM_22222', 'NP_22222'),
+             ('NM_33333', None),
+             ('NM_33333.3', None),
+             ('NM_44444', None),
+             ('NM_44444.4', 'NP_44444.4'))
+@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [
+    ('NM_11111', None, False, [('NM_11111', None)], []),
+    ('NM_22222', None, False,
+     [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]),
+    ('NM_33333', None, False, [('NM_33333', None)], []),
+    ('NM_33333', 3, False, [('NM_33333', None), ('NM_33333.3', None)], []),
+    ('NM_33333', 3, True, [('NM_33333.3', None)], []),
+    ('NM_44444', None, False, [('NM_44444', None)], []),
+    ('NM_44444', 4, False,
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]),
+    ('NM_44444', 4, True,
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])])
+def test_transcript_to_protein_cache(accession, version, match_version,
+                                     expected_forward, expected_reverse):
+    """
+    Get protein for transcript and check the resulting cache state.
    """
-    assert ncbi.protein_to_transcript('NP_061120') == 'NM_018650'
+    ncbi.transcript_to_protein(accession, version, match_version)
+
+    forward = [(key.split(':')[-1], redis.get(key) or None)
+               for key in redis.keys('ncbi:transcript-to-protein:*')]
+    assert sorted(forward) == sorted(expected_forward)
+
+    reverse = [(redis.get(key) or None, key.split(':')[-1])
+               for key in redis.keys('ncbi:protein-to-transcript:*')]
+    assert sorted(reverse) == sorted(expected_reverse)
+
+
+@with_entrez((None, 'NP_11111'),
+             ('NM_22222', 'NP_22222'),
+             (None, 'NP_33333'),
+             (None, 'NP_33333.3'),
+             (None, 'NP_44444'),
+             ('NM_44444.4', 'NP_44444.4'))
+@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [
+    ('NP_11111', None, False, [], [(None, 'NP_11111')]),
+    ('NP_22222', None, False,
+     [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]),
+    ('NP_33333', None, False, [], [(None, 'NP_33333')]),
+    ('NP_33333', 3, False, [], [(None, 'NP_33333'), (None, 'NP_33333.3')]),
+    ('NP_33333', 3, True, [], [(None, 'NP_33333.3')]),
+    ('NP_44444', None, False, [], [(None, 'NP_44444')]),
+    ('NP_44444', 4, False,
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]),
+    ('NP_44444', 4, True,
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
+     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])])
+def test_protein_to_transcript_cache(accession, version, match_version,
+                                     expected_forward, expected_reverse):
+    """
+    Get transcript for protein and check the resulting cache state.
+    """
+    ncbi.protein_to_transcript(accession, version, match_version)
+
+    forward = [(key.split(':')[-1], redis.get(key) or None)
+               for key in redis.keys('ncbi:transcript-to-protein:*')]
+    assert sorted(forward) == sorted(expected_forward)
+
+    reverse = [(redis.get(key) or None, key.split(':')[-1])
+               for key in redis.keys('ncbi:protein-to-transcript:*')]
+    assert sorted(reverse) == sorted(expected_reverse)
--- a/tests/test_parsers_genbank.py
+++ b/tests/test_parsers_genbank.py
@@ -11,6 +11,8 @@ import pytest

 from mutalyzer.parsers.genbank import GBparser

+from fixtures import with_references
+

 @pytest.fixture
 def parser():
@@ -35,7 +37,7 @@ def test_product_lists_mismatch(parser, products, expected):
    assert parser._find_mismatch(products) == expected


-@pytest.mark.parametrize('references', [['A1BG']], indirect=True)
+@with_references('A1BG')
 def test_only_complete_genes_included(settings, references, parser):
    """
    Incomplete genes from the reference file should be ignored.

--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -19,6 +19,8 @@ from mutalyzer import File
 from mutalyzer import output
 from mutalyzer import Scheduler

+from fixtures import with_references
+

 pytestmark = pytest.mark.usefixtures('db')

@@ -85,9 +87,7 @@ def test_large_input():
    _batch_job_plain_text(variants, expected, 'syntax-checker')


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['AB026906.1', 'NM_000059.3']],
-                         indirect=True)
+@with_references('AB026906.1', 'NM_000059.3')
 def test_name_checker():
    """
    Simple name checker batch job.
@@ -212,8 +212,7 @@ def test_name_checker_altered():
        _batch_job_plain_text(variants, expected, 'name-checker')


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_000059.3']], indirect=True)
+@with_references('NM_000059.3')
 def test_name_checker_skipped():
    """
    Name checker job with skipped entries.

--- a/tests/test_services_soap.py
+++ b/tests/test_services_soap.py
@@ -20,6 +20,8 @@ import mutalyzer
 from mutalyzer.services.soap import application
 from mutalyzer import Scheduler

+from fixtures import with_references
+

 @pytest.fixture
 def server():
@@ -188,8 +190,7 @@ def test_gettranscriptsbygenename_invalid(api):
    assert not r


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['AF230870.1']], indirect=True)
+@with_references('AF230870.1')
 def test_gettranscriptsandinfo_valid(api):
    """
    Running getTranscriptsAndInfo with a valid genomic reference should
@@ -203,8 +204,7 @@ def test_gettranscriptsandinfo_valid(api):
        assert t in names


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['AL449423.14']], indirect=True)
+@with_references('AL449423.14')
 def test_gettranscriptsandinfo_restricted_valid(api):
    """
    Running getTranscriptsAndInfo with a valid genomic reference and a
@@ -332,9 +332,7 @@ def test_info(api):
    assert r.version == mutalyzer.__version__


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize(
-    'references', [['AB026906.1', 'AL449423.14', 'NM_003002.2']], indirect=True)
+@with_references('AB026906.1', 'AL449423.14', 'NM_003002.2')
 def test_getcache(output, api):
    """
    Running the getCache method should give us the expected number of
@@ -393,8 +391,7 @@ def test_gettranscripts_with_versions(api):
        assert t in r.string


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_003002.2']], indirect=True)
+@with_references('NM_003002.2')
 def test_runmutalyzer(api):
    """
    Just a runMutalyzer test.
@@ -432,8 +429,7 @@ def test_runmutalyzer_reference_info_nm(api):
    assert r.molecule == 'n'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_003002.2']], indirect=True)
+@with_references('NM_003002.2')
 def test_runmutalyzer_reference_info_nm_version(api):
    """
    Get reference info for an NM variant with version.
@@ -448,8 +444,7 @@ def test_runmutalyzer_reference_info_nm_version(api):
    assert r.molecule == 'n'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['LRG_1']], indirect=True)
+@with_references('LRG_1')
 def test_runmutalyzer_reference_info_lrg(api):
    """
    Get reference info for an LRG variant.
@@ -461,8 +456,7 @@ def test_runmutalyzer_reference_info_lrg(api):
    assert r.molecule == 'g'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_012772.1']], indirect=True)
+@with_references('NG_012772.1')
 def test_runmutalyzer_reference_info_ng(api):
    """
    Get reference info for an NG variant without version.
@@ -489,8 +483,7 @@ def test_runmutalyzer_reference_info_ng(api):
    assert r.molecule == 'g'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_009105.1']], indirect=True)
+@with_references('NG_009105.1')
 def test_runmutalyzer_reference_info_ng_version(api):
    """
    Get reference info for an NG variant with version.
@@ -505,8 +498,7 @@ def test_runmutalyzer_reference_info_ng_version(api):
    assert r.molecule == 'g'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_012772.1']], indirect=True)
+@with_references('NG_012772.1')
 def test_runmutalyzer_reference_info_gi(api):
    """
    Get reference info for a GI variant.
@@ -521,8 +513,7 @@ def test_runmutalyzer_reference_info_gi(api):
    assert r.molecule == 'g'


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_000143.3']], indirect=True)
+@with_references('NM_000143.3')
 def test_runmutalyzer_exons(api):
    """
    Exon table in runMutalyzer output.
@@ -544,10 +535,7 @@ def test_runmutalyzer_exons(api):
        assert (exon.gStart, exon.gStop, exon.cStart, exon.cStop) == expected_exon


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize(
-    'references', [['AB026906.1', 'NM_003002.2', 'AL449423.14']],
-    indirect=True)
+@with_references('AB026906.1', 'NM_003002.2', 'AL449423.14')
 def test_batchjob(api):
    """
    Submit a batch job.

--- a/tests/test_variantchecker.py
+++ b/tests/test_variantchecker.py
--- a/tests/test_website.py
+++ b/tests/test_website.py
@@ -18,6 +18,8 @@ from mutalyzer import announce, Scheduler
 from mutalyzer.db.models import BatchJob
 from mutalyzer.website import create_app

+from fixtures import with_references
+

 # TODO: Tests for /upload.

@@ -125,9 +127,7 @@ def test_description_extractor_raw_fastq(website):
    assert '[5_6insTT;17del;26A&gt;C;35dup]' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize(
-    'references', [['NM_004006.1', 'NM_004006.2']], indirect=True)
+@with_references('NM_004006.1', 'NM_004006.2')
 def test_description_extractor_refseq(website):
    """
    Submit two accession numbers to the variant description extractor.
@@ -247,8 +247,7 @@ def test_checksyntax_invalid(website):
    assert 'The &quot;^&quot; indicates the position where the error occurred' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_002001.2']], indirect=True)
+@with_references('NM_002001.2')
 def test_check_valid(website):
    """
    Submit the name checker form with a valid variant.
@@ -273,8 +272,7 @@ def test_check_invalid(website):
    assert 'The &quot;^&quot; indicates the position where the error occurred' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NP_064445.1']], indirect=True)
+@with_references('NP_064445.1')
 def test_check_protein_reference(website):
    """
    Submit the name checker form with a protein reference sequence (not
@@ -287,8 +285,7 @@ def test_check_protein_reference(website):
    assert 'Protein reference sequences are not supported' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_002001.2']], indirect=True)
+@with_references('NM_002001.2')
 def test_check_noninteractive(website):
    """
    Submit the name checker form non-interactively.
@@ -304,8 +301,7 @@ def test_check_noninteractive(website):
    assert 'Raw variant 1: deletion of 1' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_012772.1']], indirect=True)
+@with_references('NG_012772.1')
 def test_check_interactive_links(website):
    """
    Submitting interactively should have links to transcripts also
@@ -427,10 +423,7 @@ def _batch(website, job_type='name-checker', assembly_name_or_alias=None,
    return r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize(
-    'references', [['AB026906.1', 'NM_003002.2', 'AL449423.14']],
-    indirect=True)
+@with_references('AB026906.1', 'NM_003002.2', 'AL449423.14')
 def test_batch_namechecker(website):
    """
    Submit the batch name checker form.
@@ -610,8 +603,7 @@ def test_batch_syntaxchecker_oldstyle(website):
           header='Input\tStatus')


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['AB026906.1']], indirect=True)
+@with_references('AB026906.1')
 def test_batch_namechecker_restriction_sites(website):
    """
    Submit the batch name checker form and see if restriction site effects
@@ -703,8 +695,7 @@ def test_annotated_soap_api(website):
    assert 'Web Service: Mutalyzer' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_012337.1']], indirect=True)
+@with_references('NG_012337.1')
 def test_getgs(website):
    """
    Test the /getGS interface used by LOVD2.
@@ -721,8 +712,7 @@ def test_getgs(website):
    assert '<input' not in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_012337.1']], indirect=True)
+@with_references('NG_012337.1')
 def test_getgs_coding_multiple_transcripts(website):
    """
    Test the /getGS interface on a coding description and genomic
@@ -737,8 +727,7 @@ def test_getgs_coding_multiple_transcripts(website):
    assert 'description=NG_012337.1' in r.location


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NG_008939.1']], indirect=True)
+@with_references('NG_008939.1')
 def test_getgs_variant_error(website):
    """
    Test the /getGS interface on a variant description with an error.
@@ -861,8 +850,7 @@ def test_upload_local_file_invalid(website):
    assert 'The file could not be parsed.' in r.data


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_002001.2']], indirect=True)
+@with_references('NM_002001.2')
 def test_reference(website):
    """
    Test if reference files are cached.
@@ -878,8 +866,7 @@ def test_reference(website):
    assert r.data == bz2.BZ2File(path).read()


-@pytest.mark.usefixtures('references')
-@pytest.mark.parametrize('references', [['NM_002001.2']], indirect=True)
+@with_references('NM_002001.2')
 def test_reference_head(website):
    """
    Test if reference files are cached, by issuing a HEAD request.
@@ -901,8 +888,8 @@ def test_reference_head_none(website):
    assert r.status_code == 404


-@pytest.mark.usefixtures('references', 'hg19_transcript_mappings')
-@pytest.mark.parametrize('references', [['NM_003002.2']], indirect=True)
+@pytest.mark.usefixtures('hg19_transcript_mappings')
+@with_references('NM_003002.2')
 def test_bed(website):
    """
    BED track for variant.
@@ -913,8 +900,8 @@ def test_bed(website):
    assert '\t'.join(['chr11', '111959694', '111959695', '274G>T', '0', '+']) in r.data


-@pytest.mark.usefixtures('references', 'hg19_transcript_mappings')
-@pytest.mark.parametrize('references', [['NM_000132.3']], indirect=True)
+@pytest.mark.usefixtures('hg19_transcript_mappings')
+@with_references('NM_000132.3')
 def test_bed_reverse(website):
    """
    BED track for variant on reverse strand.