diff --git a/mutalyzer/ncbi.py b/mutalyzer/ncbi.py index f131088ae89c6515d80d04e2ba5b11ebee8786c8..62c96c5b8cdf989e229d8d3544e67218d9774ade 100644 --- a/mutalyzer/ncbi.py +++ b/mutalyzer/ncbi.py @@ -3,15 +3,14 @@ Communication with the NCBI. """ -import functools - from Bio import Entrez from .config import settings from .redisclient import client as redis -def _get_link(source_accession, source_db, target_db, match_link_name): +def _get_link(source_accession, source_db, target_db, match_link_name, + source_version=None, match_version=True): """ Retrieve a linked accession number from the NCBI. @@ -22,88 +21,142 @@ def _get_link(source_accession, source_db, target_db, match_link_name): :arg function match_link_name: For each link found, this function is called with the link name (`str`) and it should return `True` iff the link is to be used. - - :returns: Linked accession number (without version number) or `None` if no - link can be found. - :rtype: str + :arg int source_version: Optional version number for `source_accession`. + :arg bool match_version: If `False`, the link does not have to match + `source_version`. + + :returns: Tuple of `(target_accession, target_version)` representing the + link target, or `None` if no link can be found. If `source_version` is + not specified or `match_version` is `False`, `target_version` can be + `None`. + :rtype: tuple(str, int) """ Entrez.email = settings.EMAIL - handle = Entrez.esearch(db=source_db, term=source_accession) + + # If we are currently strictly matching on version, we can try again if + # no result is found. Otherwise, we just report failure. + def fail_or_retry(): + if source_version is None or match_version: + return None + return _get_link(source_accession, source_db, target_db, + match_link_name, source_version=None, + match_version=False) + + if source_version is None: + source = source_accession + else: + source = '%s.%d' % (source_accession, source_version) + + # Find source record. + handle = Entrez.esearch(db=source_db, term=source) try: result = Entrez.read(handle) except Entrez.Parser.ValidationError: - return None + return fail_or_retry() finally: handle.close() try: source_gi = unicode(result['IdList'][0]) except IndexError: - return None + return fail_or_retry() + # Find link from source record to target record. handle = Entrez.elink(dbfrom=source_db, db=target_db, id=source_gi) try: result = Entrez.read(handle) except Entrez.Parser.ValidationError: - return None + return fail_or_retry() finally: handle.close() if not result[0]['LinkSetDb']: - return None + return fail_or_retry() for link in result[0]['LinkSetDb']: if match_link_name(unicode(link['LinkName'])): target_gi = unicode(link['Link'][0]['Id']) break else: - return None + return fail_or_retry() + # Get target record. handle = Entrez.efetch( db=target_db, id=target_gi, rettype='acc', retmode='text') - target_accession = unicode(handle.read()).split('.')[0] + target = unicode(handle.read()).strip().split('.') handle.close() - return target_accession + target_accession = target[0] + target_version = int(target[1]) if source_version is not None else None + return target_accession, target_version -def cache_link(source, target): - """ - Decorator to add caching to link retrieval. - :arg str source: Source database (used to construct cache key). - :arg str target: Target database (used to construct cache key). +def _get_link_cached(forward_key, reverse_key, source_accession, source_db, + target_db, match_link_name, source_version=None, + match_version=True): """ - forward_key = 'ncbi:%s-to-%s:%%s' % (source, target) - reverse_key = 'ncbi:%s-to-%s:%%s' % (target, source) + Version of :func:`_get_link` with caching. - def cache_source_to_target(f): - @functools.wraps(f) - def cached_f(accession): - result = redis.get(forward_key % accession) - if result is not None: - # The empty string is a cached negative result, which we return as - # `None`. - return result or None + :arg str forward_key: Cache key format string for the forward direction. + The source term will be substituted in this template. + :arg str reverse_key: Cache key format string for the reverse direction. + The target term will be substituted in this template. - result = f(accession) - - if result is None: - redis.setex(forward_key % accession, - settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') - return None + The cache value for a negative result (no link found) is the empty string + and expires in `NEGATIVE_LINK_CACHE_EXPIRATION` seconds. + """ + if source_version is not None: + # Query cache for link with version. + target = redis.get(forward_key % + ('%s.%d' % (source_accession, source_version))) + if target == '': + return None + if target: + target_accession, target_version = target.split('.') + return target_accession, int(target_version) + + if source_version is None or not match_version: + # Query cache for link without version. + target = redis.get(forward_key % source_accession) + if target == '': + return None + if target is not None: + return target, None + + # Query NCBI service. + try: + target_accession, target_version = _get_link( + source_accession, source_db, target_db, match_link_name, + source_version=source_version, match_version=match_version) + except TypeError: + # No link was found. + if source_version is not None: + # Store a negative forward link with version. + redis.setex(forward_key % + ('%s.%d' % (source_accession, source_version)), + settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') + if source_version is None or not match_version: + # Store a negative forward link without version. + redis.setex(forward_key % source_accession, + settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') + return None - # We store the resulting link in both directions. - redis.set(forward_key % accession, result) - redis.set(reverse_key % result, accession) - return result + # Store the link without version in both directions. + redis.set(forward_key % source_accession, target_accession) + redis.set(reverse_key % target_accession, source_accession) - return cached_f + if source_version is not None and target_version is not None: + # Store the link with version in both directions. + redis.set(forward_key % ('%s.%d' % (source_accession, source_version)), + '%s.%d' % (target_accession, target_version)) + redis.set(reverse_key % ('%s.%d' % (target_accession, target_version)), + '%s.%d' % (source_accession, source_version)) - return cache_source_to_target + return target_accession, target_version -@cache_link('transcript', 'protein') -def transcript_to_protein(transcript_accession): +def transcript_to_protein(transcript_accession, transcript_version=None, + match_version=True): """ Try to find the protein linked to a transcript. @@ -113,18 +166,26 @@ def transcript_to_protein(transcript_accession): :arg str transcript_accession: Accession number of the transcript for which we want to find the protein (without version number). - - :returns: Accession number of a protein (without version number) or `None` - if no link can be found. - :rtype: str + :arg int transcript_version: Transcript version number. Please provide + this if available, also if it does not need to match. This will enrich + the cache. + :arg bool match_version: If `False`, the link does not have to match + `transcript_version`. + + :returns: Tuple of `(protein_accession, protein_version)` representing the + linked protein, or `None` if no link can be found. If `match_version` is + `False`, `protein_version` can be `None`. TODO: can or will? + :rtype: tuple(str, int) """ - return _get_link( + return _get_link_cached( + 'ncbi:transcript-to-protein:%s', 'ncbi:protein-to-transcript:%s', transcript_accession, 'nucleotide', 'protein', - lambda link: link in ('nuccore_protein', 'nuccore_protein_cds')) + lambda link: link in ('nuccore_protein', 'nuccore_protein_cds'), + source_version=transcript_version, match_version=match_version) -@cache_link('protein', 'transcript') -def protein_to_transcript(protein_accession): +def protein_to_transcript(protein_accession, protein_version=None, + match_version=True): """ Try to find the transcript linked to a protein. @@ -134,11 +195,14 @@ def protein_to_transcript(protein_accession): :arg str protein_accession: Accession number of the protein for which we want to find the transcript (without version number). + TODO :returns: Accession number of a transcript (without version number) or `None` if no link can be found. :rtype: str """ - return _get_link( + return _get_link_cached( + 'ncbi:protein-to-transcript:%s', 'ncbi:transcript-to-protein:%s', protein_accession, 'protein', 'nucleotide', - lambda link: link == 'protein_nuccore_mrna') + lambda link: link == 'protein_nuccore_mrna', + source_version=protein_version, match_version=match_version) diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 5312a1b5996106223c44809e1952f2d6be5d56c9..be3badbdbbc0900910810b07060370bfeb8d27ac 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -220,8 +220,14 @@ class GBparser(): i.proteinLink = i.protein_id.split('.')[0] #if else : # Tag an mRNA with the protein id too. - i.proteinLink = \ - ncbi.transcript_to_protein(i.transcript_id.split('.')[0]) + accession, version = i.transcript_id.split('.') + protein = ncbi.transcript_to_protein( + accession, int(version), match_version=False) + if protein is None: + i.proteinLink = None + else: + # We ignore the version. + i.proteinLink = protein[0] i.positionList = self.__locationList2posList(i) i.location = self.__location2pos(i.location) #FIXME #if not i.positionList : # FIXME ??? diff --git a/tests/test_ncbi.py b/tests/test_ncbi.py index ca129d15f236b0eb15515f62c707566ae39804c9..2920bc420df661ef750f8d279179446a47802f86 100644 --- a/tests/test_ncbi.py +++ b/tests/test_ncbi.py @@ -5,30 +5,394 @@ Tests for the mutalyzer.ncbi module. from __future__ import unicode_literals +import Bio.Entrez +import pytest + from mutalyzer import ncbi +from mutalyzer.redisclient import client as redis from fixtures import with_links -@with_links(('NM_018650', 'NP_061120')) -def test_transcript_to_protein(): +@pytest.fixture +def entrez(request, monkeypatch): """ - Get protein for transcript. + Fixture monkey-patching the NCBI Entrez API to return transcript-protein + links defined in the fixture parameter. + + The fixture is similar to the :func:`fixtures.links` fixture, but instead + of storing the links in the cache, the API is monkey-patched. + """ + try: + links = request.param + except AttributeError: + return [] + + # We need two-way lookup. + transcript_to_protein = dict(links) + protein_to_transcript = dict((p, t) for t, p in links) + + # Store original methods which should be called as a fallback. + esearch = Bio.Entrez.esearch + elink = Bio.Entrez.elink + efetch = Bio.Entrez.efetch + + # Intermediate Entrez result object which can be parsed with Entrez.read. + class EntrezResult(object): + def __init__(self, result): + self.result = result + + def read(self): + return self.result + + def close(self): + pass + + def mock_esearch(db=None, term=None): + if ((db == 'nucleotide' and term in transcript_to_protein) + or (db == 'protein' and term in protein_to_transcript)): + return EntrezResult({ + 'Count': '1', + 'RetMax': '1', + 'IdList': [term], + 'TranslationSet': [], + 'RetStart': '0', + 'QueryTranslation': '' + }) + return esearch(db=db, term=term) + + def mock_elink(dbfrom=None, db=None, id=None): + if dbfrom == 'nucleotide' and id in transcript_to_protein: + if transcript_to_protein[id] is None: + linkset = [] + else: + linkset = [{'DbTo': 'protein', + 'Link': [{'Id': transcript_to_protein[id]}], + 'LinkName': 'nuccore_protein'}] + return EntrezResult([{ + 'LinkSetDb': linkset, + 'DbFrom': 'nuccore', + 'IdList': [id], + 'LinkSetDbHistory': [], + 'ERROR': [] + }]) + if dbfrom == 'protein' and id in protein_to_transcript: + if protein_to_transcript[id] is None: + linkset = [] + else: + linkset = [{'DbTo': 'nuccore', + 'Link': [{'Id': '568815587'}, + {'Id': '528476600'}, + {'Id': '568815270'}, + {'Id': '528474155'}, + {'Id': '452415518'}, + {'Id': '452405284'}, + {'Id': '383209650'}], + 'LinkName': 'protein_nuccore'}, + {'DbTo': 'nuccore', + 'Link': [{'Id': '4506864'}], + 'LinkName': 'protein_nuccore_cds'}, + {'DbTo': 'nuccore', + 'Link': [{'Id': '48735311'}, + {'Id': '48734961'}, + {'Id': '47682402'}, + {'Id': '18490203'}, + {'Id': '16359050'}, + {'Id': '16306997'}, + {'Id': '15929518'}, + {'Id': '15214938'}, + {'Id': '13528941'}], + 'LinkName': 'protein_nuccore_mgc_refseq'}, + {'DbTo': 'nuccore', + 'Link': [{'Id': protein_to_transcript[id]}], + 'LinkName': 'protein_nuccore_mrna'}] + return EntrezResult([{ + 'LinkSetDb': linkset, + 'DbFrom': 'protein', + 'IdList': [id], + 'LinkSetDbHistory': [], + 'ERROR': [] + }]) + return elink(dbfrom=dbfrom, db=db, id=id) + + def mock_efetch(db=None, id=None, rettype=None, retmode=None): + if ((db == 'nucleotide' and id in transcript_to_protein) + or (db == 'protein' and id in protein_to_transcript)): + if '.' not in id: + id += '.9999' + return EntrezResult(id + '\n') + return efetch(db=db, id=id, rettype=rettype, retmode=retmode) + + def mock_read(result): + return result.read() + + monkeypatch.setattr(Bio.Entrez, 'esearch', mock_esearch) + monkeypatch.setattr(Bio.Entrez, 'elink', mock_elink) + monkeypatch.setattr(Bio.Entrez, 'efetch', mock_efetch) + monkeypatch.setattr(Bio.Entrez, 'read', mock_read) + return links + + +def with_entrez(*links): + """ + Convenience decorator for parameterizing tests with transcript-protein + link fixtures in the Entrez API. + + Similar to :func:`fixtures.with_links`. """ - assert ncbi.transcript_to_protein('NM_018650') == 'NP_061120' + def test_with_entrez(test): + return pytest.mark.usefixtures('entrez')( + pytest.mark.parametrize( + 'entrez', [links], indirect=True, + ids=[','.join('/'.join(a or '*' for a in l) + for l in links)])(test)) + return test_with_entrez -@with_links(('XM_005273133', None)) -def test_transcript_to_protein_negative(): +@with_entrez(('NM_11111.1', None), + ('NM_11111.2', 'NP_11111.2'), + ('NM_22222.2', None), + ('NM_22222.3', 'NP_22222.3'), + ('NM_33333.4', None), + ('NM_33333.5', 'NP_33333.5'), + ('NM_44444', None), + ('NM_44444.5', None), + ('NM_44444.6', None), + ('NM_55555', 'NP_55555'), + ('NM_55555.6', None), + ('NM_66666', 'NP_66666'), + ('NM_66666.6', 'NP_66666.6'), + ('NM_66666.7', 'NP_66666.7'), + ('NM_66666.8', None), + ('NM_77777', 'NP_77777'), + ('NM_77777.7', 'NP_77777.7'), + ('NM_77777.8', None), + ('NM_88888', None), + ('NM_88888.8', None), + ('NM_88888.9', 'NP_88888.9')) +@with_links(('NM_11111', 'NP_11111'), + ('NM_22222', None), + ('NM_33333.3', 'NP_33333.3'), + ('NM_44444.4', None), + ('NM_55555.5', None), + ('NM_66666.6', None)) +@pytest.mark.parametrize('accession,version,match_version,expected', [ + ('NM_11111', None, False, ('NP_11111', None)), + ('NM_11111', 1, False, ('NP_11111', None)), + ('NM_11111', 1, True, None), + ('NM_11111', 2, False, ('NP_11111', None)), + ('NM_11111', 2, True, ('NP_11111', 2)), + ('NM_22222', None, False, None), + ('NM_22222', 2, False, None), + ('NM_22222', 2, True, None), + ('NM_22222', 3, False, None), + ('NM_22222', 3, True, ('NP_22222', 3)), + ('NM_33333', None, False, ('NP_33333', None)), + ('NM_33333', 3, True, ('NP_33333', 3)), + ('NM_33333', 3, False, ('NP_33333', 3)), + ('NM_33333', 4, True, None), + ('NM_33333', 4, False, ('NP_33333', None)), + ('NM_33333', 5, True, ('NP_33333', 5)), + ('NM_33333', 5, False, ('NP_33333', None)), + ('NM_44444', None, False, None), + ('NM_44444', 4, True, None), + ('NM_44444', 4, False, None), + ('NM_44444', 5, True, None), + ('NM_44444', 5, False, None), + ('NM_44444', 6, True, None), + ('NM_44444', 6, False, None), + ('NM_55555', None, False, ('NP_55555', None)), + ('NM_55555', 5, True, None), + ('NM_55555', 5, False, None), + ('NM_55555', 6, True, None), + ('NM_55555', 6, False, ('NP_55555', None)), + ('NM_66666', None, False, ('NP_66666', None)), + ('NM_66666', 6, True, None), + ('NM_66666', 6, False, None), + ('NM_66666', 7, True, ('NP_66666', 7)), + ('NM_66666', 7, False, ('NP_66666', 7)), + ('NM_66666', 8, True, None), + ('NM_66666', 8, False, ('NP_66666', None)), + ('NM_77777', None, False, ('NP_77777', None)), + ('NM_77777', 7, False, ('NP_77777', 7)), + ('NM_77777', 7, True, ('NP_77777', 7)), + ('NM_77777', 8, False, ('NP_77777', None)), + ('NM_77777', 8, True, None), + ('NM_88888', None, False, None), + ('NM_88888', 8, False, None), + ('NM_88888', 8, True, None), + ('NM_88888', 9, False, ('NP_88888', 9)), + ('NM_88888', 9, True, ('NP_88888', 9))]) +def test_transcript_to_protein(accession, version, match_version, expected): """ - Get no protein for transcript. + Get protein for transcript. + + Both the Entrez API and our cache are fixed with a set of + transcript-protein links. This test is parametrized with a list of + arguments for the :func:`ncbi.transcript_to_protein` function and the + corresponding expected result. """ - assert ncbi.transcript_to_protein('XM_005273133') is None + assert ncbi.transcript_to_protein( + accession, version, match_version) == expected -@with_links(('NM_018650', 'NP_061120')) -def test_protein_to_transcript(): +@with_entrez((None, 'NP_11111.1'), + ('NM_11111.2', 'NP_11111.2'), + (None, 'NP_22222.2'), + ('NM_22222.3', 'NP_22222.3'), + (None, 'NP_33333.4'), + ('NM_33333.5', 'NP_33333.5'), + (None, 'NP_44444'), + (None, 'NP_44444.5'), + (None, 'NP_44444.6'), + ('NM_55555', 'NP_55555'), + (None, 'NP_55555.6'), + ('NM_66666', 'NP_66666'), + ('NM_66666.6', 'NP_66666.6'), + ('NM_66666.7', 'NP_66666.7'), + (None, 'NP_66666.8'), + ('NM_77777', 'NP_77777'), + ('NM_77777.7', 'NP_77777.7'), + (None, 'NP_77777.8'), + (None, 'NP_88888'), + (None, 'NP_88888.8'), + ('NM_88888.9', 'NP_88888.9')) +@with_links(('NM_11111', 'NP_11111'), + (None, 'NP_22222'), + ('NM_33333.3', 'NP_33333.3'), + (None, 'NP_44444.4'), + (None, 'NP_55555.5'), + (None, 'NP_66666.6')) +@pytest.mark.parametrize('accession,version,match_version,expected', [ + ('NP_11111', None, False, ('NM_11111', None)), + ('NP_11111', 1, False, ('NM_11111', None)), + ('NP_11111', 1, True, None), + ('NP_11111', 2, False, ('NM_11111', None)), + ('NP_11111', 2, True, ('NM_11111', 2)), + ('NP_22222', None, False, None), + ('NP_22222', 2, False, None), + ('NP_22222', 2, True, None), + ('NP_22222', 3, False, None), + ('NP_22222', 3, True, ('NM_22222', 3)), + ('NP_33333', None, False, ('NM_33333', None)), + ('NP_33333', 3, True, ('NM_33333', 3)), + ('NP_33333', 3, False, ('NM_33333', 3)), + ('NP_33333', 4, True, None), + ('NP_33333', 4, False, ('NM_33333', None)), + ('NP_33333', 5, True, ('NM_33333', 5)), + ('NP_33333', 5, False, ('NM_33333', None)), + ('NP_44444', None, False, None), + ('NP_44444', 4, True, None), + ('NP_44444', 4, False, None), + ('NP_44444', 5, True, None), + ('NP_44444', 5, False, None), + ('NP_44444', 6, True, None), + ('NP_44444', 6, False, None), + ('NP_55555', None, False, ('NM_55555', None)), + ('NP_55555', 5, True, None), + ('NP_55555', 5, False, None), + ('NP_55555', 6, True, None), + ('NP_55555', 6, False, ('NM_55555', None)), + ('NP_66666', None, False, ('NM_66666', None)), + ('NP_66666', 6, True, None), + ('NP_66666', 6, False, None), + ('NP_66666', 7, True, ('NM_66666', 7)), + ('NP_66666', 7, False, ('NM_66666', 7)), + ('NP_66666', 8, True, None), + ('NP_66666', 8, False, ('NM_66666', None)), + ('NP_77777', None, False, ('NM_77777', None)), + ('NP_77777', 7, False, ('NM_77777', 7)), + ('NP_77777', 7, True, ('NM_77777', 7)), + ('NP_77777', 8, False, ('NM_77777', None)), + ('NP_77777', 8, True, None), + ('NP_88888', None, False, None), + ('NP_88888', 8, False, None), + ('NP_88888', 8, True, None), + ('NP_88888', 9, False, ('NM_88888', 9)), + ('NP_88888', 9, True, ('NM_88888', 9))]) +def test_protein_to_transcript(accession, version, match_version, expected): """ Get transcript for protein. + + Both the Entrez API and our cache are fixed with a set of + transcript-protein links. This test is parametrized with a list of + arguments for the :func:`ncbi.transcript_to_protein` function and the + corresponding expected result. + + Fixtures and parameters of this test mirror those of the + `test_transcript_to_protein` test. + """ + assert ncbi.protein_to_transcript( + accession, version, match_version) == expected + + +@with_entrez(('NM_11111', None), + ('NM_22222', 'NP_22222'), + ('NM_33333', None), + ('NM_33333.3', None), + ('NM_44444', None), + ('NM_44444.4', 'NP_44444.4')) +@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [ + ('NM_11111', None, False, [('NM_11111', None)], []), + ('NM_22222', None, False, + [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]), + ('NM_33333', None, False, [('NM_33333', None)], []), + ('NM_33333', 3, False, [('NM_33333', None), ('NM_33333.3', None)], []), + ('NM_33333', 3, True, [('NM_33333.3', None)], []), + ('NM_44444', None, False, [('NM_44444', None)], []), + ('NM_44444', 4, False, + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')], + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]), + ('NM_44444', 4, True, + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')], + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])]) +def test_transcript_to_protein_cache(accession, version, match_version, + expected_forward, expected_reverse): + """ + Get protein for transcript and check the resulting cache state. """ - assert ncbi.protein_to_transcript('NP_061120') == 'NM_018650' + ncbi.transcript_to_protein(accession, version, match_version) + + forward = [(key.split(':')[-1], redis.get(key) or None) + for key in redis.keys('ncbi:transcript-to-protein:*')] + assert sorted(forward) == sorted(expected_forward) + + reverse = [(redis.get(key) or None, key.split(':')[-1]) + for key in redis.keys('ncbi:protein-to-transcript:*')] + assert sorted(reverse) == sorted(expected_reverse) + + +@with_entrez((None, 'NP_11111'), + ('NM_22222', 'NP_22222'), + (None, 'NP_33333'), + (None, 'NP_33333.3'), + (None, 'NP_44444'), + ('NM_44444.4', 'NP_44444.4')) +@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [ + ('NP_11111', None, False, [], [(None, 'NP_11111')]), + ('NP_22222', None, False, + [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]), + ('NP_33333', None, False, [], [(None, 'NP_33333')]), + ('NP_33333', 3, False, [], [(None, 'NP_33333'), (None, 'NP_33333.3')]), + ('NP_33333', 3, True, [], [(None, 'NP_33333.3')]), + ('NP_44444', None, False, [], [(None, 'NP_44444')]), + ('NP_44444', 4, False, + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')], + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]), + ('NP_44444', 4, True, + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')], + [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])]) +def test_protein_to_transcript_cache(accession, version, match_version, + expected_forward, expected_reverse): + """ + Get transcript for protein and check the resulting cache state. + """ + ncbi.protein_to_transcript(accession, version, match_version) + + forward = [(key.split(':')[-1], redis.get(key) or None) + for key in redis.keys('ncbi:transcript-to-protein:*')] + assert sorted(forward) == sorted(expected_forward) + + reverse = [(redis.get(key) or None, key.split(':')[-1]) + for key in redis.keys('ncbi:protein-to-transcript:*')] + assert sorted(reverse) == sorted(expected_reverse)