diff --git a/mutalyzer/ncbi.py b/mutalyzer/ncbi.py index 62c96c5b8cdf989e229d8d3544e67218d9774ade..c4fc77bcf1112089d54c453bca4faf31da7c0ba8 100644 --- a/mutalyzer/ncbi.py +++ b/mutalyzer/ncbi.py @@ -9,26 +9,43 @@ from .config import settings from .redisclient import client as redis -def _get_link(source_accession, source_db, target_db, match_link_name, - source_version=None, match_version=True): +class _NegativeLinkError(Exception): + """ + Raised when no transcript-protein link exists (used for cached negative + links). + """ + pass + + +class NoLinkError(Exception): + """ + Raised when no transcript-protein link can be found. + """ + pass + + +def _get_link_from_ncbi(source_db, target_db, match_link_name, + source_accession, source_version=None, + match_version=True): """ Retrieve a linked accession number from the NCBI. - :arg str source_accession: Accession number for which we want to find a - link (without version number). :arg str source_db: NCBI source database. :arg str target_db: NCBI target database. :arg function match_link_name: For each link found, this function is called with the link name (`str`) and it should return `True` iff the link is to be used. + :arg str source_accession: Accession number for which we want to find a + link (without version number). :arg int source_version: Optional version number for `source_accession`. :arg bool match_version: If `False`, the link does not have to match `source_version`. + :raises NoLinkError: If no link could be retrieved from the NCBI. + :returns: Tuple of `(target_accession, target_version)` representing the - link target, or `None` if no link can be found. If `source_version` is - not specified or `match_version` is `False`, `target_version` can be - `None`. + link target. If `source_version` is not specified or `match_version` is + `False`, `target_version` can be `None`. :rtype: tuple(str, int) """ Entrez.email = settings.EMAIL @@ -37,10 +54,10 @@ def _get_link(source_accession, source_db, target_db, match_link_name, # no result is found. Otherwise, we just report failure. def fail_or_retry(): if source_version is None or match_version: - return None - return _get_link(source_accession, source_db, target_db, - match_link_name, source_version=None, - match_version=False) + raise NoLinkError() + return _get_link_from_ncbi(source_db, target_db, match_link_name, + source_accession, source_version=None, + match_version=False) if source_version is None: source = source_accession @@ -91,26 +108,35 @@ def _get_link(source_accession, source_db, target_db, match_link_name, return target_accession, target_version -def _get_link_cached(forward_key, reverse_key, source_accession, source_db, - target_db, match_link_name, source_version=None, - match_version=True): +def _get_link_from_cache(forward_key, reverse_key, source_accession, + source_version=None, match_version=True): """ - Version of :func:`_get_link` with caching. + Retrieve a linked accession number from the cache. :arg str forward_key: Cache key format string for the forward direction. The source term will be substituted in this template. :arg str reverse_key: Cache key format string for the reverse direction. The target term will be substituted in this template. + :arg str source_accession: Accession number for which we want to find a + link (without version number). + :arg int source_version: Optional version number for `source_accession`. + :arg bool match_version: If `False`, the link does not have to match + `source_version`. - The cache value for a negative result (no link found) is the empty string - and expires in `NEGATIVE_LINK_CACHE_EXPIRATION` seconds. + :raises _NegativeLinkError: If a negative link was found. + :raises NoLinkError: If no link could be found. + + :returns: Tuple of `(target_accession, target_version)` representing the + link target. If `source_version` is not specified or `match_version` is + `False`, `target_version` can be `None`. + :rtype: tuple(str, int) """ if source_version is not None: # Query cache for link with version. target = redis.get(forward_key % ('%s.%d' % (source_accession, source_version))) if target == '': - return None + raise _NegativeLinkError() if target: target_accession, target_version = target.split('.') return target_accession, int(target_version) @@ -119,28 +145,38 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db, # Query cache for link without version. target = redis.get(forward_key % source_accession) if target == '': - return None + raise _NegativeLinkError() if target is not None: return target, None - # Query NCBI service. - try: - target_accession, target_version = _get_link( - source_accession, source_db, target_db, match_link_name, - source_version=source_version, match_version=match_version) - except TypeError: - # No link was found. - if source_version is not None: - # Store a negative forward link with version. - redis.setex(forward_key % - ('%s.%d' % (source_accession, source_version)), - settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') - if source_version is None or not match_version: - # Store a negative forward link without version. - redis.setex(forward_key % source_accession, - settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') - return None + raise NoLinkError() + +def _cache_negative_link(forward_key, source_accession, source_version=None, + match_version=True): + """ + Store a negative transcript-protein link (a "no link found" result) in the + cache. + + The cache value for a negative link is the empty string and expires in + `NEGATIVE_LINK_CACHE_EXPIRATION` seconds. + """ + if source_version is not None: + # Store a negative forward link with version. + redis.setex(forward_key % + ('%s.%d' % (source_accession, source_version)), + settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') + if source_version is None or not match_version: + # Store a negative forward link without version. + redis.setex(forward_key % source_accession, + settings.NEGATIVE_LINK_CACHE_EXPIRATION, '') + + +def _cache_link(forward_key, reverse_key, source_accession, target_accession, + source_version=None, target_version=None): + """ + Store a transcript-protein link in the cache. + """ # Store the link without version in both directions. redis.set(forward_key % source_accession, target_accession) redis.set(reverse_key % target_accession, source_accession) @@ -152,6 +188,41 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db, redis.set(reverse_key % ('%s.%d' % (target_accession, target_version)), '%s.%d' % (source_accession, source_version)) + +def _get_link(forward_key, reverse_key, source_db, target_db, match_link_name, + source_accession, source_version=None, match_version=True): + """ + Combines :func:`_get_link_from_ncbi` with :func:`_get_link_from_cache` to + add caching to transcript-protein-link retrieval. + """ + try: + return _get_link_from_cache( + forward_key, reverse_key, source_accession, + source_version=source_version, match_version=match_version) + except _NegativeLinkError: + # If a negative link was in the cache, we report no link found. + raise NoLinkError() + except NoLinkError: + # If no link was in the cache, we continue by querying the NCBI. + pass + + # Query NCBI service. + try: + target_accession, target_version = _get_link_from_ncbi( + source_db, target_db, match_link_name, source_accession, + source_version=source_version, match_version=match_version) + except NoLinkError: + # No link found, store this negative result in the cache and re-raise + # the exception. + _cache_negative_link( + forward_key, source_accession, source_version=source_version, + match_version=match_version) + raise + + # Store the link in the cache and return the target value. + _cache_link( + forward_key, reverse_key, source_accession, target_accession, + source_version=source_version, target_version=target_version) return target_accession, target_version @@ -172,16 +243,19 @@ def transcript_to_protein(transcript_accession, transcript_version=None, :arg bool match_version: If `False`, the link does not have to match `transcript_version`. + :raises NoLinkError: If no link could be found. + :returns: Tuple of `(protein_accession, protein_version)` representing the - linked protein, or `None` if no link can be found. If `match_version` is - `False`, `protein_version` can be `None`. TODO: can or will? + linked protein. If `transcript_version` is not specified or + `match_version` is `False`, `protein_version` can be `None`. :rtype: tuple(str, int) """ - return _get_link_cached( + return _get_link( 'ncbi:transcript-to-protein:%s', 'ncbi:protein-to-transcript:%s', - transcript_accession, 'nucleotide', 'protein', + 'nucleotide', 'protein', lambda link: link in ('nuccore_protein', 'nuccore_protein_cds'), - source_version=transcript_version, match_version=match_version) + transcript_accession, source_version=transcript_version, + match_version=match_version) def protein_to_transcript(protein_accession, protein_version=None, @@ -195,14 +269,22 @@ def protein_to_transcript(protein_accession, protein_version=None, :arg str protein_accession: Accession number of the protein for which we want to find the transcript (without version number). - TODO + :arg int protein_version: Protein version number. Please provide this if + available, also if it does not need to match. This will enrich the + cache. + :arg bool match_version: If `False`, the link does not have to match + `protein_version`. - :returns: Accession number of a transcript (without version number) or - `None` if no link can be found. - :rtype: str + :raises NoLinkError: If no link could be found. + + :returns: Tuple of `(transcript_accession, transcript_version)` + representing the linked transcript. If `protein_version` is not + specified or `match_version` is `False`, `transcript_version` can be + `None`. + :rtype: tuple(str, int) """ - return _get_link_cached( + return _get_link( 'ncbi:protein-to-transcript:%s', 'ncbi:transcript-to-protein:%s', - protein_accession, 'protein', 'nucleotide', - lambda link: link == 'protein_nuccore_mrna', - source_version=protein_version, match_version=match_version) + 'protein', 'nucleotide', lambda link: link == 'protein_nuccore_mrna', + protein_accession, source_version=protein_version, + match_version=match_version) diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index be3badbdbbc0900910810b07060370bfeb8d27ac..b85dd5316fd060920e8231966bfa9db5c853b2cf 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -221,13 +221,12 @@ class GBparser(): #if else : # Tag an mRNA with the protein id too. accession, version = i.transcript_id.split('.') - protein = ncbi.transcript_to_protein( - accession, int(version), match_version=False) - if protein is None: - i.proteinLink = None - else: + try: # We ignore the version. - i.proteinLink = protein[0] + i.proteinLink = ncbi.transcript_to_protein( + accession, int(version), match_version=False)[0] + except ncbi.NoLinkError: + pass i.positionList = self.__locationList2posList(i) i.location = self.__location2pos(i.location) #FIXME #if not i.positionList : # FIXME ??? diff --git a/tests/test_ncbi.py b/tests/test_ncbi.py index 2920bc420df661ef750f8d279179446a47802f86..f2a7e5122d6c13db2a867c122ec53865ea925880 100644 --- a/tests/test_ncbi.py +++ b/tests/test_ncbi.py @@ -230,10 +230,15 @@ def test_transcript_to_protein(accession, version, match_version, expected): Both the Entrez API and our cache are fixed with a set of transcript-protein links. This test is parametrized with a list of arguments for the :func:`ncbi.transcript_to_protein` function and the - corresponding expected result. + corresponding expected result (`None` if `NoLinkError` is expected to be + raised). """ - assert ncbi.transcript_to_protein( - accession, version, match_version) == expected + if expected is None: + with pytest.raises(ncbi.NoLinkError): + ncbi.transcript_to_protein(accession, version, match_version) + else: + assert ncbi.transcript_to_protein( + accession, version, match_version) == expected @with_entrez((None, 'NP_11111.1'), @@ -322,8 +327,12 @@ def test_protein_to_transcript(accession, version, match_version, expected): Fixtures and parameters of this test mirror those of the `test_transcript_to_protein` test. """ - assert ncbi.protein_to_transcript( - accession, version, match_version) == expected + if expected is None: + with pytest.raises(ncbi.NoLinkError): + ncbi.protein_to_transcript(accession, version, match_version) + else: + assert ncbi.protein_to_transcript( + accession, version, match_version) == expected @with_entrez(('NM_11111', None), @@ -351,7 +360,10 @@ def test_transcript_to_protein_cache(accession, version, match_version, """ Get protein for transcript and check the resulting cache state. """ - ncbi.transcript_to_protein(accession, version, match_version) + try: + ncbi.transcript_to_protein(accession, version, match_version) + except ncbi.NoLinkError: + pass forward = [(key.split(':')[-1], redis.get(key) or None) for key in redis.keys('ncbi:transcript-to-protein:*')] @@ -387,7 +399,10 @@ def test_protein_to_transcript_cache(accession, version, match_version, """ Get transcript for protein and check the resulting cache state. """ - ncbi.protein_to_transcript(accession, version, match_version) + try: + ncbi.protein_to_transcript(accession, version, match_version) + except ncbi.NoLinkError: + pass forward = [(key.split(':')[-1], redis.get(key) or None) for key in redis.keys('ncbi:transcript-to-protein:*')]