Skip to content
Snippets Groups Projects
Commit a98f48f1 authored by Vermaat
Browse files

Merge pull request #99 from mutalyzer/refactor-links

Refactor transcript-protein links to raise NoLinkError instead of None
parents f4b7d13e 0f6cafe0
No related branches found
No related tags found
No related merge requests found
......@@ -9,26 +9,43 @@ from .config import settings
from .redisclient import client as redis
def _get_link(source_accession, source_db, target_db, match_link_name,
source_version=None, match_version=True):
class _NegativeLinkError(Exception):
    """
    Signals that the cache holds a negative transcript-protein link.

    A negative link is a cached "no link exists" result. This exception is
    private to the module: :func:`_get_link` catches it and reports the
    situation to callers as :class:`NoLinkError`.
    """
class NoLinkError(Exception):
    """
    Signals that no transcript-protein link could be found.

    Raised by the link lookup functions in this module instead of returning
    `None`, so callers handle the "no link" case with `except NoLinkError`.
    """
def _get_link_from_ncbi(source_db, target_db, match_link_name,
source_accession, source_version=None,
match_version=True):
"""
Retrieve a linked accession number from the NCBI.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg str source_db: NCBI source database.
:arg str target_db: NCBI target database.
:arg function match_link_name: For each link found, this function is
called with the link name (`str`) and it should return `True` iff the
link is to be used.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg int source_version: Optional version number for `source_accession`.
:arg bool match_version: If `False`, the link does not have to match
`source_version`.
:raises NoLinkError: If no link could be retrieved from the NCBI.
:returns: Tuple of `(target_accession, target_version)` representing the
link target, or `None` if no link can be found. If `source_version` is
not specified or `match_version` is `False`, `target_version` can be
`None`.
link target. If `source_version` is not specified or `match_version` is
`False`, `target_version` can be `None`.
:rtype: tuple(str, int)
"""
Entrez.email = settings.EMAIL
......@@ -37,10 +54,10 @@ def _get_link(source_accession, source_db, target_db, match_link_name,
# no result is found. Otherwise, we just report failure.
def fail_or_retry():
if source_version is None or match_version:
return None
return _get_link(source_accession, source_db, target_db,
match_link_name, source_version=None,
match_version=False)
raise NoLinkError()
return _get_link_from_ncbi(source_db, target_db, match_link_name,
source_accession, source_version=None,
match_version=False)
if source_version is None:
source = source_accession
......@@ -91,26 +108,35 @@ def _get_link(source_accession, source_db, target_db, match_link_name,
return target_accession, target_version
def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
target_db, match_link_name, source_version=None,
match_version=True):
def _get_link_from_cache(forward_key, reverse_key, source_accession,
source_version=None, match_version=True):
"""
Version of :func:`_get_link` with caching.
Retrieve a linked accession number from the cache.
:arg str forward_key: Cache key format string for the forward direction.
The source term will be substituted in this template.
:arg str reverse_key: Cache key format string for the reverse direction.
The target term will be substituted in this template.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg int source_version: Optional version number for `source_accession`.
:arg bool match_version: If `False`, the link does not have to match
`source_version`.
The cache value for a negative result (no link found) is the empty string
and expires in `NEGATIVE_LINK_CACHE_EXPIRATION` seconds.
:raises _NegativeLinkError: If a negative link was found.
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(target_accession, target_version)` representing the
link target. If `source_version` is not specified or `match_version` is
`False`, `target_version` can be `None`.
:rtype: tuple(str, int)
"""
if source_version is not None:
# Query cache for link with version.
target = redis.get(forward_key %
('%s.%d' % (source_accession, source_version)))
if target == '':
return None
raise _NegativeLinkError()
if target:
target_accession, target_version = target.split('.')
return target_accession, int(target_version)
......@@ -119,28 +145,38 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
# Query cache for link without version.
target = redis.get(forward_key % source_accession)
if target == '':
return None
raise _NegativeLinkError()
if target is not None:
return target, None
# Query NCBI service.
try:
target_accession, target_version = _get_link(
source_accession, source_db, target_db, match_link_name,
source_version=source_version, match_version=match_version)
except TypeError:
# No link was found.
if source_version is not None:
# Store a negative forward link with version.
redis.setex(forward_key %
('%s.%d' % (source_accession, source_version)),
settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')
if source_version is None or not match_version:
# Store a negative forward link without version.
redis.setex(forward_key % source_accession,
settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')
return None
raise NoLinkError()
def _cache_negative_link(forward_key, source_accession, source_version=None,
                         match_version=True):
    """
    Record in the cache that no transcript-protein link was found.

    A negative link is stored as the empty string and expires after
    `NEGATIVE_LINK_CACHE_EXPIRATION` seconds.

    :arg str forward_key: Cache key format string for the forward direction.
      The source term will be substituted in this template.
    :arg str source_accession: Accession number for which no link was found
      (without version number).
    :arg int source_version: Optional version number for `source_accession`.
    :arg bool match_version: If `False`, the failed lookup did not have to
      match `source_version`.
    """
    def store_negative(source_term):
        # Empty string is the marker value for "no link found".
        redis.setex(forward_key % source_term,
                    settings.NEGATIVE_LINK_CACHE_EXPIRATION, '')

    has_version = source_version is not None

    if has_version:
        # The versioned source is known to have no link.
        store_negative('%s.%d' % (source_accession, source_version))

    if not (has_version and match_version):
        # The lookup was not tied to one specific version, so the negative
        # result also holds for the accession without version.
        store_negative(source_accession)
def _cache_link(forward_key, reverse_key, source_accession, target_accession,
source_version=None, target_version=None):
"""
Store a transcript-protein link in the cache.
"""
# Store the link without version in both directions.
redis.set(forward_key % source_accession, target_accession)
redis.set(reverse_key % target_accession, source_accession)
......@@ -152,6 +188,41 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
redis.set(reverse_key % ('%s.%d' % (target_accession, target_version)),
'%s.%d' % (source_accession, source_version))
def _get_link(forward_key, reverse_key, source_db, target_db, match_link_name,
              source_accession, source_version=None, match_version=True):
    """
    Retrieve a linked accession number, trying the cache before the NCBI.

    Wraps :func:`_get_link_from_ncbi` with the caching provided by
    :func:`_get_link_from_cache`, :func:`_cache_link` and
    :func:`_cache_negative_link`.

    :arg str forward_key: Cache key format string for the forward direction.
    :arg str reverse_key: Cache key format string for the reverse direction.
    :arg str source_db: NCBI source database.
    :arg str target_db: NCBI target database.
    :arg function match_link_name: Called with each link name found; should
      return `True` iff the link is to be used.
    :arg str source_accession: Accession number for which we want to find a
      link (without version number).
    :arg int source_version: Optional version number for `source_accession`.
    :arg bool match_version: If `False`, the link does not have to match
      `source_version`.

    :raises NoLinkError: If the cache holds a negative link, or if the NCBI
      yields no link either.

    :returns: Tuple of `(target_accession, target_version)` representing the
      link target.
    :rtype: tuple(str, int)
    """
    version_options = {'source_version': source_version,
                       'match_version': match_version}

    # Step 1: consult the cache.
    try:
        cached_link = _get_link_from_cache(
            forward_key, reverse_key, source_accession, **version_options)
    except _NegativeLinkError:
        # The cache explicitly records that there is no link.
        raise NoLinkError()
    except NoLinkError:
        # Nothing cached for this source; fall through to the NCBI query.
        pass
    else:
        return cached_link

    # Step 2: query the NCBI service.
    try:
        target_accession, target_version = _get_link_from_ncbi(
            source_db, target_db, match_link_name, source_accession,
            **version_options)
    except NoLinkError:
        # Remember the negative result, then let the exception propagate.
        _cache_negative_link(forward_key, source_accession, **version_options)
        raise

    # Step 3: remember the positive result and hand it back.
    _cache_link(forward_key, reverse_key, source_accession, target_accession,
                source_version=source_version, target_version=target_version)
    return target_accession, target_version
......@@ -172,16 +243,19 @@ def transcript_to_protein(transcript_accession, transcript_version=None,
:arg bool match_version: If `False`, the link does not have to match
`transcript_version`.
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(protein_accession, protein_version)` representing the
linked protein, or `None` if no link can be found. If `match_version` is
`False`, `protein_version` can be `None`. TODO: can or will?
linked protein. If `transcript_version` is not specified or
`match_version` is `False`, `protein_version` can be `None`.
:rtype: tuple(str, int)
"""
return _get_link_cached(
return _get_link(
'ncbi:transcript-to-protein:%s', 'ncbi:protein-to-transcript:%s',
transcript_accession, 'nucleotide', 'protein',
'nucleotide', 'protein',
lambda link: link in ('nuccore_protein', 'nuccore_protein_cds'),
source_version=transcript_version, match_version=match_version)
transcript_accession, source_version=transcript_version,
match_version=match_version)
def protein_to_transcript(protein_accession, protein_version=None,
......@@ -195,14 +269,22 @@ def protein_to_transcript(protein_accession, protein_version=None,
:arg str protein_accession: Accession number of the protein for which we
want to find the transcript (without version number).
TODO
:arg int protein_version: Protein version number. Please provide this if
available, also if it does not need to match. This will enrich the
cache.
:arg bool match_version: If `False`, the link does not have to match
`protein_version`.
:returns: Accession number of a transcript (without version number) or
`None` if no link can be found.
:rtype: str
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(transcript_accession, transcript_version)`
representing the linked transcript. If `protein_version` is not
specified or `match_version` is `False`, `transcript_version` can be
`None`.
:rtype: tuple(str, int)
"""
return _get_link_cached(
return _get_link(
'ncbi:protein-to-transcript:%s', 'ncbi:transcript-to-protein:%s',
protein_accession, 'protein', 'nucleotide',
lambda link: link == 'protein_nuccore_mrna',
source_version=protein_version, match_version=match_version)
'protein', 'nucleotide', lambda link: link == 'protein_nuccore_mrna',
protein_accession, source_version=protein_version,
match_version=match_version)
......@@ -221,13 +221,12 @@ class GBparser():
#if
else : # Tag an mRNA with the protein id too.
accession, version = i.transcript_id.split('.')
protein = ncbi.transcript_to_protein(
accession, int(version), match_version=False)
if protein is None:
i.proteinLink = None
else:
try:
# We ignore the version.
i.proteinLink = protein[0]
i.proteinLink = ncbi.transcript_to_protein(
accession, int(version), match_version=False)[0]
except ncbi.NoLinkError:
pass
i.positionList = self.__locationList2posList(i)
i.location = self.__location2pos(i.location) #FIXME
#if not i.positionList : # FIXME ???
......
......@@ -230,10 +230,15 @@ def test_transcript_to_protein(accession, version, match_version, expected):
Both the Entrez API and our cache are fixed with a set of
transcript-protein links. This test is parametrized with a list of
arguments for the :func:`ncbi.transcript_to_protein` function and the
corresponding expected result.
corresponding expected result (`None` if `NoLinkError` is expected to be
raised).
"""
assert ncbi.transcript_to_protein(
accession, version, match_version) == expected
if expected is None:
with pytest.raises(ncbi.NoLinkError):
ncbi.transcript_to_protein(accession, version, match_version)
else:
assert ncbi.transcript_to_protein(
accession, version, match_version) == expected
@with_entrez((None, 'NP_11111.1'),
......@@ -322,8 +327,12 @@ def test_protein_to_transcript(accession, version, match_version, expected):
Fixtures and parameters of this test mirror those of the
`test_transcript_to_protein` test.
"""
assert ncbi.protein_to_transcript(
accession, version, match_version) == expected
if expected is None:
with pytest.raises(ncbi.NoLinkError):
ncbi.protein_to_transcript(accession, version, match_version)
else:
assert ncbi.protein_to_transcript(
accession, version, match_version) == expected
@with_entrez(('NM_11111', None),
......@@ -351,7 +360,10 @@ def test_transcript_to_protein_cache(accession, version, match_version,
"""
Get protein for transcript and check the resulting cache state.
"""
ncbi.transcript_to_protein(accession, version, match_version)
try:
ncbi.transcript_to_protein(accession, version, match_version)
except ncbi.NoLinkError:
pass
forward = [(key.split(':')[-1], redis.get(key) or None)
for key in redis.keys('ncbi:transcript-to-protein:*')]
......@@ -387,7 +399,10 @@ def test_protein_to_transcript_cache(accession, version, match_version,
"""
Get transcript for protein and check the resulting cache state.
"""
ncbi.protein_to_transcript(accession, version, match_version)
try:
ncbi.protein_to_transcript(accession, version, match_version)
except ncbi.NoLinkError:
pass
forward = [(key.split(':')[-1], redis.get(key) or None)
for key in redis.keys('ncbi:transcript-to-protein:*')]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment