Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mutalyzer
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Analyze
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Mirrors
mutalyzer
Commits
0f6cafe0
Commit
0f6cafe0
authored
9 years ago
by
Vermaat
Browse files
Options
Downloads
Patches
Plain Diff
Refactor transcript-protein links to raise NoLinkError instead of None
parent
f4b7d13e
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
mutalyzer/ncbi.py
+131
-49
131 additions, 49 deletions
mutalyzer/ncbi.py
mutalyzer/parsers/genbank.py
+5
-6
5 additions, 6 deletions
mutalyzer/parsers/genbank.py
tests/test_ncbi.py
+22
-7
22 additions, 7 deletions
tests/test_ncbi.py
with
158 additions
and
62 deletions
mutalyzer/ncbi.py
+
131
−
49
View file @
0f6cafe0
...
...
@@ -9,26 +9,43 @@ from .config import settings
from
.redisclient
import
client
as
redis
def
_get_link
(
source_accession
,
source_db
,
target_db
,
match_link_name
,
source_version
=
None
,
match_version
=
True
):
class _NegativeLinkError(Exception):
    """
    Raised when no transcript-protein link exists (used for cached negative
    links).

    This is a module-private signal: :func:`_get_link_from_cache` raises it
    when the cache holds a negative entry (the empty string) for the requested
    source, and :func:`_get_link` translates it into the public
    :class:`NoLinkError` without querying the NCBI again.
    """
class NoLinkError(Exception):
    """
    Raised when no transcript-protein link can be found.

    Callers that previously checked for a `None` result should catch this
    exception instead.
    """
def
_get_link_from_ncbi
(
source_db
,
target_db
,
match_link_name
,
source_accession
,
source_version
=
None
,
match_version
=
True
):
"""
Retrieve a linked accession number from the NCBI.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg str source_db: NCBI source database.
:arg str target_db: NCBI target database.
:arg function match_link_name: For each link found, this function is
called with the link name (`str`) and it should return `True` iff the
link is to be used.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg int source_version: Optional version number for `source_accession`.
:arg bool match_version: If `False`, the link does not have to match
`source_version`.
:raises NoLinkError: If no link could be retrieved from the NCBI.
:returns: Tuple of `(target_accession, target_version)` representing the
link target, or `None` if no link can be found. If `source_version` is
not specified or `match_version` is `False`, `target_version` can be
`None`.
link target. If `source_version` is not specified or `match_version` is
`False`, `target_version` can be `None`.
:rtype: tuple(str, int)
"""
Entrez
.
email
=
settings
.
EMAIL
...
...
@@ -37,10 +54,10 @@ def _get_link(source_accession, source_db, target_db, match_link_name,
# no result is found. Otherwise, we just report failure.
def
fail_or_retry
():
if
source_version
is
None
or
match_version
:
r
eturn
None
return
_get_link
(
source_accession
,
source_db
,
target_db
,
match_link_name
,
source_version
=
None
,
match_version
=
False
)
r
aise
NoLinkError
()
return
_get_link
_from_ncbi
(
source_db
,
target_db
,
match_link_name
,
source_accession
,
source_version
=
None
,
match_version
=
False
)
if
source_version
is
None
:
source
=
source_accession
...
...
@@ -91,26 +108,35 @@ def _get_link(source_accession, source_db, target_db, match_link_name,
return
target_accession
,
target_version
def
_get_link_cached
(
forward_key
,
reverse_key
,
source_accession
,
source_db
,
target_db
,
match_link_name
,
source_version
=
None
,
match_version
=
True
):
def
_get_link_from_cache
(
forward_key
,
reverse_key
,
source_accession
,
source_version
=
None
,
match_version
=
True
):
"""
Version of :func:`_get_link` with caching.
Retrieve a linked accession number from the cache.
:arg str forward_key: Cache key format string for the forward direction.
The source term will be substituted in this template.
:arg str reverse_key: Cache key format string for the reverse direction.
The target term will be substituted in this template.
:arg str source_accession: Accession number for which we want to find a
link (without version number).
:arg int source_version: Optional version number for `source_accession`.
:arg bool match_version: If `False`, the link does not have to match
`source_version`.
The cache value for a negative result (no link found) is the empty string
and expires in `NEGATIVE_LINK_CACHE_EXPIRATION` seconds.
:raises _NegativeLinkError: If a negative link was found.
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(target_accession, target_version)` representing the
link target. If `source_version` is not specified or `match_version` is
`False`, `target_version` can be `None`.
:rtype: tuple(str, int)
"""
if
source_version
is
not
None
:
# Query cache for link with version.
target
=
redis
.
get
(
forward_key
%
(
'
%s.%d
'
%
(
source_accession
,
source_version
)))
if
target
==
''
:
r
eturn
None
r
aise
_NegativeLinkError
()
if
target
:
target_accession
,
target_version
=
target
.
split
(
'
.
'
)
return
target_accession
,
int
(
target_version
)
...
...
@@ -119,28 +145,38 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
# Query cache for link without version.
target
=
redis
.
get
(
forward_key
%
source_accession
)
if
target
==
''
:
r
eturn
None
r
aise
_NegativeLinkError
()
if
target
is
not
None
:
return
target
,
None
# Query NCBI service.
try
:
target_accession
,
target_version
=
_get_link
(
source_accession
,
source_db
,
target_db
,
match_link_name
,
source_version
=
source_version
,
match_version
=
match_version
)
except
TypeError
:
# No link was found.
if
source_version
is
not
None
:
# Store a negative forward link with version.
redis
.
setex
(
forward_key
%
(
'
%s.%d
'
%
(
source_accession
,
source_version
)),
settings
.
NEGATIVE_LINK_CACHE_EXPIRATION
,
''
)
if
source_version
is
None
or
not
match_version
:
# Store a negative forward link without version.
redis
.
setex
(
forward_key
%
source_accession
,
settings
.
NEGATIVE_LINK_CACHE_EXPIRATION
,
''
)
return
None
raise
NoLinkError
()
def _cache_negative_link(forward_key, source_accession, source_version=None,
                         match_version=True):
    """
    Store a negative transcript-protein link (a "no link found" result) in the
    cache.

    The cache value for a negative link is the empty string and expires in
    `NEGATIVE_LINK_CACHE_EXPIRATION` seconds.

    :arg str forward_key: Cache key format string for the forward direction.
      The source term will be substituted in this template.
    :arg str source_accession: Accession number for which no link was found
      (without version number).
    :arg int source_version: Optional version number for `source_accession`.
    :arg bool match_version: If `False`, the lookup did not have to match
      `source_version`, so the negative result is also stored for the
      accession without version.
    """
    if source_version is not None:
        # Store a negative forward link with version.
        versioned_source = '%s.%d' % (source_accession, source_version)
        redis.setex(forward_key % versioned_source,
                    settings.NEGATIVE_LINK_CACHE_EXPIRATION,
                    '')

    if source_version is None or not match_version:
        # Store a negative forward link without version.
        redis.setex(forward_key % source_accession,
                    settings.NEGATIVE_LINK_CACHE_EXPIRATION,
                    '')
def
_cache_link
(
forward_key
,
reverse_key
,
source_accession
,
target_accession
,
source_version
=
None
,
target_version
=
None
):
"""
Store a transcript-protein link in the cache.
"""
# Store the link without version in both directions.
redis
.
set
(
forward_key
%
source_accession
,
target_accession
)
redis
.
set
(
reverse_key
%
target_accession
,
source_accession
)
...
...
@@ -152,6 +188,41 @@ def _get_link_cached(forward_key, reverse_key, source_accession, source_db,
redis
.
set
(
reverse_key
%
(
'
%s.%d
'
%
(
target_accession
,
target_version
)),
'
%s.%d
'
%
(
source_accession
,
source_version
))
def _get_link(forward_key, reverse_key, source_db, target_db, match_link_name,
              source_accession, source_version=None, match_version=True):
    """
    Combines :func:`_get_link_from_ncbi` with :func:`_get_link_from_cache` to
    add caching to transcript-protein-link retrieval.

    The cache is consulted first. A cached negative link is reported as "no
    link" without contacting the NCBI; a cache miss falls through to the NCBI
    query, whose outcome (positive or negative) is then written to the cache.

    :arg str forward_key: Cache key format string for the forward direction.
      The source term will be substituted in this template.
    :arg str reverse_key: Cache key format string for the reverse direction.
      The target term will be substituted in this template.
    :arg str source_db: NCBI source database.
    :arg str target_db: NCBI target database.
    :arg function match_link_name: For each link found, this function is
      called with the link name (`str`) and it should return `True` iff the
      link is to be used.
    :arg str source_accession: Accession number for which we want to find a
      link (without version number).
    :arg int source_version: Optional version number for `source_accession`.
    :arg bool match_version: If `False`, the link does not have to match
      `source_version`.

    :raises NoLinkError: If no link could be found.

    :returns: Tuple of `(target_accession, target_version)` representing the
      link target. If `source_version` is not specified or `match_version` is
      `False`, `target_version` can be `None`.
    :rtype: tuple(str, int)
    """
    try:
        return _get_link_from_cache(
            forward_key, reverse_key, source_accession,
            source_version=source_version, match_version=match_version)
    except _NegativeLinkError:
        # If a negative link was in the cache, we report no link found.
        raise NoLinkError()
    except NoLinkError:
        # If no link was in the cache, we continue by querying the NCBI.
        pass

    # Query NCBI service.
    try:
        target_accession, target_version = _get_link_from_ncbi(
            source_db, target_db, match_link_name, source_accession,
            source_version=source_version, match_version=match_version)
    except NoLinkError:
        # No link found, store this negative result in the cache and re-raise
        # the exception.
        _cache_negative_link(
            forward_key, source_accession,
            source_version=source_version, match_version=match_version)
        raise

    # Store the link in the cache and return the target value.
    _cache_link(
        forward_key, reverse_key, source_accession, target_accession,
        source_version=source_version, target_version=target_version)
    return target_accession, target_version
...
...
@@ -172,16 +243,19 @@ def transcript_to_protein(transcript_accession, transcript_version=None,
:arg bool match_version: If `False`, the link does not have to match
`transcript_version`.
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(protein_accession, protein_version)` representing the
linked protein, or `None` if no link can be found. If `match_version` is
`False`, `protein_version` can be `None`.
TODO: can or will?
linked protein. If `transcript_version` is not specified or
`match_version` is `False`, `protein_version` can be `None`.
:rtype: tuple(str, int)
"""
return
_get_link
_cached
(
return
_get_link
(
'
ncbi:transcript-to-protein:%s
'
,
'
ncbi:protein-to-transcript:%s
'
,
transcript_accession
,
'
nucleotide
'
,
'
protein
'
,
'
nucleotide
'
,
'
protein
'
,
lambda
link
:
link
in
(
'
nuccore_protein
'
,
'
nuccore_protein_cds
'
),
source_version
=
transcript_version
,
match_version
=
match_version
)
transcript_accession
,
source_version
=
transcript_version
,
match_version
=
match_version
)
def
protein_to_transcript
(
protein_accession
,
protein_version
=
None
,
...
...
@@ -195,14 +269,22 @@ def protein_to_transcript(protein_accession, protein_version=None,
:arg str protein_accession: Accession number of the protein for which we
want to find the transcript (without version number).
TODO
:arg int protein_version: Protein version number. Please provide this if
available, also if it does not need to match. This will enrich the
cache.
:arg bool match_version: If `False`, the link does not have to match
`protein_version`.
:returns: Accession number of a transcript (without version number) or
`None` if no link can be found.
:rtype: str
:raises NoLinkError: If no link could be found.
:returns: Tuple of `(transcript_accession, transcript_version)`
representing the linked transcript. If `protein_version` is not
specified or `match_version` is `False`, `transcript_version` can be
`None`.
:rtype: tuple(str, int)
"""
return
_get_link
_cached
(
return
_get_link
(
'
ncbi:protein-to-transcript:%s
'
,
'
ncbi:transcript-to-protein:%s
'
,
protein
_accession
,
'
protein
'
,
'
nucleotide
'
,
lambda
link
:
link
==
'
protein_nuccore_mrna
'
,
source_version
=
protein_version
,
match_version
=
match_version
)
'
protein
'
,
'
nucleotide
'
,
lambda
link
:
link
==
'
protein_nuccore_mrna
'
,
protein_accession
,
source_version
=
protein_version
,
match_version
=
match_version
)
This diff is collapsed.
Click to expand it.
mutalyzer/parsers/genbank.py
+
5
−
6
View file @
0f6cafe0
...
...
@@ -221,13 +221,12 @@ class GBparser():
#if
else
:
# Tag an mRNA with the protein id too.
accession
,
version
=
i
.
transcript_id
.
split
(
'
.
'
)
protein
=
ncbi
.
transcript_to_protein
(
accession
,
int
(
version
),
match_version
=
False
)
if
protein
is
None
:
i
.
proteinLink
=
None
else
:
try
:
# We ignore the version.
i
.
proteinLink
=
protein
[
0
]
i
.
proteinLink
=
ncbi
.
transcript_to_protein
(
accession
,
int
(
version
),
match_version
=
False
)[
0
]
except
ncbi
.
NoLinkError
:
pass
i
.
positionList
=
self
.
__locationList2posList
(
i
)
i
.
location
=
self
.
__location2pos
(
i
.
location
)
#FIXME
#if not i.positionList : # FIXME ???
...
...
This diff is collapsed.
Click to expand it.
tests/test_ncbi.py
+
22
−
7
View file @
0f6cafe0
...
...
@@ -230,10 +230,15 @@ def test_transcript_to_protein(accession, version, match_version, expected):
Both the Entrez API and our cache are fixed with a set of
transcript-protein links. This test is parametrized with a list of
arguments for the :func:`ncbi.transcript_to_protein` function and the
corresponding expected result.
corresponding expected result (`None` if `NoLinkError` is expected to be
raised).
"""
assert
ncbi
.
transcript_to_protein
(
accession
,
version
,
match_version
)
==
expected
if
expected
is
None
:
with
pytest
.
raises
(
ncbi
.
NoLinkError
):
ncbi
.
transcript_to_protein
(
accession
,
version
,
match_version
)
else
:
assert
ncbi
.
transcript_to_protein
(
accession
,
version
,
match_version
)
==
expected
@with_entrez
((
None
,
'
NP_11111.1
'
),
...
...
@@ -322,8 +327,12 @@ def test_protein_to_transcript(accession, version, match_version, expected):
Fixtures and parameters of this test mirror those of the
`test_transcript_to_protein` test.
"""
assert
ncbi
.
protein_to_transcript
(
accession
,
version
,
match_version
)
==
expected
if
expected
is
None
:
with
pytest
.
raises
(
ncbi
.
NoLinkError
):
ncbi
.
protein_to_transcript
(
accession
,
version
,
match_version
)
else
:
assert
ncbi
.
protein_to_transcript
(
accession
,
version
,
match_version
)
==
expected
@with_entrez
((
'
NM_11111
'
,
None
),
...
...
@@ -351,7 +360,10 @@ def test_transcript_to_protein_cache(accession, version, match_version,
"""
Get protein for transcript and check the resulting cache state.
"""
ncbi
.
transcript_to_protein
(
accession
,
version
,
match_version
)
try
:
ncbi
.
transcript_to_protein
(
accession
,
version
,
match_version
)
except
ncbi
.
NoLinkError
:
pass
forward
=
[(
key
.
split
(
'
:
'
)[
-
1
],
redis
.
get
(
key
)
or
None
)
for
key
in
redis
.
keys
(
'
ncbi:transcript-to-protein:*
'
)]
...
...
@@ -387,7 +399,10 @@ def test_protein_to_transcript_cache(accession, version, match_version,
"""
Get transcript for protein and check the resulting cache state.
"""
ncbi
.
protein_to_transcript
(
accession
,
version
,
match_version
)
try
:
ncbi
.
protein_to_transcript
(
accession
,
version
,
match_version
)
except
ncbi
.
NoLinkError
:
pass
forward
=
[(
key
.
split
(
'
:
'
)[
-
1
],
redis
.
get
(
key
)
or
None
)
for
key
in
redis
.
keys
(
'
ncbi:transcript-to-protein:*
'
)]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment