# test_ncbi.py

"""
Tests for the mutalyzer.ncbi module.
"""


from __future__ import unicode_literals

import Bio.Entrez
import pytest

from mutalyzer import ncbi
from mutalyzer.redisclient import client as redis

from fixtures import with_links


@pytest.fixture
def entrez(request, monkeypatch):
    """
    Replace the NCBI Entrez API with stubs serving a fixed set of
    transcript-protein links.

    The links are read from the fixture parameter (`request.param`), a
    sequence of `(transcript, protein)` pairs in which either side may be
    `None` to denote a missing link. Without a parameter nothing is patched
    and an empty list is returned.

    In contrast to the :func:`fixtures.links` fixture, the links are not
    written to the cache. Instead, `esearch`, `elink`, `efetch` and `read` on
    `Bio.Entrez` are monkey-patched. Requests for accessions that are not
    part of the links are passed on to the original Entrez functions.

    Returns the links that were installed.
    """
    try:
        links = request.param
    except AttributeError:
        # Not parametrized, so there is nothing to patch.
        return []

    # Lookup tables for both directions.
    protein_by_transcript = dict(links)
    transcript_by_protein = {protein: transcript
                             for transcript, protein in links}

    # The real API functions, used for everything we do not know about.
    real_esearch = Bio.Entrez.esearch
    real_elink = Bio.Entrez.elink
    real_efetch = Bio.Entrez.efetch

    class FakeHandle(object):
        """
        Stand-in for an Entrez result handle. It wraps a ready-made result
        which the patched `Entrez.read` returns unchanged.
        """
        def __init__(self, result):
            self.result = result

        def read(self):
            return self.result

        def close(self):
            pass

    def is_known(db, accession):
        # True if the accession is one of the links for the given database.
        return ((db == 'nucleotide' and accession in protein_by_transcript)
                or (db == 'protein' and accession in transcript_by_protein))

    def linkset_result(source_db, accession, linksetdb):
        # Wrap link sets in the envelope of an elink result.
        return FakeHandle([{
            'LinkSetDb': linksetdb,
            'DbFrom': source_db,
            'IdList': [accession],
            'LinkSetDbHistory': [],
            'ERROR': []
        }])

    def nuccore_links(name, *uids):
        # One link set pointing to the given ids in the nuccore database.
        return {'DbTo': 'nuccore',
                'Link': [{'Id': uid} for uid in uids],
                'LinkName': name}

    def mock_esearch(db=None, term=None):
        if not is_known(db, term):
            return real_esearch(db=db, term=term)
        # A known accession is reported as the single hit, using the
        # accession itself as id.
        return FakeHandle({
            'Count': '1',
            'RetMax': '1',
            'IdList': [term],
            'TranslationSet': [],
            'RetStart': '0',
            'QueryTranslation': ''
        })

    def mock_elink(dbfrom=None, db=None, id=None):
        if dbfrom == 'nucleotide' and id in protein_by_transcript:
            protein = protein_by_transcript[id]
            linksetdb = []
            if protein is not None:
                linksetdb.append({'DbTo': 'protein',
                                  'Link': [{'Id': protein}],
                                  'LinkName': 'nuccore_protein'})
            return linkset_result('nuccore', id, linksetdb)

        if dbfrom == 'protein' and id in transcript_by_protein:
            transcript = transcript_by_protein[id]
            linksetdb = []
            if transcript is not None:
                # The linked transcript is placed in the
                # `protein_nuccore_mrna` link set; the other link sets hold
                # fixed ids.
                linksetdb = [
                    nuccore_links('protein_nuccore',
                                  '568815587', '528476600', '568815270',
                                  '528474155', '452415518', '452405284',
                                  '383209650'),
                    nuccore_links('protein_nuccore_cds', '4506864'),
                    nuccore_links('protein_nuccore_mgc_refseq',
                                  '48735311', '48734961', '47682402',
                                  '18490203', '16359050', '16306997',
                                  '15929518', '15214938', '13528941'),
                    nuccore_links('protein_nuccore_mrna', transcript)]
            return linkset_result('protein', id, linksetdb)

        return real_elink(dbfrom=dbfrom, db=db, id=id)

    def mock_efetch(db=None, id=None, rettype=None, retmode=None):
        if not is_known(db, id):
            return real_efetch(db=db, id=id, rettype=rettype,
                               retmode=retmode)
        # Accessions without a version get a made-up one in the response.
        versioned = id if '.' in id else id + '.9999'
        return FakeHandle(versioned + '\n')

    def mock_read(result):
        return result.read()

    for name, replacement in (('esearch', mock_esearch),
                              ('elink', mock_elink),
                              ('efetch', mock_efetch),
                              ('read', mock_read)):
        monkeypatch.setattr(Bio.Entrez, name, replacement)
    return links


def with_entrez(*links):
    """
    Build a decorator that runs a test with the given transcript-protein
    links served by the (monkey-patched) Entrez API.

    Every positional argument is one `(transcript, protein)` pair. The
    decorated test uses the `entrez` fixture, parametrized indirectly with
    all pairs as a single parameter. The test id is made up of the pairs
    written as `transcript/protein`, joined by commas, with `*` standing in
    for an empty side.

    Comparable to :func:`fixtures.with_links`.
    """
    def test_with_entrez(test):
        test_id = ','.join('/'.join(accession or '*' for accession in link)
                           for link in links)
        parametrized = pytest.mark.parametrize(
            'entrez', [links], indirect=True, ids=[test_id])(test)
        return pytest.mark.usefixtures('entrez')(parametrized)
    return test_with_entrez


@with_entrez(('NM_11111.1', None),
             ('NM_11111.2', 'NP_11111.2'),
             ('NM_22222.2', None),
             ('NM_22222.3', 'NP_22222.3'),
             ('NM_33333.4', None),
             ('NM_33333.5', 'NP_33333.5'),
             ('NM_44444', None),
             ('NM_44444.5', None),
             ('NM_44444.6', None),
             ('NM_55555', 'NP_55555'),
             ('NM_55555.6', None),
             ('NM_66666', 'NP_66666'),
             ('NM_66666.6', 'NP_66666.6'),
             ('NM_66666.7', 'NP_66666.7'),
             ('NM_66666.8', None),
             ('NM_77777', 'NP_77777'),
             ('NM_77777.7', 'NP_77777.7'),
             ('NM_77777.8', None),
             ('NM_88888', None),
             ('NM_88888.8', None),
             ('NM_88888.9', 'NP_88888.9'))
@with_links(('NM_11111', 'NP_11111'),
            ('NM_22222', None),
            ('NM_33333.3', 'NP_33333.3'),
            ('NM_44444.4', None),
            ('NM_55555.5', None),
            ('NM_66666.6', None))
@pytest.mark.parametrize('accession,version,match_version,expected', [
    ('NM_11111', None, False, ('NP_11111', None)),
    ('NM_11111', 1, False, ('NP_11111', None)),
    ('NM_11111', 1, True, None),
    ('NM_11111', 2, False, ('NP_11111', None)),
    ('NM_11111', 2, True, ('NP_11111', 2)),
    ('NM_22222', None, False, None),
    ('NM_22222', 2, False, None),
    ('NM_22222', 2, True, None),
    ('NM_22222', 3, False, None),
    ('NM_22222', 3, True, ('NP_22222', 3)),
    ('NM_33333', None, False, ('NP_33333', None)),
    ('NM_33333', 3, True, ('NP_33333', 3)),
    ('NM_33333', 3, False, ('NP_33333', 3)),
    ('NM_33333', 4, True, None),
    ('NM_33333', 4, False, ('NP_33333', None)),
    ('NM_33333', 5, True, ('NP_33333', 5)),
    ('NM_33333', 5, False, ('NP_33333', None)),
    ('NM_44444', None, False, None),
    ('NM_44444', 4, True, None),
    ('NM_44444', 4, False, None),
    ('NM_44444', 5, True, None),
    ('NM_44444', 5, False, None),
    ('NM_44444', 6, True, None),
    ('NM_44444', 6, False, None),
    ('NM_55555', None, False, ('NP_55555', None)),
    ('NM_55555', 5, True, None),
    ('NM_55555', 5, False, None),
    ('NM_55555', 6, True, None),
    ('NM_55555', 6, False, ('NP_55555', None)),
    ('NM_66666', None, False, ('NP_66666', None)),
    ('NM_66666', 6, True, None),
    ('NM_66666', 6, False, None),
    ('NM_66666', 7, True, ('NP_66666', 7)),
    ('NM_66666', 7, False, ('NP_66666', 7)),
    ('NM_66666', 8, True, None),
    ('NM_66666', 8, False, ('NP_66666', None)),
    ('NM_77777', None, False, ('NP_77777', None)),
    ('NM_77777', 7, False, ('NP_77777', 7)),
    ('NM_77777', 7, True, ('NP_77777', 7)),
    ('NM_77777', 8, False, ('NP_77777', None)),
    ('NM_77777', 8, True, None),
    ('NM_88888', None, False, None),
    ('NM_88888', 8, False, None),
    ('NM_88888', 8, True, None),
    ('NM_88888', 9, False, ('NP_88888', 9)),
    ('NM_88888', 9, True, ('NP_88888', 9))])
def test_transcript_to_protein(accession, version, match_version, expected):
    """
    Look up the protein linked to a transcript.

    The test runs against a fixed set of transcript-protein links, supplied
    through the `with_entrez` (Entrez API) and `with_links` decorators. Each
    parameter set holds the arguments for
    :func:`ncbi.transcript_to_protein` and the result it should give, where
    an expected value of `None` means that `NoLinkError` must be raised.
    """
    if expected is not None:
        result = ncbi.transcript_to_protein(accession, version, match_version)
        assert result == expected
        return

    with pytest.raises(ncbi.NoLinkError):
        ncbi.transcript_to_protein(accession, version, match_version)


@with_entrez((None, 'NP_11111.1'),
             ('NM_11111.2', 'NP_11111.2'),
             (None, 'NP_22222.2'),
             ('NM_22222.3', 'NP_22222.3'),
             (None, 'NP_33333.4'),
             ('NM_33333.5', 'NP_33333.5'),
             (None, 'NP_44444'),
             (None, 'NP_44444.5'),
             (None, 'NP_44444.6'),
             ('NM_55555', 'NP_55555'),
             (None, 'NP_55555.6'),
             ('NM_66666', 'NP_66666'),
             ('NM_66666.6', 'NP_66666.6'),
             ('NM_66666.7', 'NP_66666.7'),
             (None, 'NP_66666.8'),
             ('NM_77777', 'NP_77777'),
             ('NM_77777.7', 'NP_77777.7'),
             (None, 'NP_77777.8'),
             (None, 'NP_88888'),
             (None, 'NP_88888.8'),
             ('NM_88888.9', 'NP_88888.9'))
@with_links(('NM_11111', 'NP_11111'),
            (None, 'NP_22222'),
            ('NM_33333.3', 'NP_33333.3'),
            (None, 'NP_44444.4'),
            (None, 'NP_55555.5'),
            (None, 'NP_66666.6'))
@pytest.mark.parametrize('accession,version,match_version,expected', [
    ('NP_11111', None, False, ('NM_11111', None)),
    ('NP_11111', 1, False, ('NM_11111', None)),
    ('NP_11111', 1, True, None),
    ('NP_11111', 2, False, ('NM_11111', None)),
    ('NP_11111', 2, True, ('NM_11111', 2)),
    ('NP_22222', None, False, None),
    ('NP_22222', 2, False, None),
    ('NP_22222', 2, True, None),
    ('NP_22222', 3, False, None),
    ('NP_22222', 3, True, ('NM_22222', 3)),
    ('NP_33333', None, False, ('NM_33333', None)),
    ('NP_33333', 3, True, ('NM_33333', 3)),
    ('NP_33333', 3, False, ('NM_33333', 3)),
    ('NP_33333', 4, True, None),
    ('NP_33333', 4, False, ('NM_33333', None)),
    ('NP_33333', 5, True, ('NM_33333', 5)),
    ('NP_33333', 5, False, ('NM_33333', None)),
    ('NP_44444', None, False, None),
    ('NP_44444', 4, True, None),
    ('NP_44444', 4, False, None),
    ('NP_44444', 5, True, None),
    ('NP_44444', 5, False, None),
    ('NP_44444', 6, True, None),
    ('NP_44444', 6, False, None),
    ('NP_55555', None, False, ('NM_55555', None)),
    ('NP_55555', 5, True, None),
    ('NP_55555', 5, False, None),
    ('NP_55555', 6, True, None),
    ('NP_55555', 6, False, ('NM_55555', None)),
    ('NP_66666', None, False, ('NM_66666', None)),
    ('NP_66666', 6, True, None),
    ('NP_66666', 6, False, None),
    ('NP_66666', 7, True, ('NM_66666', 7)),
    ('NP_66666', 7, False, ('NM_66666', 7)),
    ('NP_66666', 8, True, None),
    ('NP_66666', 8, False, ('NM_66666', None)),
    ('NP_77777', None, False, ('NM_77777', None)),
    ('NP_77777', 7, False, ('NM_77777', 7)),
    ('NP_77777', 7, True, ('NM_77777', 7)),
    ('NP_77777', 8, False, ('NM_77777', None)),
    ('NP_77777', 8, True, None),
    ('NP_88888', None, False, None),
    ('NP_88888', 8, False, None),
    ('NP_88888', 8, True, None),
    ('NP_88888', 9, False, ('NM_88888', 9)),
    ('NP_88888', 9, True, ('NM_88888', 9))])
def test_protein_to_transcript(accession, version, match_version, expected):
    """
    Look up the transcript linked to a protein.

    The test runs against a fixed set of transcript-protein links, supplied
    through the `with_entrez` (Entrez API) and `with_links` decorators. Each
    parameter set holds the arguments for
    :func:`ncbi.protein_to_transcript` and the result it should give, where
    an expected value of `None` means that `NoLinkError` must be raised.

    Links and parameters are the mirror image of those used for the
    `test_transcript_to_protein` test.
    """
    if expected is not None:
        result = ncbi.protein_to_transcript(accession, version, match_version)
        assert result == expected
        return

    with pytest.raises(ncbi.NoLinkError):
        ncbi.protein_to_transcript(accession, version, match_version)


@with_entrez(('NM_11111', None),
             ('NM_22222', 'NP_22222'),
             ('NM_33333', None),
             ('NM_33333.3', None),
             ('NM_44444', None),
             ('NM_44444.4', 'NP_44444.4'))
@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [
    ('NM_11111', None, False, [('NM_11111', None)], []),
    ('NM_22222', None, False,
     [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]),
    ('NM_33333', None, False, [('NM_33333', None)], []),
    ('NM_33333', 3, False, [('NM_33333', None), ('NM_33333.3', None)], []),
    ('NM_33333', 3, True, [('NM_33333.3', None)], []),
    ('NM_44444', None, False, [('NM_44444', None)], []),
    ('NM_44444', 4, False,
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]),
    ('NM_44444', 4, True,
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])])
def test_transcript_to_protein_cache(accession, version, match_version,
                                     expected_forward, expected_reverse):
    """
    Look up the protein for a transcript and verify what ends up in the
    cache.

    After the lookup, the `(transcript, protein)` pairs found under the
    `ncbi:transcript-to-protein:*` keys must equal `expected_forward` and
    those found under the `ncbi:protein-to-transcript:*` keys must equal
    `expected_reverse`, both regardless of order. An empty cached value
    counts as `None`.
    """
    try:
        ncbi.transcript_to_protein(accession, version, match_version)
    except ncbi.NoLinkError:
        # A missing link is fine, only the cache state is checked here.
        pass

    forward = []
    for key in redis.keys('ncbi:transcript-to-protein:*'):
        transcript = key.split(':')[-1]
        forward.append((transcript, redis.get(key) or None))
    assert sorted(forward) == sorted(expected_forward)

    reverse = []
    for key in redis.keys('ncbi:protein-to-transcript:*'):
        transcript = redis.get(key) or None
        reverse.append((transcript, key.split(':')[-1]))
    assert sorted(reverse) == sorted(expected_reverse)


@with_entrez((None, 'NP_11111'),
             ('NM_22222', 'NP_22222'),
             (None, 'NP_33333'),
             (None, 'NP_33333.3'),
             (None, 'NP_44444'),
             ('NM_44444.4', 'NP_44444.4'))
@pytest.mark.parametrize('accession,version,match_version,expected_forward,expected_reverse', [
    ('NP_11111', None, False, [], [(None, 'NP_11111')]),
    ('NP_22222', None, False,
     [('NM_22222', 'NP_22222')], [('NM_22222', 'NP_22222')]),
    ('NP_33333', None, False, [], [(None, 'NP_33333')]),
    ('NP_33333', 3, False, [], [(None, 'NP_33333'), (None, 'NP_33333.3')]),
    ('NP_33333', 3, True, [], [(None, 'NP_33333.3')]),
    ('NP_44444', None, False, [], [(None, 'NP_44444')]),
    ('NP_44444', 4, False,
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')]),
    ('NP_44444', 4, True,
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')],
     [('NM_44444', 'NP_44444'), ('NM_44444.4', 'NP_44444.4')])])
def test_protein_to_transcript_cache(accession, version, match_version,
                                     expected_forward, expected_reverse):
    """
    Look up the transcript for a protein and verify what ends up in the
    cache.

    After the lookup, the `(transcript, protein)` pairs found under the
    `ncbi:transcript-to-protein:*` keys must equal `expected_forward` and
    those found under the `ncbi:protein-to-transcript:*` keys must equal
    `expected_reverse`, both regardless of order. An empty cached value
    counts as `None`.
    """
    try:
        ncbi.protein_to_transcript(accession, version, match_version)
    except ncbi.NoLinkError:
        # A missing link is fine, only the cache state is checked here.
        pass

    forward = []
    for key in redis.keys('ncbi:transcript-to-protein:*'):
        transcript = key.split(':')[-1]
        forward.append((transcript, redis.get(key) or None))
    assert sorted(forward) == sorted(expected_forward)

    reverse = []
    for key in redis.keys('ncbi:protein-to-transcript:*'):
        transcript = redis.get(key) or None
        reverse.append((transcript, key.split(':')[-1]))
    assert sorted(reverse) == sorted(expected_reverse)