From 9d06cc02b520092c4dc9c77d8c1c457cb35871c6 Mon Sep 17 00:00:00 2001
From: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl>
Date: Sun, 27 Sep 2015 21:14:04 +0200
Subject: [PATCH] Add protein_to_transcript in separate ncbi module

Existing `transcript_to_protein` is factored out into a separate
`mutalyzer.ncbi` module.
---
 mutalyzer/ncbi.py            | 128 +++++++++++++++++++++++++++++++++++
 mutalyzer/parsers/genbank.py |  70 ++-----------------
 2 files changed, 132 insertions(+), 66 deletions(-)
 create mode 100644 mutalyzer/ncbi.py

diff --git a/mutalyzer/ncbi.py b/mutalyzer/ncbi.py
new file mode 100644
index 00000000..dc02da6d
--- /dev/null
+++ b/mutalyzer/ncbi.py
@@ -0,0 +1,128 @@
+from Bio import Entrez
+
+from .config import settings
+from .db import queries
+
+
+def transcript_to_protein(transcript_accession):
+    """
+    Try to find the protein linked to a transcript.
+
+    First look in our database. If a link cannot be found, try to retrieve it
+    from the NCBI. Add the result to our database.
+
+    :arg str transcript_accession: Accession number of the transcript for
+      which we want to find the protein (without version number).
+
+    :returns: Accession number of a protein (without version number) or `None`
+      if no link can be found.
+    :rtype: str
+    """
+    Entrez.email = settings.EMAIL
+
+    link = queries.get_transcript_protein_link(transcript_accession)
+    if link is not None:
+        return link.protein_accession
+
+    handle = Entrez.esearch(db='nucleotide', term=transcript_accession)
+    try:
+        result = Entrez.read(handle)
+    except Entrez.Parser.ValidationError:
+        # Todo: Log this error.
+        return None
+    finally:
+        handle.close()
+
+    transcript_gi = unicode(result['IdList'][0])
+
+    handle = Entrez.elink(dbfrom='nucleotide', db='protein', id=transcript_gi)
+    try:
+        result = Entrez.read(handle)
+    except Entrez.Parser.ValidationError:
+        # Todo: Log this error.
+        return None
+    finally:
+        handle.close()
+
+    if not result[0]['LinkSetDb']:
+        # We also cache negative results.
+        queries.update_transcript_protein_link(
+            transcript_accession=transcript_accession)
+        return None
+
+    protein_gi = unicode(result[0]['LinkSetDb'][0]['Link'][0]['Id'])
+
+    handle = Entrez.efetch(
+        db='protein', id=protein_gi, rettype='acc', retmode='text')
+    protein_accession = unicode(handle.read()).split('.')[0]
+    handle.close()
+
+    queries.update_transcript_protein_link(
+        transcript_accession=transcript_accession,
+        protein_accession=protein_accession)
+    return protein_accession
+
+
+def protein_to_transcript(protein_accession):
+    """
+    Try to find the transcript linked to a protein.
+
+    First look in our database. If a link cannot be found, try to retrieve it
+    from the NCBI. Add the result to our database.
+
+    :arg str protein_accession: Accession number of the protein for which we
+      want to find the transcript (without version number).
+
+    :returns: Accession number of a transcript (without version number) or
+      `None` if no link can be found.
+    :rtype: str
+    """
+    Entrez.email = settings.EMAIL
+
+    link = queries.get_transcript_protein_link(protein_accession, reverse=True)
+    if link is not None:
+        return link.transcript_accession
+
+    handle = Entrez.esearch(db='protein', term=protein_accession)
+    try:
+        result = Entrez.read(handle)
+    except Entrez.Parser.ValidationError:
+        # Todo: Log this error.
+        return None
+    finally:
+        handle.close()
+
+    if not result['IdList']:
+        return None
+    protein_gi = unicode(result['IdList'][0])
+
+    handle = Entrez.elink(dbfrom='protein', db='nucleotide', id=protein_gi)
+    try:
+        result = Entrez.read(handle)
+    except Entrez.Parser.ValidationError:
+        # Todo: Log this error.
+        return None
+    finally:
+        handle.close()
+
+    if not result[0]['LinkSetDb']:
+        # We also cache negative results.
+        queries.update_transcript_protein_link(
+            protein_accession=protein_accession)
+        return None
+
+    transcript_gi = ''
+    for link in result[0]['LinkSetDb']:
+        if unicode(link['LinkName']) == 'protein_nuccore_mrna':
+            transcript_gi = unicode(link['Link'][0]['Id'])
+            break
+
+    handle = Entrez.efetch(
+        db='nucleotide', id=transcript_gi, rettype='acc', retmode='text')
+    transcript_accession = unicode(handle.read()).split('.')[0]
+    handle.close()
+
+    queries.update_transcript_protein_link(
+        transcript_accession=transcript_accession,
+        protein_accession=protein_accession)
+    return transcript_accession
diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py
index 907ee45f..5312a1b5 100644
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -11,12 +11,11 @@ import re
 import bz2
 from itertools import izip_longest
 
-from Bio import SeqIO, Entrez
+from Bio import SeqIO
 from Bio.Alphabet import ProteinAlphabet
 
-from mutalyzer.config import settings
-from mutalyzer.db import queries
-from mutalyzer.GenRecord import PList, Locus, Gene, Record
+from .. import ncbi
+from ..GenRecord import PList, Locus, Gene, Record
 
 
 # Regular expression used to find version number in locus tag
@@ -58,13 +57,6 @@ class GBparser():
     """
     @todo: documentation
     """
-    def __init__(self):
-        """
-        Initialise the class
-        """
-        Entrez.email = settings.EMAIL
-    #__init__
-
     def __location2pos(self, location):
         """
         Convert a location object to a tuple of integers.
@@ -122,60 +114,6 @@ class GBparser():
         return ret
     #__locationList2posList
 
-    def __transcriptToProtein(self, transcriptAcc):
-        """
-        Try to find the protein linked to a transcript id.
-
-        First look in our database, if a link can not be found, try to
-        retrieve it via the NCBI. Store the result in our database.
-
-        @arg transcriptAcc: Accession number of the transcript for which we
-                            want to find the protein
-        @type transcriptAcc: unicode
-
-        @return: Accession number of a protein or None if nothing can be found
-        @rtype: unicode
-        """
-        link = queries.get_transcript_protein_link(transcriptAcc)
-        if link is not None:
-            return link.protein_accession
-
-        handle = Entrez.esearch(db = "nucleotide", term = transcriptAcc)
-        try:
-            result = Entrez.read(handle)
-        except Entrez.Parser.ValidationError:
-            # Todo: Log this error.
-            return None
-        finally:
-            handle.close()
-
-        transcriptGI = unicode(result["IdList"][0])
-
-        handle = Entrez.elink(dbfrom = "nucleotide", db = "protein",
-                              id = transcriptGI)
-        try:
-            result = Entrez.read(handle)
-        except Entrez.Parser.ValidationError:
-            # Todo: Log this error.
-            return None
-        finally:
-            handle.close()
-
-        if not result[0]["LinkSetDb"] :
-            queries.update_transcript_protein_link(transcriptAcc)
-            return None
-
-        proteinGI = unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"])
-
-        handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text')
-
-        proteinAcc = unicode(handle.read()).split('.')[0]
-        handle.close()
-
-        queries.update_transcript_protein_link(transcriptAcc, proteinAcc)
-        return proteinAcc
-    #__transcriptToProtein
-
     def _find_mismatch(self, sentences):
         """
         Find the indices of the first and last words that distinguishes one
@@ -283,7 +221,7 @@ class GBparser():
             #if
             else :                # Tag an mRNA with the protein id too.
                 i.proteinLink = \
-                    self.__transcriptToProtein(i.transcript_id.split('.')[0])
+                    ncbi.transcript_to_protein(i.transcript_id.split('.')[0])
             i.positionList = self.__locationList2posList(i)
             i.location = self.__location2pos(i.location) #FIXME
             #if not i.positionList : # FIXME ???
-- 
GitLab