Skip to content
Snippets Groups Projects
Commit 9d06cc02 authored by Jeroen F.J. Laros's avatar Jeroen F.J. Laros Committed by Vermaat
Browse files

Add protein_to_transcript in separate ncbi module

Existing `transcript_to_protein` is factored out into a separate
`mutalyzer.ncbi` module.
parent 1bb4eb9b
No related branches found
No related tags found
No related merge requests found
from Bio import Entrez
from .config import settings
from .db import queries
def transcript_to_protein(transcript_accession):
"""
Try to find the protein linked to a transcript.
First look in our database. If a link cannot be found, try to retrieve it
from the NCBI. Add the result to our database.
:arg str transcript_accession: Accession number of the transcript for
which we want to find the protein (without version number).
:returns: Accession number of a protein (without version number) or `None`
if no link can be found.
:rtype: str
"""
Entrez.email = settings.EMAIL
link = queries.get_transcript_protein_link(transcript_accession)
if link is not None:
return link.protein_accession
handle = Entrez.esearch(db='nucleotide', term=transcript_accession)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
transcript_gi = unicode(result['IdList'][0])
handle = Entrez.elink(dbfrom='nucleotide', db='protein', id=transcript_gi)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
if not result[0]['LinkSetDb']:
# We also cache negative results.
queries.update_transcript_protein_link(
transcript_accession=transcript_accession)
return None
protein_gi = unicode(result[0]['LinkSetDb'][0]['Link'][0]['Id'])
handle = Entrez.efetch(
db='protein', id=protein_gi, rettype='acc', retmode='text')
protein_accession = unicode(handle.read()).split('.')[0]
handle.close()
queries.update_transcript_protein_link(
transcript_accession=transcript_accession,
protein_accession=protein_accession)
return protein_accession
def protein_to_transcript(protein_accession):
"""
Try to find the transcript linked to a protein.
First look in our database. If a link cannot be found, try to retrieve it
from the NCBI. Add the result to our database.
:arg str protein_accession: Accession number of the protein for which we
want to find the transcript (without version number).
:returns: Accession number of a transcript (without version number) or
`None` if no link can be found.
:rtype: str
"""
Entrez.email = settings.EMAIL
link = queries.get_transcript_protein_link(protein_accession, reverse=True)
if link is not None:
return link.transcript_accession
handle = Entrez.esearch(db='protein', term=protein_accession)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
if not result['IdList']:
return None
protein_gi = unicode(result['IdList'][0])
handle = Entrez.elink(dbfrom='protein', db='nucleotide', id=protein_gi)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
if not result[0]['LinkSetDb']:
# We also cache negative results.
queries.update_transcript_protein_link(
protein_accession=protein_accession)
return None
transcript_gi = ''
for link in result[0]['LinkSetDb']:
if unicode(link['LinkName']) == 'protein_nuccore_mrna':
transcript_gi = unicode(link['Link'][0]['Id'])
break
handle = Entrez.efetch(
db='nucleotide', id=transcript_gi, rettype='acc', retmode='text')
transcript_accession = unicode(handle.read()).split('.')[0]
handle.close()
queries.update_transcript_protein_link(
transcript_accession=transcript_accession,
protein_accession=protein_accession)
return transcript_accession
......@@ -11,12 +11,11 @@ import re
import bz2
from itertools import izip_longest
from Bio import SeqIO, Entrez
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
from mutalyzer.config import settings
from mutalyzer.db import queries
from mutalyzer.GenRecord import PList, Locus, Gene, Record
from .. import ncbi
from ..GenRecord import PList, Locus, Gene, Record
# Regular expression used to find version number in locus tag
......@@ -58,13 +57,6 @@ class GBparser():
"""
@todo: documentation
"""
def __init__(self):
"""
Initialise the class
"""
Entrez.email = settings.EMAIL
#__init__
def __location2pos(self, location):
"""
Convert a location object to a tuple of integers.
......@@ -122,60 +114,6 @@ class GBparser():
return ret
#__locationList2posList
def __transcriptToProtein(self, transcriptAcc):
"""
Try to find the protein linked to a transcript id.
First look in our database, if a link can not be found, try to
retrieve it via the NCBI. Store the result in our database.
@arg transcriptAcc: Accession number of the transcript for which we
want to find the protein
@type transcriptAcc: unicode
@return: Accession number of a protein or None if nothing can be found
@rtype: unicode
"""
link = queries.get_transcript_protein_link(transcriptAcc)
if link is not None:
return link.protein_accession
handle = Entrez.esearch(db = "nucleotide", term = transcriptAcc)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
transcriptGI = unicode(result["IdList"][0])
handle = Entrez.elink(dbfrom = "nucleotide", db = "protein",
id = transcriptGI)
try:
result = Entrez.read(handle)
except Entrez.Parser.ValidationError:
# Todo: Log this error.
return None
finally:
handle.close()
if not result[0]["LinkSetDb"] :
queries.update_transcript_protein_link(transcriptAcc)
return None
proteinGI = unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"])
handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text')
proteinAcc = unicode(handle.read()).split('.')[0]
handle.close()
queries.update_transcript_protein_link(transcriptAcc, proteinAcc)
return proteinAcc
#__transcriptToProtein
def _find_mismatch(self, sentences):
"""
Find the indices of the first and last words that distinguishes one
......@@ -283,7 +221,7 @@ class GBparser():
#if
else : # Tag an mRNA with the protein id too.
i.proteinLink = \
self.__transcriptToProtein(i.transcript_id.split('.')[0])
ncbi.transcript_to_protein(i.transcript_id.split('.')[0])
i.positionList = self.__locationList2posList(i)
i.location = self.__location2pos(i.location) #FIXME
#if not i.positionList : # FIXME ???
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment