From 9d06cc02b520092c4dc9c77d8c1c457cb35871c6 Mon Sep 17 00:00:00 2001 From: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl> Date: Sun, 27 Sep 2015 21:14:04 +0200 Subject: [PATCH] Add protein_to_transcript in separate ncbi module Existing `transcript_to_protein` is factored out into a separate `mutalyzer.ncbi` module. --- mutalyzer/ncbi.py | 128 +++++++++++++++++++++++++++++++++++ mutalyzer/parsers/genbank.py | 70 ++----------------- 2 files changed, 132 insertions(+), 66 deletions(-) create mode 100644 mutalyzer/ncbi.py diff --git a/mutalyzer/ncbi.py b/mutalyzer/ncbi.py new file mode 100644 index 00000000..dc02da6d --- /dev/null +++ b/mutalyzer/ncbi.py @@ -0,0 +1,128 @@ +from Bio import Entrez + +from .config import settings +from .db import queries + + +def transcript_to_protein(transcript_accession): + """ + Try to find the protein linked to a transcript. + + First look in our database. If a link cannot be found, try to retrieve it + from the NCBI. Add the result to our database. + + :arg str transcript_accession: Accession number of the transcript for + which we want to find the protein (without version number). + + :returns: Accession number of a protein (without version number) or `None` + if no link can be found. + :rtype: str + """ + Entrez.email = settings.EMAIL + + link = queries.get_transcript_protein_link(transcript_accession) + if link is not None: + return link.protein_accession + + handle = Entrez.esearch(db='nucleotide', term=transcript_accession) + try: + result = Entrez.read(handle) + except Entrez.Parser.ValidationError: + # Todo: Log this error. + return None + finally: + handle.close() + + transcript_gi = unicode(result['IdList'][0]) + + handle = Entrez.elink(dbfrom='nucleotide', db='protein', id=transcript_gi) + try: + result = Entrez.read(handle) + except Entrez.Parser.ValidationError: + # Todo: Log this error. + return None + finally: + handle.close() + + if not result[0]['LinkSetDb']: + # We also cache negative results. + queries.update_transcript_protein_link( + transcript_accession=transcript_accession) + return None + + protein_gi = unicode(result[0]['LinkSetDb'][0]['Link'][0]['Id']) + + handle = Entrez.efetch( + db='protein', id=protein_gi, rettype='acc', retmode='text') + protein_accession = unicode(handle.read()).split('.')[0] + handle.close() + + queries.update_transcript_protein_link( + transcript_accession=transcript_accession, + protein_accession=protein_accession) + return protein_accession + + +def protein_to_transcript(protein_accession): + """ + Try to find the transcript linked to a protein. + + First look in our database. If a link cannot be found, try to retrieve it + from the NCBI. Add the result to our database. + + :arg str protein_accession: Accession number of the protein for which we + want to find the transcript (without version number). + + :returns: Accession number of a transcript (without version number) or + `None` if no link can be found. + :rtype: str + """ + Entrez.email = settings.EMAIL + + link = queries.get_transcript_protein_link(protein_accession, reverse=True) + if link is not None: + return link.transcript_accession + + handle = Entrez.esearch(db='protein', term=protein_accession) + try: + result = Entrez.read(handle) + except Entrez.Parser.ValidationError: + # Todo: Log this error. + return None + finally: + handle.close() + + if not result['IdList']: + return None + protein_gi = unicode(result['IdList'][0]) + + handle = Entrez.elink(dbfrom='protein', db='nucleotide', id=protein_gi) + try: + result = Entrez.read(handle) + except Entrez.Parser.ValidationError: + # Todo: Log this error. + return None + finally: + handle.close() + + if not result[0]['LinkSetDb']: + # We also cache negative results. + queries.update_transcript_protein_link( + protein_accession=protein_accession) + return None + + transcript_gi = '' + for link in result[0]['LinkSetDb']: + if unicode(link['LinkName']) == 'protein_nuccore_mrna': + transcript_gi = unicode(link['Link'][0]['Id']) + break + + handle = Entrez.efetch( + db='nucleotide', id=transcript_gi, rettype='acc', retmode='text') + transcript_accession = unicode(handle.read()).split('.')[0] + handle.close() + + queries.update_transcript_protein_link( + transcript_accession=transcript_accession, + protein_accession=protein_accession) + return transcript_accession diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 907ee45f..5312a1b5 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -11,12 +11,11 @@ import re import bz2 from itertools import izip_longest -from Bio import SeqIO, Entrez +from Bio import SeqIO from Bio.Alphabet import ProteinAlphabet -from mutalyzer.config import settings -from mutalyzer.db import queries -from mutalyzer.GenRecord import PList, Locus, Gene, Record +from .. import ncbi +from ..GenRecord import PList, Locus, Gene, Record # Regular expression used to find version number in locus tag @@ -58,13 +57,6 @@ class GBparser(): """ @todo: documentation """ - def __init__(self): - """ - Initialise the class - """ - Entrez.email = settings.EMAIL - #__init__ - def __location2pos(self, location): """ Convert a location object to a tuple of integers. @@ -122,60 +114,6 @@ class GBparser(): return ret #__locationList2posList - def __transcriptToProtein(self, transcriptAcc): - """ - Try to find the protein linked to a transcript id. - - First look in our database, if a link can not be found, try to - retrieve it via the NCBI. Store the result in our database. - - @arg transcriptAcc: Accession number of the transcript for which we - want to find the protein - @type transcriptAcc: unicode - - @return: Accession number of a protein or None if nothing can be found - @rtype: unicode - """ - link = queries.get_transcript_protein_link(transcriptAcc) - if link is not None: - return link.protein_accession - - handle = Entrez.esearch(db = "nucleotide", term = transcriptAcc) - try: - result = Entrez.read(handle) - except Entrez.Parser.ValidationError: - # Todo: Log this error. - return None - finally: - handle.close() - - transcriptGI = unicode(result["IdList"][0]) - - handle = Entrez.elink(dbfrom = "nucleotide", db = "protein", - id = transcriptGI) - try: - result = Entrez.read(handle) - except Entrez.Parser.ValidationError: - # Todo: Log this error. - return None - finally: - handle.close() - - if not result[0]["LinkSetDb"] : - queries.update_transcript_protein_link(transcriptAcc) - return None - - proteinGI = unicode(result[0]["LinkSetDb"][0]["Link"][0]["Id"]) - - handle = Entrez.efetch(db='protein', id=proteinGI, rettype='acc', retmode='text') - - proteinAcc = unicode(handle.read()).split('.')[0] - handle.close() - - queries.update_transcript_protein_link(transcriptAcc, proteinAcc) - return proteinAcc - #__transcriptToProtein - def _find_mismatch(self, sentences): """ Find the indices of the first and last words that distinguishes one @@ -283,7 +221,7 @@ class GBparser(): #if else : # Tag an mRNA with the protein id too. i.proteinLink = \ - self.__transcriptToProtein(i.transcript_id.split('.')[0]) + ncbi.transcript_to_protein(i.transcript_id.split('.')[0]) i.positionList = self.__locationList2posList(i) i.location = self.__location2pos(i.location) #FIXME #if not i.positionList : # FIXME ??? -- GitLab