diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index 23cf48fb847634ce373c44b24aa6edd0430b3dd7..04b01dd8d75f2ddfeaaa519e850640f6c8599880 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -795,460 +795,6 @@ class Mapping(Db) : #Mapping -class Cache(Db) : - """ - Database functions for cache administration. - - Special methods: - - __init__() ; Initialise the class. - - Public methods: - - insertGB(accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, - orientation, url) ; Insert info about a GenBank record. - - updateHash(accNo, fileHash) ; Update the hash of an accession number. - - getGBFromLoc(ChrAccVer, ChrStart, ChrStop, orientation) ; Get the - accession number from slicing information. - - getGBFromHash(fileHash) ; Get the accession number from its hash. - - getGBFromGI(GI) ; Get the accession number from its GI - number. - - getLoc(accNo) ; Get the slicing information of an - accession number. - - getHash(accNo) ; Get the hash of a GenBank record. - - getUrl(accNo) ; Get the URL of an accession number. - - Inherited methods from Db: - - query(statement) ; General query function. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - """ - - def __init__(self) : - """ - Initialise the Db parent class. Use the internalDb. - """ - Db.__init__(self, settings.MYSQL_DATABASE, - settings.MYSQL_USER, settings.MYSQL_HOST) - #__init__ - - def insertGB(self, accNo, GI, fileHash, ChrAccVer, ChrStart, - ChrStop, orientation, url) : - """ - Insert information about a GenBank record in the internal database. - - The accNo and fileHash arguments are mandatory. - - If the record is a normal RefSeq, then the GI number should be - provided. - - If the record is a chromosome slice, then the ChrAccVer, ChrStart, - ChrStop and orientation variables should be specified. - - If the record is downloaded from the internet, the url should be - provided. 
- - If all fields except the mandatory ones are empty, the record is - assumed to be uploaded. - - SQL tables from internalDb (altered): - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg accNo: The name associated with this record - @type accNo: string - @arg GI: The GI number (if available) - @type GI: string - @arg fileHash: The hash of the content of the record - @type fileHash: string - @arg ChrAccVer: The accession number of the chromosome (if available) - @type ChrAccVer: string - @arg ChrStart: The start of the record in chromosomal - coordinates (if available) - @type ChrStart: integer - @arg ChrStop: The end of the record in chromosomal coordinates - (if available) - @type ChrStop: integer - @arg orientation: The orientation of the record relative to the - chromosome (if available) (1 = forward, - 2 = reverse complement) - @type orientation: integer - @arg url: The originating URL (if available) - @type url: string - """ - - statement = """ - INSERT INTO GBInfo - (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s); - """, (accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, orientation, - url) - - self.query(statement) - #insertGB - - def insertLRG(self, accNo, fileHash, url): - """ - Insert information about a LRG record in the internal database. - - See insertGB() for more information. - - @arg accNo: The name associated with this record - @type accNo: string - @arg fileHash: The hash of the content of the record - @type fileHash: string - @arg url: The originating URL (if available) - @type url: string - """ - - statement = """ - INSERT INTO GBInfo - (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s); - """, (accNo, None, fileHash, None, None, None, None, url) - - self.query(statement) - #insertLRG - - - def updateHash(self, accNo, fileHash) : - """ - Update the hash of an accession number. 
- - SQL tables from internalDb (altered): - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg accNo: The accession number of a GenBank record - @type accNo: string - @arg fileHash: The hash of a GenBank record - @type fileHash: string - """ - - statement = """ - UPDATE GBInfo - SET hash = %s - WHERE AccNo = %s; - """, (fileHash, accNo) - - self.query(statement) - #updateHash - - def getGBFromLoc(self, ChrAccVer, ChrStart, ChrStop, orientation) : - """ - Get the accession number from a chromosomic location, used - to make a slice, typically this only affects UD-numbers. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg ChrAccVer: The accession number of the chromosome - @type ChrAccVer: string - @arg ChrStart: Start position of the slice - @type ChrStart: integer - @arg ChrStop: End position of the slice - @type ChrStop: integer - @arg orientation: Orientation of the slice: - 1. Forward - 2. Reverse complement - @type orientation: integer - - @return: The accession number - @rtype: string - """ - - statement = """ - SELECT AccNo - FROM GBInfo - WHERE ChrAccVer = %s - AND ChrStart = %s - AND ChrStop = %s - AND orientation = %s; - """, (ChrAccVer, ChrStart, ChrStop, orientation) - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getGBFromLoc - - def getGBFromHash(self, fileHash) : - """ - Get the accession number from its hash. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg fileHash: The hash of a GenBank record - @type fileHash: string - - @return: The accession number - @rtype: string - """ - - statement = """ - SELECT AccNo - FROM GBInfo - WHERE hash = %s; - """, fileHash - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getGBFromHash - - def getGBFromGI(self, GI) : - """ - Get the accession number from its GI number, typically this only - affects RefSeq sequences. 
- - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg GI: The GI number of a GenBank record - @type GI: string - - @return: The accession number - @rtype: string - """ - - statement = """ - SELECT AccNo - FROM GBInfo - WHERE GI = %s; - """, GI - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getGBFromGI - - def getGBSince(self, created_since): - """ - Get all accession number entries with creation date {created_since} - or later. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg created_since: Only entries with later creation dates are returned. - @type created_since: datatime.datetime - - @return: The accession number - @rtype: string - """ - statement = """ - SELECT AccNo, GI, hash, ChrAccVer, ChrStart, - ChrStop, orientation, url, created - FROM GBInfo - WHERE created >= %s; - """, created_since - - return self.query(statement) - #getGBSince - - def getLoc(self, accNo) : - """ - Get the slicing information of an accession number, typically this - only affects UD numbers. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg accNo: The accession number of a genbank record - @type accNo: string - - @return: The slicing information: - - ChrAccVer ; Accession number of the chromosome - - ChrStart ; Start position of the slice - - ChrStop ; End position of the slice - - orientation ; Orientation of the slice (1 = forward, - 2 = reverse complement) - @rtype: list - """ - - statement = """ - SELECT ChrAccVer, ChrStart, ChrStop, orientation - FROM GBInfo - WHERE AccNo = %s; - """, accNo - - ret = self.query(statement) - if ret : - return list(ret[0]) - return None - #getLoc - - def getHash(self, accNo) : - """ - Get the hash of a GenBank record identified by an accession number. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. 
- - @arg accNo: The accession number of a genbank record - @type accNo: string - - @return: The hash of the GenBank record - @rtype: string - """ - - statement = """ - SELECT hash - FROM GBInfo - WHERE AccNo = %s; - """, accNo - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getHash - - def getUrl(self, accNo) : - """ - Get the URL of an accession number, typically this only affects - uploaded UD numbers. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg accNo: The accession number of a genbank record - @type accNo: string - - @return: The URL of the GenBank record - @rtype: string - """ - - statement = """ - SELECT url - FROM GBInfo - WHERE AccNo = %s; - """, accNo - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getHash - - def getGI(self, accNo) : - """ - Get the GI number that is connected to the accession number. - - SQL tables from internalDb: - - GBInfo ; Information about cached and uploaded GenBank files. - - @arg accNo: The accession number - @type accNo: string - - @return: GI number - @rtype: string - """ - - statement = """ - SELECT GI - FROM GBInfo - WHERE AccNo = %s; - """, accNo - - ret = self.query(statement) - if ret : - return ret[0][0] - return None - #getGI - - def getProtAcc(self, mrnaAcc) : - """ - Gets the protein accession number for the given mRNA accession - number. - - SQL tables from internalDb: - - Link ; mRNA and associated protein IDs. - - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - - @return: The protein accession number - @rtype: string - - @raise: IndexError if no link is stored in the database (recently). 
- """ - statement = """ - SELECT protAcc - FROM Link - WHERE mrnaAcc = %s - AND - ( - (protAcc IS NULL AND - created >= DATE_SUB(CURDATE(), INTERVAL %s DAY)) - OR - (protAcc IS NOT NULL AND - created >= DATE_SUB(CURDATE(), INTERVAL %s DAY)) - ); - """, (mrnaAcc, - settings.PROTEIN_LINK_NONE_LIFETIME, - settings.PROTEIN_LINK_LIFETIME) - - ret = self.query(statement) - return ret[0][0] - #getProtAcc - - def getmrnaAcc(self, protAcc) : - """ - Gets the mRNA accession number for a given protein accession number. - - SQL tables from internalDb: - - Link ; mRNA and associated protein IDs. - - @arg protAcc: The protein ID - @type protAcc: string - - @return: The mRNA accession number - @rtype: string - - @raise: IndexError if no link is stored in the database (recently). - """ - statement = """ - SELECT mrnaAcc - FROM Link - WHERE protAcc = %s - AND - ( - (protAcc IS NULL AND - created >= DATE_SUB(CURDATE(), INTERVAL %s DAY)) - OR - (protAcc IS NOT NULL AND - created >= DATE_SUB(CURDATE(), INTERVAL %s DAY)) - ); - """, (protAcc, - settings.PROTEIN_LINK_NONE_LIFETIME, - settings.PROTEIN_LINK_LIFETIME) - - ret = self.query(statement) - return ret[0][0] - #getmrnaAcc - - def insertLink(self, mrnaAcc, protAcc) : - """ - Inserts the given mRNA and protein accession numbers into the Link - table. If a record already exists for this mrnaAcc value, it is - replaced by the new data. - - SQL tables from internalDb: - - Link ; mRNA and associated protein IDs. - - @arg protAcc: The protein ID - @type protAcc: string - @arg mrnaAcc: The ID of an mRNA - @type mrnaAcc: string - """ - - statement = """ - REPLACE INTO Link (mrnaAcc, protAcc) - VALUES (%s, %s); - """, (mrnaAcc, protAcc) - - self.query(statement) - #insertLink -#Cache - - class Counter(Db): """ Database functions for the service counters. 
diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 37835cf723f56d7bf3102329003df779ffe47029..7c224e4894a6052f6c6d71aec980e36fe24175a4 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -24,9 +24,12 @@ from Bio.Alphabet import ProteinAlphabet from xml.dom import DOMException, minidom from xml.parsers import expat from httplib import HTTPException, IncompleteRead +from sqlalchemy.orm.exc import NoResultFound from mutalyzer import util from mutalyzer.config import settings +from mutalyzer.db import session +from mutalyzer.db.models import Reference from mutalyzer.parsers import lrg from mutalyzer.parsers import genbank @@ -67,7 +70,7 @@ class Retriever(object) : - LogMsg(filename, message) ; Log a message. """ - def __init__(self, output, database) : + def __init__(self, output) : """ Use variables from the configuration file for some simple settings. Make the cache directory if it does not exist yet. @@ -78,7 +81,6 @@ class Retriever(object) : @type database: """ self._output = output - self._database = database if not os.path.isdir(settings.CACHE_DIR) : os.mkdir(settings.CACHE_DIR) Entrez.email = settings.EMAIL @@ -218,19 +220,26 @@ class Retriever(object) : @return: filename @rtype: string """ + try: + reference = Reference.query.filter_by(accession=name).one() + currentmd5sum = reference.checksum + except NoResultFound: + currentmd5sum = None - currentmd5sum = self._database.getHash(name) if currentmd5sum : md5sum = self._calcHash(raw_data) if md5sum != currentmd5sum : self._output.addMessage(__file__, -1, "WHASH", "Warning: Hash of %s changed from %s to %s." 
% ( name, currentmd5sum, md5sum)) - self._database.updateHash(name, md5sum) + Reference.query.filter_by(accession=name).update({'checksum': md5sum}) + session.commit() #if else : - self._database.insertGB(name, GI, - self._calcHash(raw_data), None, 0, 0, 0, None) + reference = Reference(name, self._calcHash(raw_data), + geninfo_identifier=GI) + session.add(reference) + session.commit() return self._nametofile(name) #_updateDBmd5 @@ -326,12 +335,12 @@ class GenBankRetriever(Retriever): """ """ - def __init__(self, output, database): + def __init__(self, output): """ @todo: Documentation. """ # Recall init of parent - Retriever.__init__(self, output, database) + Retriever.__init__(self, output) self.fileType = "gb" # Child specific init #__init__ @@ -511,11 +520,18 @@ class GenBankRetriever(Retriever): if stop - start > settings.MAX_FILE_SIZE: return None + slice_orientation = ['forward', 'reverse'][orientation - 1] + # Check whether we have seen this slice before. - UD = self._database.getGBFromLoc(accno, start, stop, orientation) - if UD : # This has been requested before. - if os.path.isfile(self._nametofile(UD)) : # It's still present. - return UD + try: + reference = Reference.query.filter_by( + slice_accession=accno, slice_start=start, slice_stop=stop, + slice_orientation=slice_orientation).one() + except NoResultFound: + reference = None + else: + if os.path.isfile(self._nametofile(reference.accession)) : # It's still present. + return reference.accession # It's not present, so download it. try: @@ -534,21 +550,28 @@ class GenBankRetriever(Retriever): # Calculate the hash of the downloaded file. md5sum = self._calcHash(raw_data) - if UD : # We have seen this one before. - currentmd5sum = self._database.getHash(UD) + if reference is not None: # We have seen this one before. + UD, currentmd5sum = reference.accession, reference.checksum + if md5sum != currentmd5sum : self._output.addMessage(__file__, -1, "WHASH", "Warning: Hash of %s changed from %s to %s." 
% ( UD, currentmd5sum, md5sum)) - self._database.updateHash(UD, md5sum) + Reference.query.filter_by(accession=UD).update({'checksum': md5sum}) + session.commit() #if else : # We haven't seen it before, so give it a name. UD = self._newUD() - self._database.insertGB(UD, None, md5sum, accno, start, - stop, orientation, None) + slice_orientation = ['forward', 'reverse'][orientation - 1] + reference = Reference(UD, md5sum, slice_accession=accno, + slice_start=start, slice_stop=stop, + slice_orientation=slice_orientation) + session.add(reference) + session.commit() #else - return self.write(raw_data, UD, 0) and UD + if self.write(raw_data, reference.accession, 0): + return str(reference.accession) #retrieveslice def retrievegene(self, gene, organism, upstream, downstream) : @@ -628,9 +651,9 @@ class GenBankRetriever(Retriever): #if # Figure out the orientation of the gene. - orientation = "1" + orientation = 1 if ChrStart > ChrStop : # Swap start and stop. - orientation = "2" + orientation = 2 temp = ChrStart ChrStart = ChrStop - downstream # Also take care of the flanking ChrStop = temp + upstream + 1 # sequences. 
@@ -663,17 +686,21 @@ class GenBankRetriever(Retriever): if 512 < length < settings.MAX_FILE_SIZE: raw_data = handle.read() md5sum = self._calcHash(raw_data) - UD = self._database.getGBFromHash(md5sum) - if UD: #hash found - if not os.path.isfile(self._nametofile(UD)): - UD = self.write(raw_data, UD, 0) and UD - else: + + try: + reference = Reference.query.filter_by(checksum=md5sum).one() + except NoResultFound: UD = self._newUD() if not os.path.isfile(self._nametofile(UD)): - UD = self.write(raw_data, UD, 0) and UD + UD = self.write(raw_data, UD, 0) and str(UD) if UD: #Parsing went OK, add to DB - self._database.insertGB(UD, None, md5sum, - None, 0, 0, 0, url) + reference = Reference(UD, md5sum, download_url=url) + session.add(reference) + session.commit() + else: + UD = str(reference.accession) + if not os.path.isfile(self._nametofile(UD)): + UD = self.write(raw_data, UD, 0) and UD return UD #Returns the UD or None #if else : @@ -702,84 +729,111 @@ class GenBankRetriever(Retriever): @rtype: string????? """ md5sum = self._calcHash(raw_data) - UD = self._database.getGBFromHash(md5sum) - if not UD : + + try: + reference = Reference.query.filter_by(checksum=md5sum).one() + except NoResultFound: UD = self._newUD() if self.write(raw_data, UD, 0): - self._database.insertGB(UD, None, md5sum, None, 0, 0, 0, None) + reference = Reference(UD, md5sum) + session.add(reference) + session.commit() return UD - #if else: - if os.path.isfile(self._nametofile(UD)): - return UD + if os.path.isfile(self._nametofile(reference.accession)): + return reference.accession else: - return self.write(raw_data, UD, 0) and UD + return self.write(raw_data, reference.accession, 0) and str(reference.accession) #uploadrecord - def loadrecord(self, identifier) : + def loadrecord(self, identifier): """ - Load a record and return it. - If the filename associated with the accession number is not found - in the cache, try to re-download it. + Load a RefSeq record and return it. 
- @arg identifier: An accession number - @type identifier: string + The record is found by trying the following options in order: + + 1. Returned from the cache if it is there. + 2. Re-created (if it was created by slicing) or re-downloaded (if it + was created by URL) if we have information on its source in the + database. + 3. Fetched from the NCBI. + + :arg identifier: A RefSeq accession number or geninfo identifier (GI). + :type identifier: string - @return: A GenBank.Record record - @rtype: object + :return: A parsed RefSeq record or `None` if no record could be found + for the given identifier. + :rtype: mutalyzer.GenRecord.Record """ - if (identifier[0].isdigit()) : # This is a GI identifier. - name = self._database.getGBFromGI(identifier) - if name is None: - self._output.addMessage(__file__, 4, "ERETR", - "Unknown reference: %s" % identifier) - return - else : - name = identifier + if identifier[0].isdigit(): + # This is a GI number (geninfo identifier). + reference = Reference.query \ + .filter_by(geninfo_identifier=identifier) \ + .first() + else: + # This is a RefSeq accession number. + reference = Reference.query \ + .filter_by(accession=identifier) \ + .first() - # Make a filename based upon the identifier. - filename = self._nametofile(name) + if reference is None: + # We don't know it, fetch it from NCBI. + filename = self.fetch(identifier) - if not os.path.isfile(filename) : # We can't find the file. - md5 = self._database.getHash(name) - if md5: # We have seen it before though. - Loc = self._database.getLoc(name) # Try to find the location. - if not Loc[0]: # No location found. - url = self._database.getUrl(name) # Try to find an URL. - if not url : - if self._database.getGI(name) : # It was from NCBI. 
- filename = self.fetch(name) - else : - self._output.addMessage(__file__, 4, "ERETR", - "Please upload this sequence again.") - filename = None - #if - else : # This used to be a downloaded seq - filename = self.downloadrecord(url) and filename - #if - else : # This used to be a slice. - filename = self.retrieveslice(*Loc) and filename - #if - else : # Never seen this name before. - filename = self.fetch(name) - #else - #if + else: + # We have seen it before. + filename = self._nametofile(reference.accession) + + if os.path.isfile(filename): + # It is still in the cache, so filename is valid. + pass + + elif reference.slice_accession: + # It was previously created by slicing. + cast_orientation = {None: None, + 'forward': 1, + 'reverse': 2} + if not self.retrieveslice(reference.slice_accession, + reference.slice_start, + reference.slice_stop, + cast_orientation[reference.slice_orientation]): + filename = None + + elif reference.download_url: + # It was previously created by URL. + if not self.downloadrecord(reference.download_url): + filename = None + + elif reference.geninfo_identifier: + # It was previously fetched from NCBI. + filename = self.fetch(reference.accession) - # If filename is None an error occured + else: + # It was previously created by uploading. + self._output.addMessage(__file__, 4, 'ERETR', + 'Please upload this sequence again.') + filename = None + + # If filename is None, we could not retrieve the record. if filename is None: - #Notify batch to skip all instance of identifier - self._output.addOutput("BatchFlags", ("S1", identifier)) + # Notify batch job to skip all instance of identifier. + self._output.addOutput('BatchFlags', ('S1', identifier)) return None # Now we have the file, so we can parse it. 
GenBankParser = genbank.GBparser() record = GenBankParser.create_record(filename) - record.id = name - # Todo: This will change once we support protein references + if reference: + record.id = reference.accession + else: + record.id = record.source_id + + # Todo: This will change once we support protein references. if isinstance(record.seq.alphabet, ProteinAlphabet): - self._output.addMessage(__file__, 4, 'ENOTIMPLEMENTED', - 'Protein reference sequences are not supported.') + self._output.addMessage( + __file__, 4, 'ENOTIMPLEMENTED', + 'Protein reference sequences are not supported.') return None return record @@ -795,7 +849,7 @@ class LRGRetriever(Retriever): the cache and return the record. """ - def __init__(self, output, database): + def __init__(self, output): #TODO documentation """ Initialize the class. @@ -807,7 +861,7 @@ class LRGRetriever(Retriever): @type database: """ # Recall init of parent - Retriever.__init__(self, output, database) + Retriever.__init__(self, output) self.fileType = "xml" # Child specific init #__init__ @@ -912,14 +966,22 @@ class LRGRetriever(Retriever): #Do an md5 check md5sum = self._calcHash(raw_data) - md5db = self._database.getHash(lrgID) + try: + reference = Reference.query.filter_by(accession=lrgID).one() + md5db = reference.checksum + except NoResultFound: + md5db = None + if md5db is None: - self._database.insertLRG(lrgID, md5sum, url) + reference = Reference(lrgID, md5sum, download_url=url) + session.add(reference) + session.commit() elif md5sum != md5db: #hash has changed for the LRG ID self._output.addMessage(__file__, -1, "WHASH", "Warning: Hash of %s changed from %s to %s." 
% ( lrgID, md5db, md5sum)) - self._database.updateHash(lrgID, md5sum) + Reference.query.filter_by(accession=lrgID).update({'checksum': md5sum}) + session.commit() else: #hash the same as in db pass diff --git a/mutalyzer/Scheduler.py b/mutalyzer/Scheduler.py index b675c4b9d308ed7920023dfa788710f33b70daee..3fd52c961ad0f5b8b527f20457cb8cf6181d2ea0 100644 --- a/mutalyzer/Scheduler.py +++ b/mutalyzer/Scheduler.py @@ -659,7 +659,7 @@ Mutalyzer batch scheduler""" % url) descriptions = [] if not skip : - R = Retriever.Retriever(O, None) + R = Retriever.Retriever(O) descriptions = R.snpConvert(cmd) # Todo: Is output ok? diff --git a/mutalyzer/config/default_settings.py b/mutalyzer/config/default_settings.py index 03dd2a108dca87aeb530beab889a7b3a2c295206..9670dd5542ef3d385bfe56cfdb20d57b25f6b776 100644 --- a/mutalyzer/config/default_settings.py +++ b/mutalyzer/config/default_settings.py @@ -68,13 +68,12 @@ LRG_PREFIX_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/' # Allow for this fraction of errors in batch jobs. BATCH_JOBS_ERROR_THRESHOLD = 0.05 -# Number of days a cached transcript->protein link from the NCBI is considered -# valid. -PROTEIN_LINK_LIFETIME = 30 +# Expiration time for transcript->protein links from the NCBI (in seconds). +PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 30 -# Number of days a cached nonexisting transcript->protein link from the NCBI -# is considered valid. -PROTEIN_LINK_NONE_LIFETIME = 5 +# Expiration time for negative transcript->protein links from the NCBI (in +# seconds). +NEGATIVE_PROTEIN_LINK_EXPIRATION = 60 * 60 * 24 * 5 # Is Piwik enabled? 
PIWIK = False diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index c3fc9b4b156559b5afe9cd5a9ef352923db494ba..33587f8ab410566201efdae6bbdcddf4a8b07570 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -8,7 +8,8 @@ import sqlite3 import uuid from sqlalchemy import (event, Column, Boolean, BigInteger, DateTime, ForeignKey, - Integer, Numeric, String, Table, Text, Index, Enum) + Integer, Numeric, String, Table, Text, Index, Enum, + UniqueConstraint) from sqlalchemy.engine import Engine from sqlalchemy.orm import relationship, backref @@ -41,7 +42,6 @@ class BatchJob(db.Base): __tablename__ = 'batch_jobs' __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} - # Todo: JobId was generated by code to get a free filename in the cache... id = Column(Integer, primary_key=True) #: Email address of user who submitted the job. @@ -61,7 +61,7 @@ class BatchJob(db.Base): #: Identifier to use in the job result filename and thus the URL for #: downloading it. We don't use the auto-incrementing `id` field for this, #: since it can be guessed by any user. - result_id = Column(String(50), index=True, unique=True) + result_id = Column(String(50), nullable=False, index=True, unique=True) #: Date and time of creation. added = Column(DateTime) @@ -100,7 +100,7 @@ class BatchQueueItem(db.Base): #: We simply store the concatenation of these flags. flags = Column(String(20), nullable=False) - # The :class:`BatchJob` for this item. + #: The :class:`BatchJob` for this item. batch_job = relationship( BatchJob, backref=backref('batch_jobs', lazy='dynamic', @@ -120,6 +120,103 @@ Index('batch_queue_item_with_batch_job', BatchQueueItem.batch_job_id, BatchQueueItem.id) +class Reference(db.Base): + """ + Cached information about a reference sequence. 
+ """ + __tablename__ = 'references' + __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} + + id = Column(Integer, primary_key=True) + + #: Accession number for this reference, including the version number if + #: applicable (e.g., AL449423.14, NM_000059.3, UD_138781341344). + accession = Column(String(20), nullable=False, index=True, unique=True) + + #: MD5 checksum of the reference file. + checksum = Column(String(32), nullable=False, index=True, unique=True) + + #: The corresponding GI number, if available. + geninfo_identifier = Column(String(13), index=True, unique=True) + + #: The accession number from which we took a slice, if available. + slice_accession = Column(String(20)) + + #: The start position on the accession number from which we took a slice, + #: if available. + slice_start = Column(Integer) + + #: The stop position on the accession number from which we took a slice, + #: if available. + slice_stop = Column(Integer) + + #: The orientation on the accession number from which we took a slice, if + #: available. + slice_orientation = Column(Enum('forward', 'reverse', + name='slice_orentation')) + + #: The URL from which the reference file was downloaded, if available. + download_url = Column(String(255), index=True, unique=True) + + #: Date and time of creation. 
+ added = Column(DateTime) + + def __init__(self, accession, checksum, geninfo_identifier=None, + slice_accession=None, slice_start=None, slice_stop=None, + slice_orientation=None, download_url=None): + self.accession = accession + self.checksum = checksum + self.geninfo_identifier = geninfo_identifier + self.slice_accession = slice_accession + self.slice_start = slice_start + self.slice_stop = slice_stop + self.slice_orientation = slice_orientation + self.download_url = download_url + self.added = datetime.now() + + def __repr__(self): + return '<Reference %r>' % self.accession + + +Index('reference_slice', + Reference.slice_accession, Reference.slice_start, Reference.slice_stop, + Reference.slice_orientation, + unique=True) + + +# Todo: Perhaps it is a better fit to implement this with Redis. +class TranscriptProteinLink(db.Base): + """ + Cached link between a transcript and protein reference. + """ + __tablename__ = 'transcript_protein_links' + __table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'} + + id = Column(Integer, primary_key=True) + + #: Accession number for the transcript, not including the version number + #: (e.g., NM_018195, XM_005270562, NR_015380). + transcript_accession = Column(String(20), nullable=False, index=True, + unique=True) + + #: Accession number for the protein, not including the version number + #: (e.g., NP_060665, XP_005258635). If `NULL`, the record states that no + #: protein is linked to the transcript by the NCBI. + protein_accession = Column(String(20), index=True) + + #: Date and time of creation. 
+ added = Column(DateTime) + + def __init__(self, transcript_accession, protein_accession=None): + self.transcript_accession = transcript_accession + self.protein_accession = protein_accession + self.added = datetime.now() + + def __repr__(self): + return '<TranscriptProteinLink transcript=%r protein=%r>' \ + % (self.transcript_accession, self.protein_accession) + + def create_all(): db.Base.metadata.drop_all(db.session.get_bind()) db.Base.metadata.create_all(db.session.get_bind()) diff --git a/mutalyzer/db/queries.py b/mutalyzer/db/queries.py index 03851d85f1e318e315f444d664791e268fa4a31c..f5ab7be3533055174160830cbebcc2c737bf281b 100644 --- a/mutalyzer/db/queries.py +++ b/mutalyzer/db/queries.py @@ -3,8 +3,13 @@ Queries on database models. """ +from datetime import datetime, timedelta + +from sqlalchemy import and_, or_ + +from mutalyzer.config import settings from mutalyzer.db import session -from mutalyzer.db.models import BatchQueueItem +from mutalyzer.db.models import BatchQueueItem, TranscriptProteinLink def pop_batch_queue_item(batch_job): @@ -42,3 +47,50 @@ def pop_batch_queue_item(batch_job): session.commit() return item, flags + + +def get_transcript_protein_link(transcript_accession): + """ + Get a cached link between a transcript and a protein that is not expired + according to the configuration settings `PROTEIN_LINK_EXPIRATION` and + `NEGATIVE_PROTEIN_LINK_EXPIRATION`. + + Note that the link may be negative, i.e., the knowledge that no link + exists can also be cached. In that case, the `protein_accession` field of + the resulting `TranscriptProteinLink` object is `None`. + + Returns `None` if no link (positive or negative) is found. 
+ """ + link_datetime = datetime.now() - \ + timedelta(seconds=settings.PROTEIN_LINK_EXPIRATION) + negative_link_datetime = datetime.now() - \ + timedelta(seconds=settings.NEGATIVE_PROTEIN_LINK_EXPIRATION) + + return TranscriptProteinLink.query \ + .filter_by(transcript_accession=transcript_accession) \ + .filter(or_( + and_(TranscriptProteinLink.protein_accession != None, + TranscriptProteinLink.added >= link_datetime), + and_(TranscriptProteinLink.protein_accession == None, + TranscriptProteinLink.added >= negative_link_datetime))) \ + .first() + + +def update_transcript_protein_link(transcript_accession, + protein_accession=None): + """ + Update cached link between a transcript and a protein, or create it if it + doesn't exist yet. + """ + link = TranscriptProteinLink.query \ + .filter_by(transcript_accession=transcript_accession) \ + .first() + + if link is not None: + link.protein_accession = protein_accession + link.added = datetime.now() + else: + link = TranscriptProteinLink(transcript_accession, protein_accession) + session.add(link) + + session.commit() diff --git a/mutalyzer/entrypoints/cache_sync.py b/mutalyzer/entrypoints/cache_sync.py index f238ce6273ba3d6eb813669d34748ae4613298bc..fc2c6bee46035ddffdd5a4b07f7020fdd331364a 100644 --- a/mutalyzer/entrypoints/cache_sync.py +++ b/mutalyzer/entrypoints/cache_sync.py @@ -10,7 +10,6 @@ This program is intended to be run daily from cron. Example: import argparse -from .. import Db from .. import output from .. import sync @@ -20,9 +19,8 @@ def sync_cache(remote_wsdl, url_template, history=7): Synchronize the database cache with other Mutalyzer instances. 
""" output = output.Output(__file__) - database = Db.Cache() - cache_sync = sync.CacheSync(output, database) + cache_sync = sync.CacheSync(output) cache_sync.sync_with_remote(remote_wsdl, url_template, history) diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 98bfe3260dfc683085f0ea8acfc16a4d02753fd1..217b925eabc226e60134b0aa4e6a3e21caf6a6df 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -1151,8 +1151,7 @@ class ReferenceUpdater(Updater): (default: False). @type overwrite: bool """ - cache = Db.Cache() - retriever = Retriever.GenBankRetriever(output, cache) + retriever = Retriever.GenBankRetriever(output) record = retriever.loadrecord(reference) transcripts = [] diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 2e6c1338e3dbfea10699e9aa9072613d4fcbfa3a..ca9982824df6d42421c4b8da1509bd118c7e1844 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -10,10 +10,11 @@ from itertools import izip_longest from Bio import SeqIO, Entrez from Bio.Alphabet import ProteinAlphabet +from sqlalchemy import and_, or_ from mutalyzer.config import settings -from mutalyzer import Db -from mutalyzer.GenRecord import PList, Locus, Gene, Record, GenRecord +from mutalyzer.db import queries +from mutalyzer.GenRecord import PList, Locus, Gene, Record # Regular expression used to find version number in locus tag @@ -58,12 +59,8 @@ class GBparser(): def __init__(self): """ Initialise the class - - Private variables: - - __database ; Db.Cache object """ Entrez.email = settings.EMAIL - self.__database = Db.Cache() #__init__ def __location2pos(self, location): @@ -137,10 +134,9 @@ class GBparser(): @return: Accession number of a protein or None if nothing can be found @rtype: string """ - try: - return self.__database.getProtAcc(transcriptAcc) - except IndexError: - pass + link = queries.get_transcript_protein_link(transcriptAcc) + if link is not None: + return link.protein_accession handle = Entrez.esearch(db = 
"nucleotide", term = transcriptAcc) result = Entrez.read(handle) @@ -154,7 +150,7 @@ class GBparser(): handle.close() if not result[0]["LinkSetDb"] : - self.__database.insertLink(transcriptAcc, None) + queries.update_transcript_protein_link(transcriptAcc) return None proteinGI = result[0]["LinkSetDb"][0]["Link"][0]["Id"] @@ -164,7 +160,7 @@ class GBparser(): proteinAcc = handle.read().split('.')[0] handle.close() - self.__database.insertLink(transcriptAcc, proteinAcc) + queries.update_transcript_protein_link(transcriptAcc, proteinAcc) return proteinAcc #__transcriptToProtein diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index ed83ea9ba1f40c6ec29af2cc57299ef4619252d0..66482261058419a8f34f55bef71a5d5d9247ba41 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -871,12 +871,11 @@ class MutalyzerService(ServiceBase): Todo: documentation. """ O = Output(__file__) - D = Db.Cache() O.addMessage(__file__, -1, "INFO", "Received request getGeneAndTranscript(%s, %s)" % ( genomicReference, transcriptReference)) - retriever = Retriever.GenBankRetriever(O, D) + retriever = Retriever.GenBankRetriever(O) record = retriever.loadrecord(genomicReference) GenRecordInstance = GenRecord.GenRecord(O) @@ -943,12 +942,11 @@ class MutalyzerService(ServiceBase): - product """ O = Output(__file__) - D = Db.Cache() O.addMessage(__file__, -1, "INFO", "Received request getTranscriptsAndInfo(%s, %s)" % ( genomicReference, geneName)) - retriever = Retriever.GenBankRetriever(O, D) + retriever = Retriever.GenBankRetriever(O) record = retriever.loadrecord(genomicReference) # Todo: If loadRecord failed (e.g. DTD missing), we should abort here. @@ -1071,8 +1069,7 @@ class MutalyzerService(ServiceBase): Todo: documentation, error handling, argument checking, tests. 
""" O = Output(__file__) - D = Db.Cache() - retriever = Retriever.GenBankRetriever(O, D) + retriever = Retriever.GenBankRetriever(O) O.addMessage(__file__, -1, "INFO", "Received request sliceChromosomeByGene(%s, %s, %s, %s)" % ( @@ -1104,8 +1101,7 @@ class MutalyzerService(ServiceBase): @type orientation: integer """ O = Output(__file__) - D = Db.Cache() - retriever = Retriever.GenBankRetriever(O, D) + retriever = Retriever.GenBankRetriever(O) O.addMessage(__file__, -1, "INFO", "Received request sliceChromosome(%s, %s, %s, %s)" % ( @@ -1204,8 +1200,7 @@ class MutalyzerService(ServiceBase): output.addMessage(__file__, -1, 'INFO', 'Received request getCache') - database = Db.Cache() - sync = CacheSync(output, database) + sync = CacheSync(output) cache = sync.local_cache(created_since) @@ -1241,7 +1236,7 @@ class MutalyzerService(ServiceBase): counter = Db.Counter() counter.increment('snpconvert', 'webservice') - retriever = Retriever.Retriever(output, None) + retriever = Retriever.Retriever(output) descriptions = retriever.snpConvert(rs_id) output.addMessage(__file__, -1, 'INFO', diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index 2438096f70c5e725c3a4facb32ec2091ff622639..2122385919d8e2be8790ec1087dc0a4e0b249a0f 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -12,6 +12,8 @@ import urllib2 from suds.client import Client from mutalyzer.config import settings +from mutalyzer.db import session +from mutalyzer.db.models import Reference from mutalyzer import Retriever @@ -22,17 +24,14 @@ class CacheSync(object): """ Synchronize the database cache with other Mutalyzer instances. """ - def __init__(self, output, database): + def __init__(self, output): """ Instantiate the object. @arg output: An output object. @type output: mutalyzer.output.Output - @arg database: A database object. 
- @type database: mutalyzer.Db.Cache """ self._output = output - self._database = database def local_cache(self, created_since=None): """ @@ -50,26 +49,30 @@ class CacheSync(object): created_since = datetime.today() - \ timedelta(days=DEFAULT_CREATED_SINCE_DAYS) - entries = self._database.getGBSince(created_since) + references = Reference.query.filter(Reference.added >= created_since) cache = [] + cast_orientation = {None: None, + 'forward': 1, + 'reverse': 2} + # Translate each entry to a dictionary and check if it is cached on # our filesystem. - for entry in entries: + for reference in references: # Note that this way we only include Genbank files, not LRG files. cached = None if os.path.isfile(os.path.join(settings.CACHE_DIR, - '%s.gb.bz2' % entry[0])): - cached = '%s.gb' % entry[0] - cache.append({'name': entry[0], - 'gi': entry[1], - 'hash': entry[2], - 'chromosomeName': entry[3], - 'chromosomeStart': entry[4], - 'chromosomeStop': entry[5], - 'chromosomeOrientation': entry[6], - 'url': entry[7], - 'created': entry[8], + '%s.gb.bz2' % reference.accession)): + cached = '%s.gb' % reference.accession + cache.append({'name': reference.accession, + 'gi': reference.geninfo_identifier, + 'hash': reference.checksum, + 'chromosomeName': reference.slice_accession, + 'chromosomeStart': reference.slice_start, + 'chromosomeStop': reference.slice_stop, + 'chromosomeOrientation': cast_orientation[reference.slice_orientation], + 'url': reference.download_url, + 'created': reference.added, 'cached': cached}) return cache @@ -133,7 +136,7 @@ class CacheSync(object): handle.close() # Store remote data - retriever = Retriever.GenBankRetriever(self._output, self._database) + retriever = Retriever.GenBankRetriever(self._output) retriever.write(data, name, 0) def sync_with_remote(self, remote_wsdl, url_template, @@ -167,20 +170,26 @@ class CacheSync(object): inserted = downloaded = 0 for entry in remote_cache: - if self._database.getHash(entry['name']): - continue - if 
self._database.getGBFromHash(entry['hash']): + try: + reference = Reference.query.filter_by(accession=entry['name']).one() + if reference.checksum is not None: + continue + except NoResultFound: + pass + + if Reference.query.filter_by(checksum=entry['hash']).count() > 0: + # Todo: Combine these queries. continue - if entry['gi'] and self._database.getGBFromGI(entry['gi']): + if entry['gi'] and Reference.query.filter_by(geninfo_identifier=entry['gi']).count() > 0: + # Todo: Combine these queries. continue - self._database.insertGB(entry['name'], - entry['gi'], - entry['hash'], - entry['chromosomeName'], - entry['chromosomeStart'], - entry['chromosomeStop'], - entry['chromosomeOrientation'], - entry['url']) + reference = Reference(entry['name'], entry['hash'], + geninfo_identifier=entry['gi'], + slice_accession=entry['chromosomeName'], + slice_start=entry['chromosomeStart'], + slice_stop=entry['chromosomeStop'], + slice_orientation=entry['chromosomeOrientation'], + download_url=entry['url']) inserted += 1 if not entry['chromosomeName'] and not entry['url'] \ and entry['cached']: diff --git a/mutalyzer/variantchecker.py b/mutalyzer/variantchecker.py index 67ef0a63c6d9a0e4e11e4aaaaa113de8af881937..8623c824e43a40ca329bc0609af512891852470d 100644 --- a/mutalyzer/variantchecker.py +++ b/mutalyzer/variantchecker.py @@ -27,7 +27,6 @@ from mutalyzer.mutator import Mutator from mutalyzer.mapping import Converter from mutalyzer import Retriever from mutalyzer import GenRecord -from mutalyzer import Db # Exceptions used (privately) in this module. 
@@ -1602,11 +1601,10 @@ def check_variant(description, output): gene_symbol = transcript_id = '' - database = Db.Cache() if parsed_description.LrgAcc: filetype = 'LRG' transcript_id = parsed_description.LRGTranscriptID - retriever = Retriever.LRGRetriever(output, database) + retriever = Retriever.LRGRetriever(output) else: filetype = 'GB' if parsed_description.Gene: @@ -1615,7 +1613,7 @@ def check_variant(description, output): if parsed_description.Gene.ProtIso: output.addMessage(__file__, 4, 'EPROT', 'Indexing by protein isoform is not supported.') - retriever = Retriever.GenBankRetriever(output, database) + retriever = Retriever.GenBankRetriever(output) retrieved_record = retriever.loadrecord(record_id) diff --git a/mutalyzer/website.py b/mutalyzer/website.py index e1ad823305284c1f8b2248137499a70851d07025..5ea2dd410e7d51fbfaeb5af66cc4c2ab4c32b628 100644 --- a/mutalyzer/website.py +++ b/mutalyzer/website.py @@ -433,7 +433,7 @@ class Snp: counter = Db.Counter() counter.increment('snpconvert', 'website') - retriever = Retriever.Retriever(output, None) + retriever = Retriever.Retriever(output) descriptions = retriever.snpConvert(rs_id) output.addMessage(__file__, -1, 'INFO', 'Finished request snpConvert(%s)' % rs_id) @@ -1296,8 +1296,7 @@ class Uploader: O = Output(__file__) IP = web.ctx["ip"] - D = Db.Cache() - R = Retriever.GenBankRetriever(O, D) + R = Retriever.GenBankRetriever(O) UD, errors = "", [] diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index 46f3977c8a826cb23dbc1e5738476c4ce2bd1c59..efe58c119ea081d6dd2ebb2718da7c049b1698d0 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -404,7 +404,7 @@ class TestServicesSoap(): """ r = self._call('runMutalyzer', 'NM_003002:c.274G>T') assert_equal(r.errors, 0) - assert_equal(r.referenceId, 'NM_003002') + assert_equal(r.referenceId, 'NM_003002.3') assert_equal(r.sourceId, 'NM_003002.3') assert_equal(r.sourceAccession, 'NM_003002') assert_equal(r.sourceVersion, '3') @@ 
-457,7 +457,7 @@ class TestServicesSoap(): """ r = self._call('runMutalyzer', 'NG_012772:g.18964del') assert_equal(r.errors, 0) - assert_equal(r.referenceId, 'NG_012772') + assert_equal(r.referenceId, 'NG_012772.3') assert_equal(r.sourceId, 'NG_012772.3') assert_equal(r.sourceAccession, 'NG_012772') assert_equal(r.sourceVersion, '3')