From dfa57a20fa8a07c2862f122a1e080d57b8a6a22f Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Tue, 26 Jul 2011 14:36:55 +0000 Subject: [PATCH] Basic working cache sync. git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/gbinfo-sync-branch@316 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- extras/soap-tools/getcache.py | 4 ++ mutalyzer/Db.py | 8 ++-- mutalyzer/models.py | 2 +- mutalyzer/sync.py | 83 ++++++++++++++++++++++++----------- mutalyzer/webservice.py | 2 +- 5 files changed, 67 insertions(+), 32 deletions(-) diff --git a/extras/soap-tools/getcache.py b/extras/soap-tools/getcache.py index cf9baa9e..45e0be3e 100755 --- a/extras/soap-tools/getcache.py +++ b/extras/soap-tools/getcache.py @@ -48,4 +48,8 @@ if cache: if 'url' in r: print r.url print r.created + if r.cached: + print 'cached' + else: + print 'not cached' print diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index bb57f851..b608cffb 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -1035,10 +1035,10 @@ class Cache(Db) : return None #getGBFromGI - def getGB(self, created_since): + def getGBSince(self, created_since): """ - Get all accession number entries starting with creation date - {created_since}. + Get all accession number entries with creation date {created_since} + or later. SQL tables from internalDb: - GBInfo ; Information about cached and uploaded GenBank files. @@ -1057,7 +1057,7 @@ class Cache(Db) : """, created_since return self.query(statement) - #getGB + #getGBSince def getLoc(self, accNo) : """ diff --git a/mutalyzer/models.py b/mutalyzer/models.py index c9e32322..fba12fbd 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -233,5 +233,5 @@ class CacheEntry(ClassModel): chromosomeOrientation = Integer url = String created = Mandatory.DateTime - cached = Mandatory.Boolean + cached = String #CacheEntry diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index d5c7a6b7..82fd0c40 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -6,9 +6,13 @@ Module for synchronizing the database with other Mutalyzer instances. from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import os +import re from datetime import datetime, timedelta +import urllib2 from suds.client import Client +from mutalyzer import Retriever + DEFAULT_CREATED_SINCE_DAYS = 7 @@ -17,11 +21,12 @@ class CacheSync(object): """ Todo. """ - def __init__(self, config, database): + def __init__(self, config, output, database): """ Todo. """ self._config = config + self._output = output self._database = database def local_cache(self, created_since=None): @@ -31,16 +36,21 @@ class CacheSync(object): if not created_since: created_since = datetime.today() - \ timedelta(days=DEFAULT_CREATED_SINCE_DAYS) - cache = self._database.getGB(created_since) + cache = self._database.getGBSince(created_since) entries = [] # For each entry, check if it is cached on our filesystem. # Todo: refactor for entry in cache: - file_path = os.path.join(self._config.cache, '%s.bz2' % entry[0]) e = list(entry) - e.append(os.path.isfile(file_path)) + # Note that this way we only include Genbank files, not LRG files. + file_name = '%s.gb.bz2' % entry[0] + file_path = os.path.join(self._config.Retriever.cache, file_name) + if os.path.isfile(file_path): + e.append('%s.gb' % entry[0]) + else: + e.append(None) entries.append(e) return entries @@ -59,20 +69,21 @@ class CacheSync(object): """ Create a nice dictionary out of the CacheEntry object. """ - entry_dict = {'name': entry.name, - 'hash': entry.hash, - 'created': entry.created, - 'cached': bool(entry.cached)} - for attribute in ('gi', 'chromosomeName', 'chromosomeStart' - 'chromosomeStop', 'chromosomeOrientation', - 'url'): - entry_dict[attribute] = entry[attribute] \ + entry_dict = {'name': str(entry.name), + 'hash': str(entry.hash), + 'created': entry.created} + for attribute in ('gi', 'chromosomeName', 'url', 'cached'): + entry_dict[attribute] = str(entry[attribute]) \ + if attribute in entry else None + for attribute in ('chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation'): + entry_dict[attribute] = int(entry[attribute]) \ if attribute in entry else None return entry_dict return map(cache_entry_from_soap, cache.CacheEntry) - def sync_with_remote(self, remote_wsdl, remote_cache, created_since=None): + def sync_with_remote(self, remote_wsdl, url_template, created_since=None): """ Todo. """ @@ -81,18 +92,38 @@ class CacheSync(object): for entry in remote_cache: if self._database.getHash(entry['name']): continue - #self._database.insertGB(entry['name'], - # entry['gi'], - # entry['hash'], - # entry['chromosomeName'], - # entry['chromosomeStart'], - # entry['chromosomeStop'], - # entry['chromosomeOrientation'], - # entry['url']) - #print 'inserting %s' % entry['name'] - #print entry + if self._database.getGBFromHash(entry['hash']): + continue + if entry['gi'] and self._database.getGBFromGI(entry['gi']): + continue + self._database.insertGB(entry['name'], + entry['gi'], + entry['hash'], + entry['chromosomeName'], + entry['chromosomeStart'], + entry['chromosomeStop'], + entry['chromosomeOrientation'], + entry['url']) + print 'inserting %s' % entry['name'] if not entry['chromosomeName'] and not entry['url']: if entry['cached']: - print 'downloading file from remote cache: %s' % (remote_cache % entry['name']) - #else: - #print 'cannot download this file from remote cache' + print 'downloading file from remote cache: %s' % (url_template % str(entry['cached'])) + self.store_remote_file(entry['name'], url_template % entry['cached']) + else: + print 'cannot download this file from remote cache' + + def store_remote_file(self, name, url): + """ + Todo. + """ + if not re.match('^[\da-zA-Z\._-]+$', name): + return + + # Download remote data + handle = urllib2.urlopen(url) + data = handle.read() + handle.close() + + # Store remote data + retriever = Retriever.GenBankRetriever(self._config.Retriever, self._output, self._database) + retriever.write(data, name, 0) diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py index cd480fda..b3b00a42 100644 --- a/mutalyzer/webservice.py +++ b/mutalyzer/webservice.py @@ -893,7 +893,7 @@ class MutalyzerService(DefinitionBase): 'Received request getCache') database = Db.Cache(self._config.Db) - sync = CacheSync(self._config.Sync, database) + sync = CacheSync(self._config, output, database) cache = sync.local_cache(created_since) -- GitLab