diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index 82fd0c406ea19b51d549a6a06c91bc760bcb7689..0d0b51cf6ffc34136cbc2320a55e034096acdb9a 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -1,5 +1,7 @@ """ Module for synchronizing the database with other Mutalyzer instances. + +Todo: add some logging to the output object. """ @@ -19,11 +21,18 @@ DEFAULT_CREATED_SINCE_DAYS = 7 class CacheSync(object): """ - Todo. + Synchronize the database cache with other Mutalyzer instances. """ def __init__(self, config, output, database): """ - Todo. + Instantiate the object. + + @arg config: A configuration object. + @type config: mutalyzer.config.Config + @arg output: An output object. + @type output: mutalyzer.output.Output + @arg database: A database object. + @type database: mutalyzer.Db.Cache """ self._config = config self._output = output @@ -31,33 +40,57 @@ class CacheSync(object): def local_cache(self, created_since=None): """ - Todo. + Get all entries in the local cache with creation date {created_since} + or later. + + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datetime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) """ if not created_since: created_since = datetime.today() - \ timedelta(days=DEFAULT_CREATED_SINCE_DAYS) - cache = self._database.getGBSince(created_since) - entries = [] + entries = self._database.getGBSince(created_since) + cache = [] - # For each entry, check if it is cached on our filesystem. - # Todo: refactor - for entry in cache: - e = list(entry) + # Translate each entry to a dictionary and check if it is cached on + # our filesystem. + for entry in entries: # Note that this way we only include Genbank files, not LRG files. 
- file_name = '%s.gb.bz2' % entry[0] - file_path = os.path.join(self._config.Retriever.cache, file_name) - if os.path.isfile(file_path): - e.append('%s.gb' % entry[0]) - else: - e.append(None) - entries.append(e) - - return entries + cached = None + if os.path.isfile(os.path.join(self._config.Retriever.cache, + '%s.gb.bz2' % entry[0])): + cached = '%s.gb' % entry[0] + cache.append({'name': entry[0], + 'gi': entry[1], + 'hash': entry[2], + 'chromosomeName': entry[3], + 'chromosomeStart': entry[4], + 'chromosomeStop': entry[5], + 'chromosomeOrientation': entry[6], + 'url': entry[7], + 'created': entry[8], + 'cached': cached} + + return cache def remote_cache(self, remote_wsdl, created_since=None): """ - Todo. + Get all entries in the remote cache with creation date {created_since} + or later. + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datetime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) """ if not created_since: created_since = datetime.today() - \ @@ -85,10 +118,30 @@ class CacheSync(object): def sync_with_remote(self, remote_wsdl, url_template, created_since=None): """ - Todo. + Synchronize the local cache with the remote cache. + + >>> wsdl = 'http://mutalyzer.nl/mutalyzer/services/?wsdl' + >>> template = 'http://mutalyzer.nl/mutalyzer/Reference/{file}' + >>> self.sync_with_remote(wsdl, template) + (14, 3) + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @arg url_template: Formatting string containing a {file} occurrence, + see example usage above. + @type url_template: string + @kwarg created_since: Only remote entries with this creation date or + later are considered. + @type created_since: datetime.datetime + + @return: The number of entries added to the local cache and the number + of cache files downloaded from the remote site. 
+ @rtype: tuple(int, int) """ remote_cache = self.remote_cache(remote_wsdl, created_since) + inserted = downloaded = 0 + for entry in remote_cache: if self._database.getHash(entry['name']): continue @@ -104,17 +157,23 @@ class CacheSync(object): entry['chromosomeStop'], entry['chromosomeOrientation'], entry['url']) - print 'inserting %s' % entry['name'] - if not entry['chromosomeName'] and not entry['url']: - if entry['cached']: - print 'downloading file from remote cache: %s' % (url_template % str(entry['cached'])) - self.store_remote_file(entry['name'], url_template % entry['cached']) - else: - print 'cannot download this file from remote cache' + inserted += 1 + if not entry['chromosomeName'] and not entry['url'] \ + and entry['cached']: + url = url_template.format(file=entry['cached']) + self.store_remote_file(entry['name'], url) + downloaded += 1 + + return inserted, downloaded def store_remote_file(self, name, url): """ - Todo. + Download a remote file located at {url} and store it as {name}. + + @arg name: Name to store the file under. + @type name: string + @arg url: Url to the remote file. + @type url: string """ if not re.match('^[\da-zA-Z\._-]+$', name): return @@ -125,5 +184,7 @@ class CacheSync(object): handle.close() # Store remote data - retriever = Retriever.GenBankRetriever(self._config.Retriever, self._output, self._database) + retriever = Retriever.GenBankRetriever(self._config.Retriever, + self._output, + self._database) retriever.write(data, name, 0)