From c7e609b9098626bbbdff285ff2fd99bb80adc7ba Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Wed, 27 Jul 2011 13:35:08 +0000 Subject: [PATCH] Fully functional remote cache sync. - Writes to log file. - Can be run from bin/mutalyzer-cache-sync. - Post-install script installs (by default disabled) cronjob. git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/gbinfo-sync-branch@318 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- bin/mutalyzer-cache-sync | 35 +++++++++----- extras/cron.d/mutalyzer-cache-sync | 2 + extras/post-install.sh | 3 ++ mutalyzer/config.py | 4 +- mutalyzer/sync.py | 73 +++++++++++++++++------------- mutalyzer/webservice.py | 21 ++++----- setup.py | 6 +-- 7 files changed, 84 insertions(+), 60 deletions(-) create mode 100644 extras/cron.d/mutalyzer-cache-sync diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync index a58d51ad..864ca5ba 100755 --- a/bin/mutalyzer-cache-sync +++ b/bin/mutalyzer-cache-sync @@ -3,35 +3,48 @@ """ Synchronize the database cache with other Mutalyzer instances. +Usage: + ./mutalyzer-cache-sync remote_wsdl url_template days + + remote_wsdl: Location of the remote WSDL description. + url_template: URL to remote downloads, where {file} is to be substituted + by the filename. + days: Number of days to go back in the remote cache. + This program is intended to be run daily from cron. Example: - 25 5 * * * mutalyzer-cache-sync + 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7 + 55 5 * * * mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' 7 """ +import sys + from mutalyzer.config import Config from mutalyzer.output import Output from mutalyzer.sync import CacheSync from mutalyzer import Db -def main(): +def cache_sync(remote_wsdl, url_template, days): """ Synchronize the database cache with other Mutalyzer instances. 
""" config = Config() output = Output(__file__, config.Output) - output.addMessage(__file__, -1, 'INFO', - 'Starting cache sync') - database = Db.Cache(config.Db) - sync = CacheSync(config.Sync, database) - - created_since = datetime.today() - timedelta(days=60) - sync.sync_with_remote(created_since) - output.addMessage(__file__, -1, 'INFO', 'Cache sync end') + sync = CacheSync(config.Retriever, output, database) + sync.sync_with_remote(remote_wsdl, url_template, days) if __name__ == '__main__': - main() + if len(sys.argv) < 4: + print __doc__.strip() + sys.exit(1) + try: + days = int(sys.argv[3]) + except ValueError: + print 'Last argument must be an integer.' + sys.exit(1) + cache_sync(sys.argv[1], sys.argv[2], int(sys.argv[3])) diff --git a/extras/cron.d/mutalyzer-cache-sync b/extras/cron.d/mutalyzer-cache-sync new file mode 100644 index 00000000..c58ea774 --- /dev/null +++ b/extras/cron.d/mutalyzer-cache-sync @@ -0,0 +1,2 @@ +# Synchronize the local cache with the live server every morning at 05:25 +#25 5 * * * www-data <MUTALYZER_BIN_CACHE_SYNC> 'http://www.mutalyzer.nl/2.0/services/?wsdl' 'http://www.mutalyzer.nl/2.0/Reference/{file}' 3 diff --git a/extras/post-install.sh b/extras/post-install.sh index b24e2c8b..4cff268e 100644 --- a/extras/post-install.sh +++ b/extras/post-install.sh @@ -21,6 +21,7 @@ set -e # directory to be used. 
PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') BIN_BATCHD=$(which mutalyzer-batchd) +BIN_CACHE_SYNC=$(which mutalyzer-cache-sync) BIN_UCSC_UPDATE=$(which mutalyzer-ucsc-update) BIN_WEBSITE=$(which mutalyzer-website.wsgi) BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) @@ -56,6 +57,8 @@ update-rc.d mutalyzer-batchd defaults 98 02 echo "Installing crontab" cp extras/cron.d/mutalyzer-ucsc-update /etc/cron.d/mutalyzer-ucsc-update sed -i -e "s@<MUTALYZER_BIN_UCSC_UPDATE>@${BIN_UCSC_UPDATE}@g" /etc/cron.d/mutalyzer-ucsc-update +cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync +sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync echo "Creating /etc/apache2/conf.d/mutalyzer.conf" cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf diff --git a/mutalyzer/config.py b/mutalyzer/config.py index 0314500a..f4c0c6b8 100644 --- a/mutalyzer/config.py +++ b/mutalyzer/config.py @@ -10,6 +10,7 @@ import tempfile from configobj import ConfigObj import mutalyzer +from mutalyzer import util class ConfigurationError(Exception): @@ -28,7 +29,6 @@ class Config(): class Batch(): pass class File(): pass class GenRecord(): pass - class Sync(): pass def __init__(self, filename=None): """ @@ -134,8 +134,6 @@ class Config(): self.GenRecord.spliceAlarm = int(config["spliceAlarm"]) self.GenRecord.spliceWarn = int(config["spliceWarn"]) - # Set the variables needed by the sync module. - self.Sync.cache = config["cache"] # If we are in a testing environment, use a temporary file for # logging and a temporary directory for the cache. diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py index 0d0b51cf..2a96446b 100644 --- a/mutalyzer/sync.py +++ b/mutalyzer/sync.py @@ -1,7 +1,5 @@ """ Module for synchronizing the database with other Mutalyzer instances. - -Todo: add some logging to the output object. """ @@ -28,7 +26,7 @@ class CacheSync(object): Instantiate the object. 
@arg config: A configuration object. - @type config: mutalyzer.config.Config + @type config: mutalyzer.config.Config.Retriever @arg output: An output object. @type output: mutalyzer.output.Output @arg database: A database object. @@ -62,7 +60,7 @@ class CacheSync(object): for entry in entries: # Note that this way we only include Genbank files, not LRG files. cached = None - if os.path.isfile(os.path.join(self._config.Retriever.cache, + if os.path.isfile(os.path.join(self._config.cache, '%s.gb.bz2' % entry[0])): cached = '%s.gb' % entry[0] cache.append({'name': entry[0], @@ -74,7 +72,7 @@ class CacheSync(object): 'chromosomeOrientation': entry[6], 'url': entry[7], 'created': entry[8], - 'cached': cached} + 'cached': cached}) return cache @@ -92,6 +90,9 @@ class CacheSync(object): @return: List of cache entries. @rtype: list(dictionary) """ + self._output.addMessage(__file__, -1, 'INFO', 'Getting remote cache' + ' from %s' % remote_wsdl) + if not created_since: created_since = datetime.today() - \ timedelta(days=DEFAULT_CREATED_SINCE_DAYS) @@ -116,7 +117,31 @@ class CacheSync(object): return map(cache_entry_from_soap, cache.CacheEntry) - def sync_with_remote(self, remote_wsdl, url_template, created_since=None): + def store_remote_file(self, name, url): + """ + Download a remote file located at {url} and store it as {name}. + + @arg name: Name to store the file under. + @type name: string + @arg url: Url to the remote file. + @type url: string + """ + if not re.match('^[\da-zA-Z\._-]+$', name): + return + + # Download remote data + handle = urllib2.urlopen(url) + data = handle.read() + handle.close() + + # Store remote data + retriever = Retriever.GenBankRetriever(self._config, + self._output, + self._database) + retriever.write(data, name, 0) + + def sync_with_remote(self, remote_wsdl, url_template, + days=DEFAULT_CREATED_SINCE_DAYS): """ Synchronize the local cache with the remote cache. 
@@ -130,14 +155,17 @@ class CacheSync(object): @arg url_template: Formatting string containing a {file} occurence, see examle usage above. @string url_template: string - @kwarg created_since: Only remote entries with this creation date or + @kwarg days: Only remote entries added this number of days ago or later are considered. - @type created_since: datatime.datetime + @type days: int @return: The number of entries added to the local cache and the number cache files downloaded from the remote site. @rtype: tuple(int, int) """ + self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync') + + created_since = datetime.today() - timedelta(days=days) remote_cache = self.remote_cache(remote_wsdl, created_since) inserted = downloaded = 0 @@ -164,27 +192,10 @@ class CacheSync(object): self.store_remote_file(entry['name'], url) downloaded += 1 - return inserted, downloaded - - def store_remote_file(self, name, url): - """ - Download a remote file located at {url} and store it as {name}. - - @arg name: Name to store the file under. - @type name: string - @arg url: Url to the remote file. - @type url: string - """ - if not re.match('^[\da-zA-Z\._-]+$', name): - return - - # Download remote data - handle = urllib2.urlopen(url) - data = handle.read() - handle.close() + self._output.addMessage(__file__, -1, 'INFO', + 'Inserted %d entries in the cache,' + ' downloaded %d files.' 
\ + % (inserted, downloaded)) + self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync') - # Store remote data - retriever = Retriever.GenBankRetriever(self._config.Retriever, - self._output, - self._database) - retriever.write(data, name, 0) + return inserted, downloaded diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py index b3b00a42..181c4af9 100644 --- a/mutalyzer/webservice.py +++ b/mutalyzer/webservice.py @@ -885,7 +885,10 @@ class MutalyzerService(DefinitionBase): @soap(DateTime, _returns = Array(CacheEntry)) def getCache(self, created_since=None): """ - Todo: documentation. + Get a list of entries from the local cache created since given date. + + This method is intended to be used by Mutalyzer itself to synchronize + the cache between installations on different servers. """ output = Output(__file__, self._config.Output) @@ -893,22 +896,16 @@ class MutalyzerService(DefinitionBase): 'Received request getCache') database = Db.Cache(self._config.Db) - sync = CacheSync(self._config, output, database) + sync = CacheSync(self._config.Retriever, output, database) cache = sync.local_cache(created_since) def cache_entry_to_soap(entry): e = CacheEntry() - (e.name, - e.gi, - e.hash, - e.chromosomeName, - e.chromosomeStart, - e.chromosomeStop, - e.chromosomeOrientation, - e.url, - e.created, - e.cached) = entry + for attr in ('name', 'gi', 'hash', 'chromosomeName', + 'chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation', 'url', 'created', 'cached'): + setattr(e, attr, entry[attr]) return e output.addMessage(__file__, -1, 'INFO', diff --git a/setup.py b/setup.py index daf316b0..62ea18aa 100644 --- a/setup.py +++ b/setup.py @@ -18,11 +18,11 @@ setup( packages=find_packages(exclude=['doc', 'extras', 'tests']), include_package_data=True, scripts=['bin/mutalyzer', - 'bin/mutalyzer-cache-sync', 'bin/mutalyzer-batchd', + 'bin/mutalyzer-cache-sync', 'bin/mutalyzer-ucsc-update', - 'bin/mutalyzer-website.wsgi', - 
'bin/mutalyzer-webservice.wsgi'], + 'bin/mutalyzer-webservice.wsgi', + 'bin/mutalyzer-website.wsgi'], zip_safe=False ) -- GitLab