From c7e609b9098626bbbdff285ff2fd99bb80adc7ba Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Wed, 27 Jul 2011 13:35:08 +0000
Subject: [PATCH] Fully functional remote cache sync.

- Writes to log file.
- Can be run from bin/mutalyzer-cache-sync.
- Post-install script installs a cronjob (disabled by default).



git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/gbinfo-sync-branch@318 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
---
 bin/mutalyzer-cache-sync           | 35 +++++++++-----
 extras/cron.d/mutalyzer-cache-sync |  2 +
 extras/post-install.sh             |  3 ++
 mutalyzer/config.py                |  4 +-
 mutalyzer/sync.py                  | 73 +++++++++++++++++-------------
 mutalyzer/webservice.py            | 21 ++++-----
 setup.py                           |  6 +--
 7 files changed, 84 insertions(+), 60 deletions(-)
 create mode 100644 extras/cron.d/mutalyzer-cache-sync

diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync
index a58d51ad..864ca5ba 100755
--- a/bin/mutalyzer-cache-sync
+++ b/bin/mutalyzer-cache-sync
@@ -3,35 +3,48 @@
 """
 Synchronize the database cache with other Mutalyzer instances.
 
+Usage:
+  ./mutalyzer-cache-sync remote_wsdl url_template days
+
+  remote_wsdl:  Location of the remote WSDL description.
+  url_template: URL to remote downloads, where {file} is to be substituted
+                by the filename.
+  days:         Number of days to go back in the remote cache.
+
 This program is intended to be run daily from cron. Example:
 
-  25 5 * * *  mutalyzer-cache-sync
+  25 5 * * *  mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7
+  55 5 * * *  mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' 7
 """
 
 
+import sys
+
 from mutalyzer.config import Config
 from mutalyzer.output import Output
 from mutalyzer.sync import CacheSync
 from mutalyzer import Db
 
 
-def main():
+def cache_sync(remote_wsdl, url_template, days):
     """
     Synchronize the database cache with other Mutalyzer instances.
     """
     config = Config()
     output = Output(__file__, config.Output)
-    output.addMessage(__file__, -1, 'INFO',
-                      'Starting cache sync')
-
     database = Db.Cache(config.Db)
-    sync = CacheSync(config.Sync, database)
-
-    created_since = datetime.today() - timedelta(days=60)
-    sync.sync_with_remote(created_since)
 
-    output.addMessage(__file__, -1, 'INFO', 'Cache sync end')
+    sync = CacheSync(config.Retriever, output, database)
+    sync.sync_with_remote(remote_wsdl, url_template, days)
 
 
 if __name__ == '__main__':
-    main()
+    if len(sys.argv) < 4:  # remote_wsdl, url_template and days are required
+        print __doc__.strip()
+        sys.exit(1)
+    try:
+        days = int(sys.argv[3])  # parsed and validated once, reused below
+    except ValueError:
+        print 'Last argument must be an integer.'
+        sys.exit(1)
+    cache_sync(sys.argv[1], sys.argv[2], days)
diff --git a/extras/cron.d/mutalyzer-cache-sync b/extras/cron.d/mutalyzer-cache-sync
new file mode 100644
index 00000000..c58ea774
--- /dev/null
+++ b/extras/cron.d/mutalyzer-cache-sync
@@ -0,0 +1,2 @@
+# Synchronize the local cache with the live server every morning at 05:25
+#25 5 * * * www-data <MUTALYZER_BIN_CACHE_SYNC> 'http://www.mutalyzer.nl/2.0/services/?wsdl' 'http://www.mutalyzer.nl/2.0/Reference/{file}' 3
diff --git a/extras/post-install.sh b/extras/post-install.sh
index b24e2c8b..4cff268e 100644
--- a/extras/post-install.sh
+++ b/extras/post-install.sh
@@ -21,6 +21,7 @@ set -e
 # directory to be used.
 PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()')
 BIN_BATCHD=$(which mutalyzer-batchd)
+BIN_CACHE_SYNC=$(which mutalyzer-cache-sync)
 BIN_UCSC_UPDATE=$(which mutalyzer-ucsc-update)
 BIN_WEBSITE=$(which mutalyzer-website.wsgi)
 BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi)
@@ -56,6 +57,8 @@ update-rc.d mutalyzer-batchd defaults 98 02
 echo "Installing crontab"
 cp extras/cron.d/mutalyzer-ucsc-update /etc/cron.d/mutalyzer-ucsc-update
 sed -i -e "s@<MUTALYZER_BIN_UCSC_UPDATE>@${BIN_UCSC_UPDATE}@g" /etc/cron.d/mutalyzer-ucsc-update
+cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync
+sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync
 
 echo "Creating /etc/apache2/conf.d/mutalyzer.conf"
 cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf
diff --git a/mutalyzer/config.py b/mutalyzer/config.py
index 0314500a..f4c0c6b8 100644
--- a/mutalyzer/config.py
+++ b/mutalyzer/config.py
@@ -10,6 +10,7 @@ import tempfile
 from configobj import ConfigObj
 
 import mutalyzer
+from mutalyzer import util
 
 
 class ConfigurationError(Exception):
@@ -28,7 +29,6 @@ class Config():
     class Batch(): pass
     class File(): pass
     class GenRecord(): pass
-    class Sync(): pass
 
     def __init__(self, filename=None):
         """
@@ -134,8 +134,6 @@ class Config():
             self.GenRecord.spliceAlarm = int(config["spliceAlarm"])
             self.GenRecord.spliceWarn = int(config["spliceWarn"])
 
-            # Set the variables needed by the sync module.
-            self.Sync.cache = config["cache"]
 
             # If we are in a testing environment, use a temporary file for
             # logging and a temporary directory for the cache.
diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py
index 0d0b51cf..2a96446b 100644
--- a/mutalyzer/sync.py
+++ b/mutalyzer/sync.py
@@ -1,7 +1,5 @@
 """
 Module for synchronizing the database with other Mutalyzer instances.
-
-Todo: add some logging to the output object.
 """
 
 
@@ -28,7 +26,7 @@ class CacheSync(object):
         Instantiate the object.
 
         @arg config: A configuration object.
-        @type config: mutalyzer.config.Config
+        @type config: mutalyzer.config.Config.Retriever
         @arg output: An output object.
         @type output: mutalyzer.output.Output
         @arg database: A database object.
@@ -62,7 +60,7 @@ class CacheSync(object):
         for entry in entries:
             # Note that this way we only include Genbank files, not LRG files.
             cached = None
-            if os.path.isfile(os.path.join(self._config.Retriever.cache,
+            if os.path.isfile(os.path.join(self._config.cache,
                                            '%s.gb.bz2' % entry[0])):
                 cached = '%s.gb' % entry[0]
             cache.append({'name':                  entry[0],
@@ -74,7 +72,7 @@ class CacheSync(object):
                           'chromosomeOrientation': entry[6],
                           'url':                   entry[7],
                           'created':               entry[8],
-                          'cached':                cached}
+                          'cached':                cached})
 
         return cache
 
@@ -92,6 +90,9 @@ class CacheSync(object):
         @return: List of cache entries.
         @rtype: list(dictionary)
         """
+        self._output.addMessage(__file__, -1, 'INFO', 'Getting remote cache'
+                                ' from %s' % remote_wsdl)
+
         if not created_since:
             created_since = datetime.today() - \
                             timedelta(days=DEFAULT_CREATED_SINCE_DAYS)
@@ -116,7 +117,31 @@ class CacheSync(object):
 
         return map(cache_entry_from_soap, cache.CacheEntry)
 
-    def sync_with_remote(self, remote_wsdl, url_template, created_since=None):
+    def store_remote_file(self, name, url):
+        """
+        Download a remote file located at {url} and store it as {name}.
+        Nothing is downloaded unless {name} consists only of [0-9a-zA-Z._-].
+
+        @arg name: Name to store the file under.
+        @type name: string
+        @arg url: Url to the remote file.
+        @type url: string
+        """
+        if not re.match(r'^[\da-zA-Z\._-]+$', name):
+            return
+        # Download remote data; always release the handle, even on error
+        handle = urllib2.urlopen(url)
+        try:
+            data = handle.read()
+        finally:
+            handle.close()
+        # Store remote data
+        retriever = Retriever.GenBankRetriever(self._config, self._output,
+                                               self._database)
+        retriever.write(data, name, 0)
+
+    def sync_with_remote(self, remote_wsdl, url_template,
+                         days=DEFAULT_CREATED_SINCE_DAYS):
         """
         Synchronize the local cache with the remote cache.
 
@@ -130,14 +155,17 @@ class CacheSync(object):
         @arg url_template: Formatting string containing a {file} occurence,
             see examle usage above.
         @string url_template: string
-        @kwarg created_since: Only remote entries with this creation date or
+        @kwarg days: Only remote entries added this number of days ago or
             later are considered.
-        @type created_since: datatime.datetime
+        @type days: int
 
         @return: The number of entries added to the local cache and the number
             cache files downloaded from the remote site.
         @rtype: tuple(int, int)
         """
+        self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')
+
+        created_since = datetime.today() - timedelta(days=days)
         remote_cache = self.remote_cache(remote_wsdl, created_since)
 
         inserted = downloaded = 0
@@ -164,27 +192,10 @@ class CacheSync(object):
                 self.store_remote_file(entry['name'], url)
                 downloaded += 1
 
-        return inserted, downloaded
-
-    def store_remote_file(self, name, url):
-        """
-        Download a remote file located at {url} and store it as {name}.
-
-        @arg name: Name to store the file under.
-        @type name: string
-        @arg url: Url to the remote file.
-        @type url: string
-        """
-        if not re.match('^[\da-zA-Z\._-]+$', name):
-            return
-
-        # Download remote data
-        handle = urllib2.urlopen(url)
-        data = handle.read()
-        handle.close()
+        self._output.addMessage(__file__, -1, 'INFO',
+                                'Inserted %d entries in the cache,'
+                                ' downloaded %d files.' \
+                                % (inserted, downloaded))
+        self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')
 
-        # Store remote data
-        retriever = Retriever.GenBankRetriever(self._config.Retriever,
-                                               self._output,
-                                               self._database)
-        retriever.write(data, name, 0)
+        return inserted, downloaded
diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py
index b3b00a42..181c4af9 100644
--- a/mutalyzer/webservice.py
+++ b/mutalyzer/webservice.py
@@ -885,7 +885,10 @@ class MutalyzerService(DefinitionBase):
     @soap(DateTime, _returns = Array(CacheEntry))
     def getCache(self, created_since=None):
         """
-        Todo: documentation.
+        Get a list of entries from the local cache created since given date.
+
+        This method is intended to be used by Mutalyzer itself to synchronize
+        the cache between installations on different servers.
         """
         output = Output(__file__, self._config.Output)
 
@@ -893,22 +896,16 @@ class MutalyzerService(DefinitionBase):
                           'Received request getCache')
 
         database = Db.Cache(self._config.Db)
-        sync = CacheSync(self._config, output, database)
+        sync = CacheSync(self._config.Retriever, output, database)
 
         cache = sync.local_cache(created_since)
 
         def cache_entry_to_soap(entry):
             e = CacheEntry()
-            (e.name,
-             e.gi,
-             e.hash,
-             e.chromosomeName,
-             e.chromosomeStart,
-             e.chromosomeStop,
-             e.chromosomeOrientation,
-             e.url,
-             e.created,
-             e.cached) = entry
+            for attr in ('name', 'gi', 'hash', 'chromosomeName',
+                         'chromosomeStart', 'chromosomeStop',
+                         'chromosomeOrientation', 'url', 'created', 'cached'):
+                setattr(e, attr, entry[attr])
             return e
 
         output.addMessage(__file__, -1, 'INFO',
diff --git a/setup.py b/setup.py
index daf316b0..62ea18aa 100644
--- a/setup.py
+++ b/setup.py
@@ -18,11 +18,11 @@ setup(
     packages=find_packages(exclude=['doc', 'extras', 'tests']),
     include_package_data=True,
     scripts=['bin/mutalyzer',
-             'bin/mutalyzer-cache-sync',
              'bin/mutalyzer-batchd',
+             'bin/mutalyzer-cache-sync',
              'bin/mutalyzer-ucsc-update',
-             'bin/mutalyzer-website.wsgi',
-             'bin/mutalyzer-webservice.wsgi'],
+             'bin/mutalyzer-webservice.wsgi',
+             'bin/mutalyzer-website.wsgi'],
     zip_safe=False
 )
 
-- 
GitLab