diff --git a/bin/mutalyzer-cache-sync b/bin/mutalyzer-cache-sync new file mode 100755 index 0000000000000000000000000000000000000000..b53d709c4071a682d287eb2f9a4f0a5a29264637 --- /dev/null +++ b/bin/mutalyzer-cache-sync @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +""" +Synchronize the database cache with other Mutalyzer instances. + +Usage: + ./mutalyzer-cache-sync remote_wsdl url_template days + + remote_wsdl: Location of the remote WSDL description. + url_template: URL to remote downloads, where {file} is to be substituted + by the filename. + days: Number of days to go back in the remote cache. + +This program is intended to be run daily from cron. Example: + + 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' 7 + 55 5 * * * mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' 7 +""" + + +import sys + +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.sync import CacheSync +from mutalyzer import Db + + +def cache_sync(remote_wsdl, url_template, days): + """ + Synchronize the database cache with other Mutalyzer instances. + """ + config = Config() + output = Output(__file__, config.Output) + database = Db.Cache(config.Db) + + sync = CacheSync(config.Retriever, output, database) + sync.sync_with_remote(remote_wsdl, url_template, days) + + +if __name__ == '__main__': + if len(sys.argv) < 4: + print __doc__.strip() + sys.exit(1) + try: + days = int(sys.argv[3]) + except ValueError: + print 'Last argument must be an integer.' 
+ sys.exit(1) + cache_sync(sys.argv[1], sys.argv[2], int(sys.argv[3])) diff --git a/extras/cron.d/mutalyzer-cache-sync b/extras/cron.d/mutalyzer-cache-sync new file mode 100644 index 0000000000000000000000000000000000000000..c58ea7742d54a461e7a93ee102cd6c9a3274a492 --- /dev/null +++ b/extras/cron.d/mutalyzer-cache-sync @@ -0,0 +1,2 @@ +# Synchronize the local cache with the live server every morning at 05:25 +#25 5 * * * www-data <MUTALYZER_BIN_CACHE_SYNC> 'http://www.mutalyzer.nl/2.0/services/?wsdl' 'http://www.mutalyzer.nl/2.0/Reference/{file}' 3 diff --git a/extras/post-install.sh b/extras/post-install.sh index 123f3850dbadcd196c469c55da3844d0126326a9..4cff268e639ba0e5c8c1410804c7e4bd9b0be1e3 100644 --- a/extras/post-install.sh +++ b/extras/post-install.sh @@ -21,6 +21,7 @@ set -e # directory to be used. PACKAGE_ROOT=$(cd / && python -c 'import mutalyzer; print mutalyzer.package_root()') BIN_BATCHD=$(which mutalyzer-batchd) +BIN_CACHE_SYNC=$(which mutalyzer-cache-sync) BIN_UCSC_UPDATE=$(which mutalyzer-ucsc-update) BIN_WEBSITE=$(which mutalyzer-website.wsgi) BIN_WEBSERVICE=$(which mutalyzer-webservice.wsgi) @@ -56,6 +57,8 @@ update-rc.d mutalyzer-batchd defaults 98 02 echo "Installing crontab" cp extras/cron.d/mutalyzer-ucsc-update /etc/cron.d/mutalyzer-ucsc-update sed -i -e "s@<MUTALYZER_BIN_UCSC_UPDATE>@${BIN_UCSC_UPDATE}@g" /etc/cron.d/mutalyzer-ucsc-update +cp extras/cron.d/mutalyzer-cache-sync /etc/cron.d/mutalyzer-cache-sync +sed -i -e "s@<MUTALYZER_BIN_CACHE_SYNC>@${BIN_CACHE_SYNC}@g" /etc/cron.d/mutalyzer-cache-sync echo "Creating /etc/apache2/conf.d/mutalyzer.conf" cp extras/apache/mutalyzer.conf /etc/apache2/conf.d/mutalyzer.conf @@ -303,9 +306,11 @@ CREATE TABLE GBInfo ( ChrStop int(12) DEFAULT NULL, orientation int(2) DEFAULT NULL, url char(255) DEFAULT NULL, + created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (AccNo), UNIQUE KEY hash (hash), - UNIQUE KEY alias (GI) + UNIQUE KEY alias (GI), + INDEX (created) ); CREATE TABLE Link ( 
mrnaAcc char(20) NOT NULL, diff --git a/extras/soap-tools/getcache.py b/extras/soap-tools/getcache.py new file mode 100755 index 0000000000000000000000000000000000000000..4ad13aa33d976a81bbf427b1103687f996b722f3 --- /dev/null +++ b/extras/soap-tools/getcache.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# Monkey patch suds, because for some weird reason the location +# http://www.w3.org/2001/xml.xsd is used for the XML namespace, but the W3C +# seems to respond too slow on that url. We therefore use +# http://www.w3.org/2009/01/xml.xsd which fixes this. +from suds.xsd.sxbasic import Import +_import_open = Import.open +def _import_open_patched(self, *args, **kwargs): + if self.location == 'http://www.w3.org/2001/xml.xsd': + self.location = 'http://www.w3.org/2009/01/xml.xsd' + return _import_open(self, *args, **kwargs) +Import.open = _import_open_patched + +import sys +from datetime import datetime, timedelta +from suds.client import Client + +URL = 'http://localhost/mutalyzer/services/?wsdl' + +c = Client(URL, cache=None) +o = c.service + +days = 1 +if len(sys.argv) > 1: + days = int(sys.argv[1]) + +created_since = datetime.today() - timedelta(days=days) + +print 'Getting cache...'
+ +cache = o.getCache(created_since) + +if cache: + for r in cache.CacheEntry: + print r.name + if 'gi' in r: + print 'GI: %s' % r.gi + print 'Hash: %s' % r.hash + if 'chromosomeName' in r: + print r.chromosomeName + if 'chromosomeStart' in r: + print r.chromosomeStart + if 'chromosomeStop' in r: + print r.chromosomeStop + if 'chromosomeOrientation' in r: + print r.chromosomeOrientation + if 'url' in r: + print r.url + print 'Created: %s' % r.created + if 'cached' in r: + print 'Cached as %s' % r.cached + print diff --git a/mutalyzer/Db.py b/mutalyzer/Db.py index a5d58becc2c1c4e7fcfcd5595d92fb5f8d8d2cb8..b608cffbd9a16f047ed5e3e7457784e97674eded 100644 --- a/mutalyzer/Db.py +++ b/mutalyzer/Db.py @@ -890,6 +890,7 @@ class Cache(Db) : statement = """ INSERT INTO GBInfo + (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """, (accNo, GI, fileHash, ChrAccVer, ChrStart, ChrStop, orientation, url) @@ -913,6 +914,7 @@ class Cache(Db) : statement = """ INSERT INTO GBInfo + (AccNo, GI, hash, ChrAccVer, ChrStart, ChrStop, orientation, url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """, (accNo, None, fileHash, None, None, None, None, url) @@ -1033,6 +1035,30 @@ class Cache(Db) : return None #getGBFromGI + def getGBSince(self, created_since): + """ + Get all accession number entries with creation date {created_since} + or later. + + SQL tables from internalDb: + - GBInfo ; Information about cached and uploaded GenBank files. + + @arg created_since: Only entries with later creation dates are returned. 
+ @type created_since: datetime.datetime + + @return: All entries with a creation date of {created_since} or later. + @rtype: list(tuple) + """ + statement = """ + SELECT AccNo, GI, hash, ChrAccVer, ChrStart, + ChrStop, orientation, url, created + FROM GBInfo + WHERE created >= %s; + """, created_since + + return self.query(statement) + #getGBSince + def getLoc(self, accNo) : """ Get the slicing information of an accession number, typically this @@ -1523,10 +1549,3 @@ class Batch(Db) : return inputl, flags #getFromQueue #Batch - -# -# Unit test. -# -if __name__ == "__main__" : - pass -#if diff --git a/mutalyzer/config.py b/mutalyzer/config.py index 339ec46e6c807b0340ae134298d22f52b8d7b5f3..2d8837af8a0dbd009351ab3cdc0c98892187120c 100644 --- a/mutalyzer/config.py +++ b/mutalyzer/config.py @@ -133,6 +133,7 @@ class Config(): self.GenRecord.spliceAlarm = int(config["spliceAlarm"]) self.GenRecord.spliceWarn = int(config["spliceWarn"]) + # If we are in a testing environment, use a temporary file for + # logging and a temporary directory for the cache.
# We don't remove these after the tests, since they might be diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 94c6934d796da7b0097965b152fcab88a0bbd183..fba12fbdfc751370e6b0955678973b7b3c43d089 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -17,7 +17,7 @@ Additional attributes values for the soaplib String model: """ -from soaplib.core.model.primitive import String, Integer, Boolean +from soaplib.core.model.primitive import String, Integer, Boolean, DateTime from soaplib.core.model.clazz import ClassModel, Array from mutalyzer import SOAP_NAMESPACE @@ -31,6 +31,7 @@ class Mandatory(object): String = String(min_occurs=1, nillable=False) Integer = Integer(min_occurs=1, nillable=False) Boolean = Boolean(min_occurs=1, nillable=False) + DateTime = DateTime(min_occurs=1, nillable=False) #Mandatory @@ -214,4 +215,23 @@ class InfoOutput(ClassModel): nomenclatureVersionParts = Array(String) serverName = String contactEmail = String -#MutalyzerOutput +#InfoOutput + + +class CacheEntry(ClassModel): + """ + Used in getCache SOAP method. + """ + __namespace__ = SOAP_NAMESPACE + + name = Mandatory.String + gi = String + hash = Mandatory.String + chromosomeName = String + chromosomeStart = Integer + chromosomeStop = Integer + chromosomeOrientation = Integer + url = String + created = Mandatory.DateTime + cached = String +#CacheEntry diff --git a/mutalyzer/sync.py b/mutalyzer/sync.py new file mode 100644 index 0000000000000000000000000000000000000000..2a96446bf2b268e284c1a9b0db6dca4349c20d27 --- /dev/null +++ b/mutalyzer/sync.py @@ -0,0 +1,201 @@ +""" +Module for synchronizing the database with other Mutalyzer instances. 
+""" + + +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + +import os +import re +from datetime import datetime, timedelta +import urllib2 +from suds.client import Client + +from mutalyzer import Retriever + + +DEFAULT_CREATED_SINCE_DAYS = 7 + + +class CacheSync(object): + """ + Synchronize the database cache with other Mutalyzer instances. + """ + def __init__(self, config, output, database): + """ + Instantiate the object. + + @arg config: A configuration object. + @type config: mutalyzer.config.Config.Retriever + @arg output: An output object. + @type output: mutalyzer.output.Output + @arg database: A database object. + @type database: mutalyzer.Db.Cache + """ + self._config = config + self._output = output + self._database = database + + def local_cache(self, created_since=None): + """ + Get all entries in the local cache with creation date {created_since} + or later. + + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datatime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) + """ + if not created_since: + created_since = datetime.today() - \ + timedelta(days=DEFAULT_CREATED_SINCE_DAYS) + + entries = self._database.getGBSince(created_since) + cache = [] + + # Translate each entry to a dictionary and check if it is cached on + # our filesystem. + for entry in entries: + # Note that this way we only include Genbank files, not LRG files. 
+ cached = None + if os.path.isfile(os.path.join(self._config.cache, + '%s.gb.bz2' % entry[0])): + cached = '%s.gb' % entry[0] + cache.append({'name': entry[0], + 'gi': entry[1], + 'hash': entry[2], + 'chromosomeName': entry[3], + 'chromosomeStart': entry[4], + 'chromosomeStop': entry[5], + 'chromosomeOrientation': entry[6], + 'url': entry[7], + 'created': entry[8], + 'cached': cached}) + + return cache + + def remote_cache(self, remote_wsdl, created_since=None): + """ + Get all entries in the remote cache with creation date {created_since} + or later. + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @kwarg created_since: Only entries with this creation date or later + are returned. + @type created_since: datetime.datetime + + @return: List of cache entries. + @rtype: list(dictionary) + """ + self._output.addMessage(__file__, -1, 'INFO', 'Getting remote cache' + ' from %s' % remote_wsdl) + + if not created_since: + created_since = datetime.today() - \ + timedelta(days=DEFAULT_CREATED_SINCE_DAYS) + client = Client(remote_wsdl, cache=None) + cache = client.service.getCache(created_since) + + def cache_entry_from_soap(entry): + """ + Create a nice dictionary out of the CacheEntry object. + """ + entry_dict = {'name': str(entry.name), + 'hash': str(entry.hash), + 'created': entry.created} + for attribute in ('gi', 'chromosomeName', 'url', 'cached'): + entry_dict[attribute] = str(entry[attribute]) \ + if attribute in entry else None + for attribute in ('chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation'): + entry_dict[attribute] = int(entry[attribute]) \ + if attribute in entry else None + return entry_dict + + return map(cache_entry_from_soap, cache.CacheEntry) + + def store_remote_file(self, name, url): + """ + Download a remote file located at {url} and store it as {name}. + + @arg name: Name to store the file under. + @type name: string + @arg url: Url to the remote file.
+ @type url: string + """ + if not re.match('^[\da-zA-Z\._-]+$', name): + return + + # Download remote data + handle = urllib2.urlopen(url) + data = handle.read() + handle.close() + + # Store remote data + retriever = Retriever.GenBankRetriever(self._config, + self._output, + self._database) + retriever.write(data, name, 0) + + def sync_with_remote(self, remote_wsdl, url_template, + days=DEFAULT_CREATED_SINCE_DAYS): + """ + Synchronize the local cache with the remote cache. + + >>> wsdl = 'http://mutalyzer.nl/mutalyzer/services/?wsdl' + >>> template = 'http://mutalyzer.nl/mutalyzer/Reference/{file}' + >>> self.sync_with_remote(wsdl, template) + (14, 3) + + @arg remote_wsdl: The url of the remote SOAP WSDL description. + @type remote_wsdl: string + @arg url_template: Formatting string containing a {file} occurrence, + see example usage above. + @type url_template: string + @kwarg days: Only remote entries added this number of days ago or + later are considered. + @type days: int + + @return: The number of entries added to the local cache and the number + of cache files downloaded from the remote site.
+ @rtype: tuple(int, int) + """ + self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync') + + created_since = datetime.today() - timedelta(days=days) + remote_cache = self.remote_cache(remote_wsdl, created_since) + + inserted = downloaded = 0 + + for entry in remote_cache: + if self._database.getHash(entry['name']): + continue + if self._database.getGBFromHash(entry['hash']): + continue + if entry['gi'] and self._database.getGBFromGI(entry['gi']): + continue + self._database.insertGB(entry['name'], + entry['gi'], + entry['hash'], + entry['chromosomeName'], + entry['chromosomeStart'], + entry['chromosomeStop'], + entry['chromosomeOrientation'], + entry['url']) + inserted += 1 + if not entry['chromosomeName'] and not entry['url'] \ + and entry['cached']: + url = url_template.format(file=entry['cached']) + self.store_remote_file(entry['name'], url) + downloaded += 1 + + self._output.addMessage(__file__, -1, 'INFO', + 'Inserted %d entries in the cache,' + ' downloaded %d files.' \ + % (inserted, downloaded)) + self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync') + + return inserted, downloaded diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 892bde65f623843bdff3b2354b333b62f1734944..180f179c9a12761288c53b05e9fff762cd42ab84 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -739,3 +739,34 @@ def slow(f): f(*args, **kwargs) return slow_f #slow + + +def monkey_patch_suds(): + """ + Apply our monkey-patch for the suds package. + + For some weird reason the location http://www.w3.org/2001/xml.xsd is used + for the XML namespace, but the W3C seems to respond too slow on that url. + We therefore use http://www.w3.org/2009/01/xml.xsd which fixes this. + + Call this function before importing anything from the suds package. 
For + example, start your file with the following: + + from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() + from suds.client import Client + """ + from suds.xsd.sxbasic import Import + _import_open = Import.open + + # Only apply the patch once. + if getattr(Import, 'MUTALYZER_MONKEY_PATCHED', False): + return + + def _import_open_patched(self, *args, **kwargs): + if self.location == 'http://www.w3.org/2001/xml.xsd': + self.location = 'http://www.w3.org/2009/01/xml.xsd' + return _import_open(self, *args, **kwargs) + + Import.open = _import_open_patched + Import.MUTALYZER_MONKEY_PATCHED = True +#monkey_patch_suds diff --git a/mutalyzer/webservice.py b/mutalyzer/webservice.py index 52866e547b39ce1366d2ed42b330b2716fa71e55..181c4af9e5c383fac302e696348b22436f9f61d5 100644 --- a/mutalyzer/webservice.py +++ b/mutalyzer/webservice.py @@ -23,7 +23,7 @@ import logging; logging.basicConfig() from soaplib.core import Application from soaplib.core.service import soap from soaplib.core.service import DefinitionBase -from soaplib.core.model.primitive import String, Integer +from soaplib.core.model.primitive import String, Integer, DateTime from soaplib.core.model.clazz import Array from soaplib.core.model.exception import Fault from soaplib.core.server import wsgi @@ -35,6 +35,7 @@ import mutalyzer from mutalyzer.config import Config from mutalyzer.output import Output from mutalyzer.grammar import Grammar +from mutalyzer.sync import CacheSync from mutalyzer import variantchecker from mutalyzer import Db from mutalyzer import Mapper @@ -880,6 +881,38 @@ class MutalyzerService(DefinitionBase): output.addMessage(__file__, -1, 'INFO', 'Finished processing info') return result #info + + @soap(DateTime, _returns = Array(CacheEntry)) + def getCache(self, created_since=None): + """ + Get a list of entries from the local cache created since given date. + + This method is intended to be used by Mutalyzer itself to synchronize + the cache between installations on different servers.
+ """ + output = Output(__file__, self._config.Output) + + output.addMessage(__file__, -1, 'INFO', + 'Received request getCache') + + database = Db.Cache(self._config.Db) + sync = CacheSync(self._config.Retriever, output, database) + + cache = sync.local_cache(created_since) + + def cache_entry_to_soap(entry): + e = CacheEntry() + for attr in ('name', 'gi', 'hash', 'chromosomeName', + 'chromosomeStart', 'chromosomeStop', + 'chromosomeOrientation', 'url', 'created', 'cached'): + setattr(e, attr, entry[attr]) + return e + + output.addMessage(__file__, -1, 'INFO', + 'Finished processing getCache') + + return map(cache_entry_to_soap, cache) + #getCache #MutalyzerService diff --git a/setup.py b/setup.py index 22e5c89b1ed5d501dd8ff2b5ae14555a3b36e823..62ea18aa7a268f5f918d10428bbbd18db387abfe 100644 --- a/setup.py +++ b/setup.py @@ -17,9 +17,12 @@ setup( platforms=['any'], packages=find_packages(exclude=['doc', 'extras', 'tests']), include_package_data=True, - scripts=['bin/mutalyzer', 'bin/mutalyzer-batchd', - 'bin/mutalyzer-ucsc-update', 'bin/mutalyzer-website.wsgi', - 'bin/mutalyzer-webservice.wsgi'], + scripts=['bin/mutalyzer', + 'bin/mutalyzer-batchd', + 'bin/mutalyzer-cache-sync', + 'bin/mutalyzer-ucsc-update', + 'bin/mutalyzer-webservice.wsgi', + 'bin/mutalyzer-website.wsgi'], zip_safe=False ) diff --git a/tests/test_webservice.py b/tests/test_webservice.py index 16019a5b655820e8cdff4a39296b6acc4c39aedd..5052cb4ac277c278ea31708df45563f32223ebaa 100644 --- a/tests/test_webservice.py +++ b/tests/test_webservice.py @@ -3,21 +3,15 @@ Tests for the SOAP interface to Mutalyzer. """ -# Monkey patch suds, because for some weird reason the location -# http://www.w3.org/2001/xml.xsd is used for the XML namespace, but the W3C -# seems to respond too slow on that url. We use therefore use -# http://www.w3.org/2009/01/xml.xsd which fixes this. 
-from suds.xsd.sxbasic import Import -_import_open = Import.open -def _import_open_patched(self, *args, **kwargs): - if self.location == 'http://www.w3.org/2001/xml.xsd': - self.location = 'http://www.w3.org/2009/01/xml.xsd' - return _import_open(self, *args, **kwargs) -Import.open = _import_open_patched - +from mutalyzer.util import monkey_patch_suds; monkey_patch_suds() import os +from datetime import datetime, timedelta import mutalyzer +from mutalyzer.config import Config +from mutalyzer.output import Output +from mutalyzer.sync import CacheSync +from mutalyzer import Db import logging; logging.raiseExceptions = 0 import urllib2 from suds.client import Client @@ -169,3 +163,19 @@ class TestWebservice(): r = self.client.service.info() assert_equal(type(r.versionParts.string), list) assert_equal(r.version, mutalyzer.__version__) + + def test_getcache(self): + """ + Running the getCache method should give us the expected number of + cache entries. + """ + created_since = datetime.today() - timedelta(days=14) + + config = Config() + database = Db.Cache(config.Db) + output = Output(__file__, config.Output) + sync = CacheSync(config.Retriever, output, database) + cache = sync.local_cache(created_since) + + r = self.client.service.getCache(created_since) + assert_equal(len(r.CacheEntry), len(cache)) diff --git a/tests/test_website.py b/tests/test_website.py index 089e09a97cbdf1a27dd663921458c1b0d7e9ef15..7d01e93d79e113b0522cc132b36771111c881b96 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -280,7 +280,11 @@ class TestWSGI(): r.mustcontain(header) if not lines: lines = size - assert_equal(len(r.body.strip().split('\n')), lines + 1) + if len(r.body.strip().split('\n')) -1 != lines: + # Heisenbug, whenever it occurs we want to see some info. + print 'File: /Results_' + id + '.txt' + print r.body + assert_equal(len(r.body.strip().split('\n')) - 1, lines) return r.body def test_batch_namechecker(self):