"""
Module for retrieving files from either the cache or the NCBI.

A hash of every retrieved file is stored in the internal database. If a
requested file is not found, but its hash is, we use additional information
to re-download the file.

Public classes:
- Retriever ; Retrieve a record from either the cache or the NCBI.
"""


import os              # path.isfile(), link(), path.isdir(), path.mkdir(),
                       # walk(), path.getsize(), path.join(), stat(), remove()
import time
import bz2             # BZ2Compressor(), BZ2File()
import hashlib         # md5(), update(), hexdigest()
import urllib2         # urlopen()
import StringIO        # StringIO()
from Bio import SeqIO  # read()
from Bio import Entrez # efetch(), read(), esearch(), esummary()
from Bio.Seq import UnknownSeq
from Bio.Alphabet import ProteinAlphabet
from xml.dom import DOMException, minidom
from xml.parsers import expat
from httplib import HTTPException, IncompleteRead

from mutalyzer import util
from mutalyzer import config
from mutalyzer.parsers import lrg
from mutalyzer.parsers import genbank
from mutalyzer.parsers import embl


ENTREZ_MAX_TRIES = 4
ENTREZ_SLEEP = 1      # in seconds
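
# Note (editorial, not in the original module): ENTREZ_MAX_TRIES and
# ENTREZ_SLEEP drive the retry loop used in Retriever.snpConvert() below.
# The pattern, written generically, is roughly:
#
#   for attempt in range(ENTREZ_MAX_TRIES - 1):
#       try:
#           response = do_entrez_query()   # hypothetical helper
#           break
#       except (IOError, HTTPException):
#           time.sleep(ENTREZ_SLEEP)
#   else:
#       response = do_entrez_query()       # final attempt; errors are reported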


class Retriever(object) :
    """
    Retrieve a record from either the cache or the NCBI.

    Special methods:
        - __init__(output, database) ; Use variables from the
        configuration file to initialise the class private variables.

    Private methods:
        - _foldersize(folder) ; Return the size of a folder.
        - _cleancache()       ; Keep the cache at a maximum size.
        - _nametofile(name)   ; Convert a name to a filename.
        - _write(raw_data, filename) ; Write a record to a file.
        - _calcHash(content)  ; Calculate the md5sum of 'content'.
        - _newUD()            ; Generate a new UD number.

    Public methods:
        - retrieveslice(accno, start, stop, orientation) ; Retrieve a chromosome
        slice from the NCBI.
        - retrievegene(gene, organism, upstream, downstream) ; Retrieve a gene
        from the NCBI.
        - downloadrecord(url)    ; Download a GenBank file.
        - uploadrecord(raw_data) ; Let someone upload a GenBank file.
        - loadrecord(identifier) ; Load a record, store it in the cache, manage
        the cache and return the record.

    Inherited methods from Db.Output:
        - WarningMsg(filename, message) ; Print a warning message.
        - ErrorMsg(filename, message)   ; Print an error message and log it.
        - LogMsg(filename, message)     ; Log a message.
    """

    def __init__(self, output, database) :
        """
        Use variables from the configuration file for some simple
        settings. Make the cache directory if it does not exist yet.

        @arg output: Object to store the output messages in
        @type output: object
        @arg database: Interface to the internal database
        @type database: object
        """
        self._output = output
        self._database = database

        if not os.path.isdir(config.get('cache')) :
            os.mkdir(config.get('cache'))
        Entrez.email = config.get('email')
        self.fileType = None
    #__init__

    def _foldersize(self, folder) :
        """
        Return the size of a folder in bytes.

        @arg folder: Name of a directory
        @type folder: string

        @return: The size of the directory
        @rtype: integer
        """

        folder_size = 0
        for (path, dirs, files) in os.walk(folder) :
            for fileName in files :
                folder_size += os.path.getsize(os.path.join(path, fileName))

        return folder_size
    #_foldersize

    def _cleancache(self) :
        """
        Keep removing files until the size of the cache is less than the
        maximum size.
        First, the cache is checked for its size; if it exceeds the maximum
        size, the ``oldest'' files are deleted. Note that accessing a file
        makes it ``new''.
        """
        if self._foldersize(config.get('cache')) < config.get('cachesize'):
            return

        # Build a list of files sorted by access time.
        cachelist = []
        for (path, dirs, files) in os.walk(config.get('cache')) :
            for filename in files :
                filepath = os.path.join(path, filename)
                cachelist.append(
                    (os.stat(filepath).st_atime, filepath))
        cachelist.sort()

        # Now start removing files until the size of the folder is
        # small enough (or until the list is exhausted).
        for i in range(0, len(cachelist)) :
            os.remove(cachelist[i][1])
            if self._foldersize(config.get('cache')) < config.get('cachesize'):
                break
        #for
    #_cleancache

    def _nametofile(self, name) :
        """
        Convert an accession number to a filename.

        @arg name: The accession number
        @type name: string

        @return: A filename
        @rtype: string
        """
        return config.get('cache') + '/' + name + "." + self.fileType + ".bz2"
    #_nametofile
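
    # Illustrative example (editorial note, not in the original source): for a
    # GenBankRetriever, where fileType is "gb", a call like
    #
    #   self._nametofile('AB026906.1')
    #
    # yields '<cache directory>/AB026906.1.gb.bz2'.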

    def _write(self, raw_data, filename) :
        """
        Write raw data to a compressed file.

        @arg raw_data: The raw_data to be compressed and written
        @type raw_data: string
        @arg filename: The intended name of the outfile
        @type filename: string

        @return: outfile ; The full path and name of the file written
        @rtype: string
        """
        # Compress the data to save disk space.
        comp = bz2.BZ2Compressor()
        data = comp.compress(raw_data)
        data += comp.flush()
        out_handle = open(self._nametofile(filename), "wb")
        out_handle.write(data)
        out_handle.close()
        # Since we put something in the cache, check if it needs cleaning.
        self._cleancache()

        return out_handle.name      # return the full path to the file
    #_write
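
    # Editorial note (not in the original source): files written by _write()
    # are bzip2-compressed, so reading a cached record back looks like:
    #
    #   handle = bz2.BZ2File(self._nametofile(name), "r")
    #   raw_data = handle.read()
    #   handle.close()
    #
    # (LRGRetriever.loadrecord() below uses exactly this pattern.)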

    def _calcHash(self, content) :
        """
        Calculate the md5sum of a piece of text.

        @arg content: Arbitrary text
        @type content: string

        @return: The md5sum of 'content'
        @rtype: string
        """

        hashfunc = hashlib.md5()
        hashfunc.update(content)
        md5sum = hashfunc.hexdigest()
        del hashfunc

        return md5sum
    #_calcHash
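
    # Editorial note (not in the original source): the stored hash is a plain
    # hexadecimal MD5 digest, i.e. the equivalent of
    #
    #   hashlib.md5(raw_data).hexdigest()
    #
    # so byte-identical records always map to the same cache entry.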

    def _newUD(self) :
        """
        Make a new UD number based on the current time (seconds since 1970).

        @return: A new UD number
        @rtype: string
        """

        UD = util.generate_id()
        return "UD_" + str(UD)
    #_newUD

    def _updateDBmd5(self, raw_data, name, GI):
        """
        Check whether the md5sum of 'raw_data' matches the one stored for
        'name'. If it differs, log a warning and update the stored hash. If
        'name' is not in the database yet, insert it together with its GI
        number and hash.

        @arg raw_data: The raw record
        @type raw_data: string
        @arg name: The accession number of the record
        @type name: string
        @arg GI: The GI number of the record (or None)
        @type GI: string

        @return: The full path to the cached file for 'name'
        @rtype: string
        """

        currentmd5sum = self._database.getHash(name)
        if currentmd5sum :
            md5sum = self._calcHash(raw_data)
            if md5sum != currentmd5sum :
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    name, currentmd5sum, md5sum))
                self._database.updateHash(name, md5sum)
            #if
        else :
            self._database.insertGB(name, GI,
                self._calcHash(raw_data), None, 0, 0, 0, None)
        return self._nametofile(name)
    #_updateDBmd5


    def snpConvert(self, rs_id) :
        """
        Search for an rs number in dbSNP and return all annotated HGVS
        notations of it.

        @arg rs_id: The rs number of the SNP (example: 'rs9919552').
        @type rs_id: string

        @return: A list of HGVS notations.
        @rtype: list(string)
        """
        # A simple input check.
        id = rs_id[2:]
        if rs_id[:2] != 'rs' or not id.isdigit():
            self._output.addMessage(__file__, 4, 'ESNPID',
                                    'This is not a valid dbSNP id.')
            return []

        # Query dbSNP for the SNP. The following weird construct is to catch
        # any glitches in our Entrez connections. We try up to ENTREZ_MAX_TRIES
        # times and only then give up.
        # Todo: maybe also implement this for other Entrez queries?
        for i in range(ENTREZ_MAX_TRIES - 1):
            try:
                response = Entrez.efetch(db='snp', id=id, rettype='flt',
                                         retmode='xml')
                break
            except (IOError, HTTPException):
                time.sleep(ENTREZ_SLEEP)
        else:
            try:
                response = Entrez.efetch(db='snp', id=id, rettype='flt',
                                         retmode='xml')
            except (IOError, HTTPException) as e:
                # Could not connect to dbSNP.
                self._output.addMessage(__file__, 4, 'EENTREZ',
                                        'Error connecting to dbSNP.')
                self._output.addMessage(__file__, -1, 'INFO',
                                        'IOError: %s' % str(e))
                return []

        try:
            response_text = response.read()
        except IncompleteRead as e:
            self._output.addMessage(__file__, 4, 'EENTREZ',
                                    'Error reading from dbSNP.')
            self._output.addMessage(__file__, -1, 'INFO',
                                    'IncompleteRead: %s' % str(e))
            return []

        if response_text == '\n':
            # This is apparently what dbSNP returns for a non-existing dbSNP id.
            self._output.addMessage(__file__, 4, 'EENTREZ',
                                    'ID rs%s could not be found in dbSNP.' \
                                    % id)
            return []

        try:
            # Parse the output.
            doc = minidom.parseString(response_text)
            rs = doc.getElementsByTagName('Rs')[0]
        except expat.ExpatError as e:
            # Could not parse XML.
            self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
                                    'error. Error parsing result XML.')
            self._output.addMessage(__file__, -1, 'INFO',
                                    'ExpatError: %s' % str(e))
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Result from dbSNP: %s' % response_text)
            return []
        except IndexError:
            # The expected root element is not present.
            self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
                                    'error. Result XML was not as expected.')
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Result from dbSNP: %s' % response_text)
            return []

        snps = []
        for i in rs.getElementsByTagName('hgvs'):
            snps.append(i.lastChild.data.encode('utf8'))

        return snps
    #snpConvert
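
    # Illustrative call (editorial note, not in the original source):
    #
    #   retriever = Retriever(output, database)
    #   notations = retriever.snpConvert('rs9919552')
    #
    # 'notations' is a list of HGVS descriptions annotated for the rs number,
    # or an empty list if the id is invalid or dbSNP cannot be reached.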
#Retriever

class GenBankRetriever(Retriever):
    """
    Retrieve a GenBank record from either the cache or the NCBI.
    """

    def __init__(self, output, database):
        """
        Initialise the class. See Retriever.__init__ for the arguments.
        """
        # Recall init of parent
        Retriever.__init__(self, output, database)
        self.fileType = "gb"
        # Child specific init
    #__init__

    def write(self, raw_data, filename, extract) :
        """
        Write raw data to a file. The data is parsed before writing; if a
        parse error occurs, an error is emitted and the function exits.
        If 'filename' is set and 'extract' is set to 0, then 'filename' is
        used for output.
        If 'extract' is set to 1, then the filename is constructed from the
        id of the GenBank record. Additionally the id and GI number are
        returned for further processing (putting them in the internal
        database).

        @arg raw_data: The data
        @type raw_data: string
        @arg filename: The intended name of the file.
        @type filename: string
        @arg extract: Flag that indicates whether to extract the record ID and
        GI number:
            - 0 ; Do not extract, use 'filename'
            - 1 ; Extract
        @type extract: integer

        @return: tuple ; Depending on the value of 'extract':
            - 0 ; ('filename', None)
            - 1 ; (id, GI)
        @rtype: tuple (string, string)
        """

        if raw_data == "\nNothing has been found\n" :
            self._output.addMessage(__file__, 4, "ENORECORD",
                "The record could not be retrieved.")
            return None
        #if

        fakehandle = StringIO.StringIO() # Unfortunately, BioPython needs a
        fakehandle.write(raw_data)       # file handle.
        fakehandle.seek(0)
        try :
            record = SeqIO.read(fakehandle, "genbank")
        except (ValueError, AttributeError):  # An error occurred while parsing.
            self._output.addMessage(__file__, 4, "ENOPARSE",
                "The file could not be parsed.")
            fakehandle.close()
            return None
        #except

        if type(record.seq) == UnknownSeq :
            fakehandle.close()
            self._output.addMessage(__file__, 4, "ENOSEQ",
                "This record contains no sequence. Chromosomal or contig " \
                "records should be uploaded with the GenBank uploader.")
            return None
        #if

        outfile = filename
        GI = None
        if extract :
            outfile = record.id
            GI = record.annotations["gi"]
            if outfile != filename :
                # Add the reference (incl. version) to the reference output.
                # This differs if the original reference lacks a version.
                self._output.addOutput("reference", record.id)
                self._output.addOutput(
                        "BatchFlags", ("A1",(
                            filename,
                            outfile,
                            filename+"[[.period.]]" )))
                self._output.addMessage(__file__, 2, "WNOVER",
                    "No version number is given, using %s. Please use this " \
                    "number to reduce downloading overhead." % record.id)
        #if
        fakehandle.close()

        self._write(raw_data, outfile)

        return outfile, GI
    #write

    def fetch(self, name) :
        """
        Fetch the sequence with the given accession number from the NCBI and
        write it to the cache. Return the full path to the written file, or
        None on failure.

        Todo: A better implementation would probably use an esummary query
            first to get the length of the sequence. If this is within limits,
            use efetch with rettype=gbwithparts to download the GenBank file.
        """
        try:
            net_handle = Entrez.efetch(db='nuccore', id=name, rettype='gb', retmode='text')
            raw_data = net_handle.read()
            net_handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore database: %s' % str(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve %s.' % name)
            return None

        if raw_data == '\n' :       # Check if the file is empty or not.
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve %s.' % name)
            return None

        # This is a hack to detect constructed references, the proper way to
        # do this would be to check the data_file_division attribute of the
        # parsed GenBank file (it would be 'CON').
        if '\nCONTIG' in raw_data:
            try:
                # Get the length in base pairs.
                length = int(raw_data[:raw_data.index(' bp', 0, 500)].split()[-1])
            except (ValueError, IndexError):
                self._output.addMessage(__file__, 4, 'ERETR',
                                        'Could not retrieve %s.' % name)
                return None
            if length > config.get('maxDldSize'):
                self._output.addMessage(__file__, 4, 'ERETR',
                                        'Could not retrieve %s.' % name)
                return None
            try:
                net_handle = Entrez.efetch(db='nuccore', id=name, rettype='gbwithparts', retmode='text')
                raw_data = net_handle.read()
                net_handle.close()
            except (IOError, urllib2.HTTPError, HTTPException) as e:
                self._output.addMessage(__file__, -1, 'INFO',
                                        'Error connecting to Entrez nuccore database: %s' % str(e))
                self._output.addMessage(__file__, 4, 'ERETR',
                                        'Could not retrieve %s.' % name)
                return None

        result = self.write(raw_data, name, 1)
        if not result:
            return None
        name, GI = result
        if name:               # Processing went okay.
            return self._updateDBmd5(raw_data, name, GI)
        else:                  # Parse error in the GenBank file.
            return None
    #fetch
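
    # Sketch for the Todo in fetch() above (editorial note, not active code):
    # an esummary query could be used to check the sequence length before
    # deciding how to download, assuming the usual Bio.Entrez behaviour:
    #
    #   handle = Entrez.esummary(db='nuccore', id=name)
    #   summary = Entrez.read(handle)
    #   handle.close()
    #   length = int(summary[0]['Length'])
    #   if length <= config.get('maxDldSize'):
    #       handle = Entrez.efetch(db='nuccore', id=name,
    #                              rettype='gbwithparts', retmode='text')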

    def retrieveslice(self, accno, start, stop, orientation) :
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we check whether the associated file is still present and, if so,
        return its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        @arg accno: The accession number of the chromosome
        @type accno: string
        @arg start: Start position of the slice
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation: Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: A UD number
        @rtype: string
        """

        # Not a valid slice.
        if start >= stop :
            return None

        # The slice can not be too big.
        if stop - start > config.get('maxDldSize'):
            return None

        # Check whether we have seen this slice before.
        UD = self._database.getGBFromLoc(accno, start, stop, orientation)
        if UD : # This has been requested before.
            if os.path.isfile(self._nametofile(UD)) : # It's still present.
                return UD

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore database: %s' % str(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if UD : # We have seen this one before.
            currentmd5sum = self._database.getHash(UD)
            if md5sum != currentmd5sum :
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    UD, currentmd5sum, md5sum))
                self._database.updateHash(UD, md5sum)
            #if
        else : # We haven't seen it before, so give it a name.
            UD = self._newUD()
            self._database.insertGB(UD, None, md5sum, accno, start,
                          stop, orientation, None)
        #else

        return self.write(raw_data, UD, 0) and UD
    #retrieveslice
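
    # Illustrative call (editorial note, not in the original source; accession
    # number and coordinates are made up for the example):
    #
    #   UD = retriever.retrieveslice('NC_000011.9', 111955000, 111966000, 1)
    #
    # On success this returns a 'UD_...' identifier whose bzip2-compressed
    # GenBank slice is now in the cache; on failure it returns None.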

    def retrievegene(self, gene, organism, upstream, downstream) :
        """
        Query the NCBI for the chromosomal location of a gene and make a
        slice if the gene can be found.

        @arg gene: Name of the gene
        @type gene: string
        @arg organism: The organism in which we search.
        @type organism: string
        @arg upstream: Number of upstream nucleotides for the slice.
        @type upstream: integer
        @arg downstream: Number of downstream nucleotides for the slice.
        @type downstream: integer

        @return: The UD number of the resulting slice (or None)
        @rtype: string
        """

        # Search the NCBI for a specific gene in an organism.
        query = "%s[Gene] AND %s[Orgn]" % (gene, organism)
        try:
            handle = Entrez.esearch(db = "gene", term = query)
            searchresult = Entrez.read(handle)
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez esearch: %s' % str(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not search for gene %s.' % gene)
            return None

        ChrAccVer = None        # We did not find anything yet.
        aliases = []            # A list of aliases in case we find them.
        for i in searchresult["IdList"] :                 # Inspect all results.
            try:
                handle = Entrez.esummary(db = "gene", id = i)
                summary = Entrez.read(handle)
                handle.close()
            except (IOError, urllib2.HTTPError, HTTPException) as e:
                self._output.addMessage(__file__, -1, 'INFO',
                                        'Error connecting to Entrez esummary: %s' % str(e))
                self._output.addMessage(__file__, 4, 'ERETR',
                                        'Could not get mapping information for gene %s.' % gene)
                return None

            if summary[0]["NomenclatureSymbol"].lower() == gene.lower() : # Found it.
                if not summary[0]["GenomicInfo"] :
                    self._output.addMessage(__file__, 4, "ENOMAPPING",
                        "No mapping information found for gene %s." % gene)
                    return None
                #if
                ChrAccVer = summary[0]["GenomicInfo"][0]["ChrAccVer"]
                ChrLoc = summary[0]["GenomicInfo"][0]["ChrLoc"]
                ChrStart = summary[0]["GenomicInfo"][0]["ChrStart"]
                ChrStop = summary[0]["GenomicInfo"][0]["ChrStop"]
                break
            #if

            # Collect official symbols that have this gene as an alias, in
            # case we can not find anything.
            if gene in summary[0]["OtherAliases"] and \
                summary[0]["NomenclatureSymbol"] :
                aliases.append(summary[0]["NomenclatureSymbol"])
        #for

        if not ChrAccVer : # We did not find any genes.
            if aliases :
                self._output.addMessage(__file__, 4, "ENOGENE",
                    "Gene %s not found, found aliases: %s" % (gene, aliases))
                return None
            #if
            self._output.addMessage(__file__, 4, "ENOGENE",
                "Gene %s not found." % gene)
            return None
        #if

        # Figure out the orientation of the gene.
        orientation = "1"
        if ChrStart > ChrStop :             # Swap start and stop.
            orientation = "2"
            temp = ChrStart
            ChrStart = ChrStop - downstream # Also take care of the flanking
            ChrStop = temp + upstream + 1   # sequences.
        #if
        else :
            ChrStart -= upstream - 1
            ChrStop += downstream + 2
        #else

        # And retrieve the slice.
        return self.retrieveslice(ChrAccVer, ChrStart, ChrStop, orientation)
    #retrievegene

    def downloadrecord(self, url) :
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        @arg url: Location of a GenBank record
        @type url: string

        @return: UD or None
        @rtype: string
        """
        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "text/plain" :
            length = int(info["Content-Length"])
            if length > config.get('minDldSize') and \
               length < config.get('maxDldSize'):
                raw_data = handle.read()
                md5sum = self._calcHash(raw_data)
                UD = self._database.getGBFromHash(md5sum)
                if UD:  #hash found
                    if not os.path.isfile(self._nametofile(UD)):
                        UD = self.write(raw_data, UD, 0) and UD
                else:
                    UD = self._newUD()
                    if not os.path.isfile(self._nametofile(UD)):
                        UD = self.write(raw_data, UD, 0) and UD
                    if UD:      #Parsing went OK, add to DB
                        self._database.insertGB(UD, None, md5sum,
                                None, 0, 0, 0, url)
                return UD #Returns the UD or None
            #if
            else :
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
                return None
            #else
        #if
        else :
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                     "This is not a GenBank record.")
            return None
        #else
    #downloadrecord

    def uploadrecord(self, raw_data) :
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        @arg raw_data: A GenBank record
        @type raw_data: string

        @return: The UD number of the stored record, or None on failure
        @rtype: string
        """
        md5sum = self._calcHash(raw_data)
        UD = self._database.getGBFromHash(md5sum)
        if not UD :
            UD = self._newUD()
            if self.write(raw_data, UD, 0):
                self._database.insertGB(UD, None, md5sum, None, 0, 0, 0, None)
                return UD
        #if
        else:
            if os.path.isfile(self._nametofile(UD)):
                return UD
            else:
                return self.write(raw_data, UD, 0) and UD
    #uploadrecord

    def loadrecord(self, identifier) :
        """
        Load a record and return it.
        If the filename associated with the accession number is not found
        in the cache, try to re-download it.

        @arg identifier: An accession number
        @type identifier: string

        @return: A GenBank.Record record
        @rtype: object
        """
        if (identifier[0].isdigit()) : # This is a GI identifier.
            name = self._database.getGBFromGI(identifier)
            if name is None:
                self._output.addMessage(__file__, 4, "ERETR",
                                        "Unknown reference: %s" % identifier)
                return
        else :
            name = identifier

        # Make a filename based upon the identifier.
        filename = self._nametofile(name)

        if not os.path.isfile(filename) :   # We can't find the file.
            md5 = self._database.getHash(name)
            if md5:   # We have seen it before though.
                Loc = self._database.getLoc(name)  # Try to find the location.
                if not Loc[0]:              # No location found.
                    url = self._database.getUrl(name)   # Try to find a URL.
                    if not url :
                        if self._database.getGI(name) : # It was from NCBI.
                            filename = self.fetch(name)
                        else :
                            self._output.addMessage(__file__, 4, "ERETR",
                                "Please upload this sequence again.")
                            filename = None
                    #if
                    else :                  # This used to be a downloaded seq.
                        filename = self.downloadrecord(url) and filename
                #if
                else :                      # This used to be a slice.
                    filename = self.retrieveslice(*Loc) and filename
            #if
            else :                          # Never seen this name before.
                filename = self.fetch(name)
            #else
        #if

        # If filename is None, an error occurred.
        if filename is None:
            # Notify batch to skip all instances of identifier.
            self._output.addOutput("BatchFlags", ("S1", identifier))
            return None

        # Now we have the file, so we can parse it.
        GenBankParser = genbank.GBparser()
        record = GenBankParser.create_record(filename, self._output)
        record.id = name

        # Todo: This will change once we support protein references.
        if isinstance(record.seq.alphabet, ProteinAlphabet):
            self._output.addMessage(__file__, 4, 'ENOTIMPLEMENTED',
                                    'Protein reference sequences are not supported.')
            return None

        return record
    #loadrecord
#GenBankRetriever
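
# Illustrative usage sketch (editorial note, not part of the original module;
# the 'output' and 'database' objects come from the surrounding Mutalyzer
# application):
#
#   retriever = GenBankRetriever(output, database)
#   record = retriever.loadrecord('AB026906.1')
#   if record is not None:
#       pass  # record is the parsed reference, ready for further processing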

class LRGRetriever(Retriever):
    """
    Retrieve an LRG record from either the cache or the web.

    Public methods:
        - loadrecord(identifier) ; Load a record, store it in the cache, manage
                                   the cache and return the record.
    """

    def __init__(self, output, database):
        """
        Initialize the class. See Retriever.__init__ for the arguments.
        """
        # Recall init of parent
        Retriever.__init__(self, output, database)
        self.fileType = "xml"
        # Child specific init
    #__init__

    def loadrecord(self, identifier):
        """
        Load and parse an LRG file based on the identifier.

        @arg identifier: The name of the LRG file to read
        @type identifier: string

        @return: record ; GenRecord.Record of the LRG file
                   None ; in case of failure
        @rtype: GenRecord.Record
        """

        # Make a filename based upon the identifier.
        filename = self._nametofile(identifier)

        if not os.path.isfile(filename) :   # We can't find the file.
            filename = self.fetch(identifier)

        if filename is None:                # Return None in case of error.
            # Notify batch to skip all instances of identifier.
            self._output.addOutput("BatchFlags", ("S1", identifier))
            return None

        # Now we have the file, so we can parse it.
        file_handle = bz2.BZ2File(filename, "r")

        # Create GenRecord.Record from the LRG file.
        record = lrg.create_record(file_handle.read())
        file_handle.close()

        # We don't create LRGs from other sources, so id is always the same
        # as source_id.
        record.id = identifier
        record.source_id = identifier

        return record
    #loadrecord

    def fetch(self, name):
        """
        Fetch the LRG file and store it in the cache directory. First try to
        grab the file from the confirmed section; if this fails, get it
        from the pending section.

        @arg name: The name of the LRG file to fetch
        @type name: string

        @return: the full path to the file; None in case of an error
        @rtype: string
        """

        prefix = config.get('lrgurl')
        url        = prefix + "%s.xml"          % name
        pendingurl = prefix + "pending/%s.xml"  % name

        try:
            return self.downloadrecord(url, name)
        except urllib2.URLError:    # Catch error: file not found.
            pass

        try:                # Try to get the file from the pending section.
            filename = self.downloadrecord(pendingurl, name)
            self._output.addMessage(__file__, 2, "WPEND",
                "Warning: LRG file %s is a pending entry." % name)
            return filename
        except urllib2.URLError:
            self._output.addMessage(__file__, 4, "ERETR",
                                 "Could not retrieve %s." % name)
            return None             # Explicit return in case of an error.
    #fetch

    def downloadrecord(self, url, name = None) :
        """
        Download an LRG record from a URL.

        @arg url: Location of the LRG record
        @type url: string
        @arg name: Optional name for the record; if not given it is derived
        from the URL
        @type name: string

        @return:
            - filename    ; The full path to the file
            - None        ; in case of failure
        @rtype: string
        """

        lrgID = name or os.path.splitext(os.path.split(url)[1])[0]
        #if not lrgID.startswith("LRG"):
        #    return None
        filename = self._nametofile(lrgID)

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "application/xml" and info.has_key("Content-length"):
            length = int(info["Content-Length"])
            if config.get('minDldSize') < length < config.get('maxDldSize'):
                raw_data = handle.read()
                handle.close()

                # Do an md5 check.
                md5sum = self._calcHash(raw_data)
                md5db = self._database.getHash(lrgID)
                if md5db is None:
                    self._database.insertLRG(lrgID, md5sum, url)
                elif md5sum != md5db:       # Hash has changed for the LRG ID.
                    self._output.addMessage(__file__, -1, "WHASH",
                        "Warning: Hash of %s changed from %s to %s." % (
                        lrgID, md5db, md5sum))
                    self._database.updateHash(lrgID, md5sum)
                else:                       # Hash is the same as in the database.
                    pass

                if not os.path.isfile(filename) :
                    return self.write(raw_data, lrgID)
                else:
                    # This can only occur if synchronous calls to mutalyzer are
                    # made to recover a file that did not exist. Still leaves
                    # a window in between the check and the write.
                    return filename
            #if
            else :
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
        #if
        else :
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                     "This is not an LRG record.")