Commit 272f690f authored by Jeroen F.J. Laros
Browse files

PEP-8 rewrite except for public function names.

parent 3e462e82
......@@ -4,108 +4,78 @@ Module for retrieving files from either the cache or the NCBI.
A hash of every retrieved file is stored in the internal database. If a
requested file is not found, but its hash is, we use additional information
to re-download the file.
Public classes:
- Retriever ; Retrieve a record from either the cache or the NCBI.
"""
from __future__ import unicode_literals
import bz2
import chardet
import hashlib
import io
import os # path.isfile(), link() path.isdir(), path.mkdir(),
# walk(), path.getsize(), path.join(), stat(), remove()
import os
import time
import bz2 # BZ2Compressor(), BZ2File()
import hashlib # md5(), update(), hexdigest()
import urllib2 # urlopen()
from Bio import SeqIO # read()
from Bio import Entrez # efetch(), read(), esearch(), esummary()
from Bio.Seq import UnknownSeq
import urllib2
from Bio import Entrez
from Bio import SeqIO
from Bio.Alphabet import ProteinAlphabet
from xml.dom import DOMException, minidom
from xml.parsers import expat
from Bio.Seq import UnknownSeq
from httplib import HTTPException, IncompleteRead
from sqlalchemy.orm.exc import NoResultFound
import chardet
from xml.dom import DOMException, minidom
from xml.parsers import expat
from mutalyzer import util
from mutalyzer.config import settings
from mutalyzer.db import session
from mutalyzer.db.models import Reference
from mutalyzer.parsers import lrg
from mutalyzer.parsers import genbank
from mutalyzer.parsers import lrg
ENTREZ_MAX_TRIES = 4
ENTREZ_SLEEP = 1 # in seconds
ENTREZ_SLEEP = 1 # In seconds.
class Retriever(object) :
class Retriever(object):
"""
Retrieve a record from either the cache or the NCBI.
Special methods:
- __init__(output, database) ; Use variables from the
configuration file to initialise the class private variables.
Private methods:
- _nametofile(name) ; Convert a name to a filename.
- _write(raw_data, filename, extract) ; Write a record to a file.
- _calcHash(content) ; Calculate the md5sum of 'content'.
- _newUD() ; Generate a new UD number.
Public methods:
- retrieveslice(accno, start, stop, orientation) ; Retrieve a chromosome
slice from the NCBI.
- retrievegene(gene, organism, upstream, downstream) ; Retrieve a gene
from the NCBI.
- downloadrecord(url) ; Download a GenBank file.
- uploadrecord(raw_data) ; Let someone upload a GenBank file.
- loadrecord(identifier) ; Load a record, store it in the cache, manage
the cache and return the record.
"""
def __init__(self, output):
    """
    Set up the retriever from the configuration settings.

    The cache directory named by `settings.CACHE_DIR` is created when it
    is not present yet, and the Entrez contact address is taken from
    `settings.EMAIL`.

    :arg object output: The output object.
    """
    self._output = output
    # Subclasses are expected to set the file type (it is used to build
    # cache filenames).
    self.file_type = None

    cache_dir = settings.CACHE_DIR
    if not os.path.isdir(cache_dir):
        os.mkdir(cache_dir)

    Entrez.email = settings.EMAIL
def _name_to_file(self, name):
    """
    Build the cache filename that belongs to an accession number.

    :arg unicode name: The accession number.

    :returns unicode: Path inside the cache directory, of the form
      `<name>.<file_type>.bz2`.
    """
    cache_file = '{}.{}.bz2'.format(name, self.file_type)
    return os.path.join(settings.CACHE_DIR, cache_file)
def _write(self, raw_data, filename) :
def _write(self, raw_data, filename):
"""
Write raw data to a compressed file.
@arg raw_data: The raw_data to be compressed and written
@type raw_data: byte string
@arg filename: The intended name of the outfile
@type filename: unicode
:arg str raw_data: The raw_data to be compressed and written.
:arg unicode filename: The intended name of the output filename.
@return: outfile ; The full path and name of the file written
@rtype: unicode
:returns unicode: The full path and name of the file written.
"""
result = chardet.detect(raw_data)
if result['confidence'] > 0.5:
......@@ -117,108 +87,95 @@ class Retriever(object) :
try:
raw_data = raw_data.decode(encoding).encode('utf-8')
except UnicodeDecodeError:
self._output.addMessage(__file__, 4, 'ENOPARSE',
'Could not decode file (using %s encoding).'
% encoding)
self._output.addMessage(
__file__, 4, 'ENOPARSE',
'Could not decode file (using {} encoding).'.format(
encoding))
return None
# Compress the data to save disk space.
comp = bz2.BZ2Compressor()
data = comp.compress(raw_data)
data += comp.flush()
out_handle = open(self._nametofile(filename), "wb")
out_handle = open(self._name_to_file(filename), 'wb')
out_handle.write(data)
out_handle.close()
return out_handle.name # return the full path to the file
#_write
return out_handle.name # Return the full path to the file.
def _calcHash(self, content) :
def _calculate_hash(self, content):
"""
Calculate the md5sum of a piece of text.
@arg content: Arbitrary text
@type content: byte string
:arg unicode content: Arbitrary text.
@return: The md5sum of 'content'
@rtype: unicode
:returns unicode: The md5sum of 'content'.
"""
hashfunc = hashlib.md5()
hashfunc.update(content)
md5sum = hashfunc.hexdigest()
del hashfunc
hash_func = hashlib.md5()
hash_func.update(content)
md5sum = hash_func.hexdigest()
return unicode(md5sum)
#_calcHash
def _new_ud(self):
    """
    Make a new UD number.

    The numeric part is whatever `util.generate_id()` returns; it is
    prefixed with 'UD_'.

    :returns unicode: A new UD number.
    """
    return 'UD_' + unicode(util.generate_id())
def _update_db_md5(self, raw_data, name, gi):
    """
    Synchronise the checksum stored in the database with a record.

    When a reference with accession `name` is already stored with a
    checksum, the md5sum of `raw_data` is compared to it; if they differ a
    warning is issued and the stored checksum is replaced. When no
    reference (or no checksum) is found, a new reference is added.

    :arg str raw_data: Record content the checksum is calculated from.
    :arg unicode name: Accession of the reference.
    :arg unicode gi: GI number, stored with a newly added reference.

    :returns unicode: The cache filename that belongs to `name`.
    """
    try:
        stored = Reference.query.filter_by(accession=name).one()
        stored_md5sum = stored.checksum
    except NoResultFound:
        stored_md5sum = None

    md5sum = self._calculate_hash(raw_data)

    if not stored_md5sum:
        # Nothing usable in the database yet, so register the reference.
        session.add(Reference(name, md5sum, geninfo_identifier=gi))
        session.commit()
    elif md5sum != stored_md5sum:
        # The content behind this accession changed since it was stored.
        self._output.addMessage(
            __file__, -1, 'WHASH',
            'Warning: Hash of {} changed from {} to {}.'.format(
                name, stored_md5sum, md5sum))
        Reference.query.filter_by(accession=name).update(
            {'checksum': md5sum})
        session.commit()

    return self._name_to_file(name)
def snpConvert(self, rs_id) :
def snpConvert(self, rs_id):
"""
Search for an rsId in dbSNP and return all annotated HGVS notations of
it.
@arg rsId: The rsId of the SNP (example: 'rs9919552').
@type rsId: unicode
:arg unicode rs_id: The rsId of the SNP (example: 'rs9919552').
@return: A list of HGVS notations.
@rtype: list(unicode)
:returns list(unicode): A list of HGVS notations.
"""
# A simple input check.
id = rs_id[2:]
if rs_id[:2] != 'rs' or not id.isdigit():
self._output.addMessage(__file__, 4, 'ESNPID',
'This is not a valid dbSNP id.')
self._output.addMessage(
__file__, 4, 'ESNPID', 'This is not a valid dbSNP id.')
return []
# Query dbSNP for the SNP. The following weird construct is to catch
......@@ -227,37 +184,37 @@ class Retriever(object) :
# Todo: maybe also implement this for other Entrez queries?
for i in range(ENTREZ_MAX_TRIES - 1):
try:
response = Entrez.efetch(db='snp', id=id, rettype='flt',
retmode='xml')
response = Entrez.efetch(
db='snp', id=id, rettype='flt', retmode='xml')
break
except (IOError, HTTPException):
time.sleep(ENTREZ_SLEEP)
else:
try:
response = Entrez.efetch(db='snp', id=id, rettype='flt',
retmode='xml')
response = Entrez.efetch(
db='snp', id=id, rettype='flt', retmode='xml')
except (IOError, HTTPException) as e:
# Could not connect to dbSNP, even after retrying.
self._output.addMessage(__file__, 4, 'EENTREZ',
'Error connecting to dbSNP.')
self._output.addMessage(__file__, -1, 'INFO',
'IOError: %s' % unicode(e))
self._output.addMessage(
__file__, 4, 'EENTREZ', 'Error connecting to dbSNP.')
self._output.addMessage(
__file__, -1, 'INFO', 'IOError: {}'.format(unicode(e)))
return []
try:
response_text = response.read()
except IncompleteRead as e:
self._output.addMessage(__file__, 4, 'EENTREZ',
'Error reading from dbSNP.')
self._output.addMessage(__file__, -1, 'INFO',
'IncompleteRead: %s' % unicode(e))
self._output.addMessage(
__file__, 4, 'EENTREZ', 'Error reading from dbSNP.')
self._output.addMessage(
__file__, -1, 'INFO', 'IncompleteRead: {}'.format(unicode(e)))
return []
if response_text.strip() == b'\n':
# This is apparently what dbSNP returns for non-existing dbSNP id
self._output.addMessage(__file__, 4, 'EENTREZ',
'ID rs%s could not be found in dbSNP.' \
% id)
self._output.addMessage(
__file__, 4, 'EENTREZ',
'ID rs{} could not be found in dbSNP.'.format(id))
return []
try:
......@@ -266,19 +223,23 @@ class Retriever(object) :
rs = doc.getElementsByTagName('Rs')[0]
except expat.ExpatError as e:
# Could not parse XML.
self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
'error. Error parsing result XML.')
self._output.addMessage(__file__, -1, 'INFO',
'ExpatError: %s' % unicode(e))
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
self._output.addMessage(
__file__, 4, 'EENTREZ',
'Unknown dbSNP error. Error parsing result XML.')
self._output.addMessage(
__file__, -1, 'INFO', 'ExpatError: {}'.format(unicode(e)))
self._output.addMessage(
__file__, -1, 'INFO', 'Result from dbSNP: {}'.format(
unicode(response_text, 'utf-8')))
return []
except IndexError:
# The expected root element is not present.
self._output.addMessage(__file__, 4, 'EENTREZ', 'Unknown dbSNP ' \
'error. Result XML was not as expected.')
self._output.addMessage(__file__, -1, 'INFO',
'Result from dbSNP: %s' % unicode(response_text, 'utf-8'))
self._output.addMessage(
__file__, 4, 'EENTREZ',
'Unknown dbSNP error. Result XML was not as expected.')
self._output.addMessage(
__file__, -1, 'INFO', 'Result from dbSNP: {}'.format(
unicode(response_text, 'utf-8')))
return []
snps = []
......@@ -286,25 +247,23 @@ class Retriever(object) :
snps.append(i.lastChild.data)
return snps
#snpConvert
#Retriever
class GenBankRetriever(Retriever):
# TODO documentation
"""
"""
def __init__(self, output):
    """
    Initialise the retriever for GenBank ('gb') files.

    :arg object output: The output object.
    """
    # Generic initialisation is done by the parent class.
    super(GenBankRetriever, self).__init__(output)

    # GenBank specific initialisation.
    self.file_type = 'gb'
def write(self, raw_data, filename, extract):
    """
    Write raw data to a file. The data is parsed before writing; if a
    parse error occurs, an error is reported and the function exits.

    If `extract` is set, the record id and GI number are taken from the
    parsed record and returned for further processing (putting them in the
    internal database).

    :arg str raw_data: The data.
    :arg unicode filename: The intended name of the file.
    :arg int extract: Flag that indicates whether to extract the record ID
      and GI number:
      - 0 ; Do not extract, use 'filename'
      - 1 ; Extract

    :returns tuple(unicode, unicode): Depending on the value of 'extract':
      - 0 ; ('filename', None)
      - 1 ; (id, gi), where gi is None if the record has no GI annotation
      None is returned if the record could not be retrieved or parsed,
      contains no sequence, or could not be written.
    """
    if raw_data.strip() == b'Nothing has been found':
        self._output.addMessage(
            __file__, 4, 'ENORECORD', 'The record could not be retrieved.')
        return None

    # BioPython needs a file handle.
    fake_handle = io.BytesIO(raw_data)
    try:
        record = SeqIO.read(fake_handle, 'genbank')
    except (ValueError, AttributeError):
        # An error occurred while parsing.
        self._output.addMessage(
            __file__, 4, 'ENOPARSE', 'The file could not be parsed.')
        return None

    # isinstance() also recognises subclasses of UnknownSeq.
    if isinstance(record.seq, UnknownSeq):
        self._output.addMessage(
            __file__, 4, 'ENOSEQ',
            'This record contains no sequence. Chromosomal or contig '
            'records should be uploaded with the GenBank uploader.')
        return None

    out_filename = filename
    gi = None
    if extract:
        out_filename = unicode(record.id)
        # Not every record carries a GI annotation; a missing one should
        # not abort the retrieval with a KeyError.
        if 'gi' in record.annotations:
            gi = unicode(record.annotations['gi'])
        if out_filename != filename:
            # Add the reference (incl version) to the reference output
            # This differs if the original reference lacks a version
            self._output.addOutput('reference', unicode(record.id))
            self._output.addOutput(
                'BatchFlags',
                ('A1', (filename, out_filename, filename + '.')))
            self._output.addMessage(
                __file__, 2, 'WNOVER',
                'No version number is given, using {}. Please use this '
                'number to reduce downloading overhead.'.format(
                    unicode(record.id)))

    if not self._write(raw_data, out_filename):
        return None

    return out_filename, gi
def fetch(self, name) :
def fetch(self, name):
"""
Todo: Documentation.
......@@ -389,24 +341,29 @@ class GenBankRetriever(Retriever):
use efetch with rettype=gbwithparts to download the GenBank file.
"""
try:
net_handle = Entrez.efetch(db='nuccore', id=name, rettype='gb', retmode='text')
net_handle = Entrez.efetch(
db='nuccore', id=name, rettype='gb', retmode='text')
raw_data = net_handle.read()
net_handle.close()
except (IOError, urllib2.HTTPError, HTTPException) as e:
self._output.addMessage(__file__, -1, 'INFO',
'Error connecting to Entrez nuccore database: %s' % unicode(e))
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
self._output.addMessage(
__file__, -1, 'INFO',
'Error connecting to Entrez nuccore database: {}'.format(
unicode(e)))
self._output.addMessage(
__file__, 4, 'ERETR', 'Could not retrieve {}.'.format(name))
return None
if raw_data.strip() == b'': # Check if the file is empty or not.
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
if raw_data.strip() == b'': # Check if the file is empty or not.
self._output.addMessage(
__file__, 4, 'ERETR', 'Could not retrieve {}.'.format(name))
return None
if b'Resource temporarily unavailable' in raw_data:
self._output.addMessage(__file__, 4, 'ERETR',
'Resource temporarily unavailable from NCBI servers: %s.' % name)
self._output.addMessage(
__file__, 4, 'ERETR',
'Resource temporarily unavailable from NCBI servers: ' \
'{}.'.format(name))
return None
# This is a hack to detect constructed references, the proper way to
......@@ -415,37 +372,45 @@ class GenBankRetriever(Retriever):
if b'\nCONTIG' in raw_data:
try:
# Get the length in base pairs
length = int(raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1])
length = int(
raw_data[:raw_data.index(b' bp', 0, 500)].split()[-1])
except ValueError, IndexError:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
self._output.addMessage(
__file__, 4, 'ERETR', 'Could not retrieve {}.'.format(
name))
return None
if length > settings.MAX_FILE_SIZE:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
self._output.addMessage(
__file__, 4, 'ERETR', 'Could not retrieve {}.'.format(
name))
return None
try:
net_handle = Entrez.efetch(db='nuccore', id=name, rettype='gbwithparts', retmode='text')
net_handle = Entrez.efetch(
db='nuccore', id=name, rettype='gbwithparts',
retmode='text')
raw_data = net_handle.read()
net_handle.close()
except (IOError, urllib2.HTTPError, HTTPException) as e:
self._output.addMessage(__file__, -1, 'INFO',
'Error connecting to Entrez nuccore database: %s' % unicode(e))
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve %s.' % name)
self._output.addMessage(
__file__, -1, 'INFO',
'Error connecting to Entrez nuccore database: {}'.format(
unicode(e)))
self._output.addMessage(
__file__, 4, 'ERETR', 'Could not retrieve {}.'.format(
name))
return None
result = self.write(raw_data, name, 1)
if not result:
return None
name, GI = result
if name: # Processing went okay.
return self._updateDBmd5(raw_data, name, GI)