Commit 3623a33c authored by Vermaat's avatar Vermaat
Browse files

[wip] Query Reference using accession and version

While working on this, I came to the conclusion it's not a good idea to
split accession and version. It introduces a lot of complexity for
little benefit.

In general, Mutalyzer always sees 'accession.version' as the identifier
of the reference and because we always want exact identifiers, there is
little need for accession numbers without version.

The most obvious use case I see for a split is that we can easily query
available references with a certain accession, not taking version into
account, as a way to inform the user when a specific reference
identifier was not found. But I guess we better have this use case as
the exception, and make our life easier for the rest.

So I guess I'm aborting this for now. Addition of the `version` column
has already landed in the master branch, but this is easy to roll
back. The original column has not yet been touched in master.
parent 0796d339
......@@ -4,7 +4,9 @@ sudo: false
cache: pip
python: "2.7"
apt_packages: swig
sources: travis-ci/sqlite3
packages: [swig, sqlite3]
postgresql: "9.4"
services: redis-server
......@@ -129,18 +129,21 @@ class Retriever(object):
ud = util.generate_id()
return 'UD_' + unicode(ud)
def _update_db_md5(self, raw_data, name, source):
def _update_db_md5(self, raw_data, accession, source, version=None):
:arg str raw_data:
:arg unicode name:
:arg unicode source:
:arg str raw_data: Reference file contents.
:arg unicode accession: Accession number, without version.
:arg unicode source: Source of the reference file.
:arg int version: Optional accession version number.
:returns: filename
:rtype: unicode
# TODO: Documentation.
reference = Reference.query.filter_by(accession=name).one()
reference = Reference.query.filter_by(
current_md5sum = reference.checksum
except NoResultFound:
current_md5sum = None
......@@ -151,15 +154,20 @@ class Retriever(object):
__file__, -1, 'WHASH',
'Warning: Hash of {} changed from {} to {}.'.format(
name, current_md5sum, md5sum))
{'checksum': md5sum})
util.accession_with_version(accession, version),
current_md5sum, md5sum))
).update({'checksum': md5sum})
reference = Reference(name, self._calculate_hash(raw_data), source)
reference = Reference(accession, self._calculate_hash(raw_data),
source, version=version)
return self._name_to_file(name)
return self._name_to_file(
util.accession_with_version(accession, version))
def snpConvert(self, rs_id):
......@@ -455,6 +463,9 @@ class GenBankRetriever(Retriever):
assert reference.version is None
if reference and os.path.isfile(self._name_to_file(reference.accession)):
# It's still present.
return reference.accession
......@@ -490,7 +501,8 @@ class GenBankRetriever(Retriever):
'Warning: Hash of {} changed from {} to {}.'.format(
reference.accession, current_md5sum, md5sum))
accession=reference.accession).update({'checksum': md5sum})
).update({'checksum': md5sum})
# We haven't seen it before, so give it a name.
......@@ -649,7 +661,7 @@ class GenBankRetriever(Retriever):
:arg unicode url: Location of a GenBank record.
:returns: UD or None.
:returns: UD or accession including version number or None.
:rtype: unicode
if not (url.startswith('http://') or url.startswith('https://') or
......@@ -682,9 +694,14 @@ class GenBankRetriever(Retriever):
if (os.path.isfile(self._name_to_file(reference.accession)) or
# We found an existing entry with the same checksum. This
# could just be a regular RefSeq which is redownloaded,
# so we should also handle potential version numbers here.
acc_with_version = util.accession_with_version(
reference.accession, reference.version)
if (os.path.isfile(self._name_to_file(acc_with_version)) or
self.write(raw_data, reference.accession, 0)):
ud = reference.accession
ud = acc_with_version
# Returns the UD or None.
return ud
......@@ -706,7 +723,8 @@ class GenBankRetriever(Retriever):
:arg str raw_data: A GenBank record.
:returns: Accession number for the uploaded file.
:returns: Accession number for the uploaded file, potentially
including version number.
:rtype: unicode
md5sum = self._calculate_hash(raw_data)
......@@ -721,13 +739,18 @@ class GenBankRetriever(Retriever):
return ud
if os.path.isfile(self._name_to_file(reference.accession)):
return reference.accession
# We found an existing entry with the same checksum. This could
# just be a regular RefSeq which is reuploaded, so we should also
# handle potential version numbers here.
acc_with_version = util.accession_with_version(reference.accession,
if os.path.isfile(self._name_to_file(acc_with_version)):
return acc_with_version
return (self.write(raw_data, reference.accession, 0) and
return (self.write(raw_data, acc_with_version, 0) and
def loadrecord(self, accession):
def loadrecord(self, accession, version=None):
Load a RefSeq record and return it.
......@@ -740,6 +763,7 @@ class GenBankRetriever(Retriever):
3. Fetched from the NCBI.
:arg unicode accession: A RefSeq accession number.
:arg int version: Accession version.
:returns: A parsed RefSeq record or `None` if no record could be found
for the given accession.
......@@ -14,10 +14,13 @@ from sqlalchemy import event, or_
from sqlalchemy import (Boolean, Column, DateTime, Enum, ForeignKey, Index,
Integer, String, Text, TypeDecorator)
from sqlalchemy.engine import Engine
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import backref, relationship
from sqlalchemy.sql import expression, func
from mutalyzer import db
from mutalyzer import util
BATCH_JOB_TYPES = ('name-checker',
......@@ -65,6 +68,26 @@ class Positions(TypeDecorator):
return [int(s) for s in value.split(',')]
class accession_without_version(expression.FunctionElement):
    # Custom SQL function element; how it is rendered is decided by the
    # dialect-specific compile functions defined for this class.
    # Declared result type of the expression is a string.
    type = String()
    name = 'accession_without_version'
def default_accession_without_version(element, compiler, **kw):
    """
    Render the function element as ``SPLIT_PART(<argument>, '.', 1)``.

    :arg element: Function element; its first clause is the accession
      expression.
    :arg compiler: SQL compiler used to render the argument expression.

    :returns: SQL fragment as a string.
    """
    # NOTE(review): presumably registered through `@compiles` as the default
    # rendering; no decorator is visible in this view -- confirm.
    argument_sql = compiler.process(element.clauses.clauses[0])
    return "SPLIT_PART({}, '.', 1)".format(argument_sql)
@compiles(accession_without_version, 'sqlite')
def sqlite_accession_without_version(element, compiler, **kw):
accession = element.clauses.clauses[0]
return ("CASE WHEN INSTR({0}, '.') > 0 "
"THEN SUBSTR({0}, 1, INSTR({0}, '.') - 1) "
"ELSE {0} "
class BatchJob(db.Base):
Batch job.
......@@ -188,27 +211,56 @@ class Reference(db.Base):
#: Date and time of creation.
added = Column(DateTime)
def __init__(self, accession, checksum, source, source_data=None):
def __init__(self, accession, checksum, source, version=None,
self.accession = accession
self.version = version
self.checksum = checksum
self.source = source
self.source_data = source_data
self.added =
def __repr__(self):
return '<Reference %r>' % self.accession
return '<Reference accession=%r, version=%r, source=%r>' % (
self.accession, self.version, self.source)
def accession(self):
return self._accession
return self._accession.split('.')[0]
def accession(self, accession):
self._accession = accession
def accession(cls):
return accession_without_version(cls._accession)
def name(self):
    """
    Complete name for this reference. Combination of `accession` and
    `version` if applicable, just `accession` otherwise (e.g.,
    ``AL449423.14``, ``NM_000059.3``, ``UD_138781341344``).
    """
    # The helper takes (accession, version); passing them the other way
    # around would format the version as the accession and fail or yield
    # a wrong name.
    return util.accession_with_version(self.accession, self.version)
def name(self, name):
self.version = int(accession.split('.', 1)[1])
except (IndexError, ValueError):
accession, version = name.split('.')
version = int(version)
except ValueError:
accession = name
version = None
self.accession = accession
self.version = version
def name(cls):
return func.coalesce(
cls.accession + '.' + expression.cast(cls.version, String),
......@@ -37,6 +37,12 @@ from Bio.SeqUtils import seq3
from extractor.describe import palinsnoop, roll
def accession_with_version(accession, version=None):
    """
    Combine an accession number with an optional version number.

    :arg accession: Accession number, without version.
    :arg int version: Optional accession version number.

    :returns: `accession` unchanged when `version` is `None`, otherwise
      `accession` and `version` joined by a dot.
    """
    if version is not None:
        return '%s.%i' % (accession, version)
    return accession
def reverse_complement(sequence):
Reverse complement of a sequence represented as unicode string.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment