Commit 8fe2acfe authored by Vermaat's avatar Vermaat
Browse files

wip

parent c7c614f0
......@@ -285,7 +285,7 @@ class Record(object) :
self.seq = ""
self.mapping = []
self.organelle = None
self.source = Gene(None)
self.source = Gene(None) # TODO: seems to be unused
self.description = ""
self._sourcetype = None #LRG or GB
self.version = None
......
......@@ -332,7 +332,7 @@ class GenBankRetriever(Retriever):
return out_filename, gi
def fetch(self, name):
def fetch(self, accession):
"""
Todo: Documentation.
......@@ -739,72 +739,62 @@ class GenBankRetriever(Retriever):
return (self.write(raw_data, reference.accession, 0) and
reference.accession)
def loadrecord(self, identifier):
def loadrecord(self, accession, version):
"""
Load a RefSeq record and return it.
The record is found by trying the following options in order:
1. Returned from the cache if it is there.
1. Returned from the database if it is there.
2. Re-created (if it was created by slicing) or re-downloaded (if it
was created by URL) if we have information on its source in the
database.
3. Fetched from the NCBI.
:arg unicode identifier: A RefSeq accession number or geninfo
identifier (GI).
:arg unicode accession: A RefSeq accession number.
:arg int version: Version of the accession number.
:returns: A parsed RefSeq record or `None` if no record could be found
for the given identifier.
:rtype: object
"""
if identifier[0].isdigit():
# This is a GI number (geninfo identifier).
reference = Reference.query \
.filter_by(geninfo_identifier=identifier) \
.first()
else:
# This is a RefSeq accession number.
reference = Reference.query \
.filter_by(accession=identifier) \
.first()
reference = Reference.query.filter_by(
accession=accession, version=version
).first()
if reference is None:
# We don't know it, fetch it from NCBI.
filename = self.fetch(identifier)
else:
# We have seen it before.
filename = self._name_to_file(reference.accession)
if os.path.isfile(filename):
# It is still in the cache, so filename is valid.
pass
elif reference.slice_accession:
# It was previously created by slicing.
cast_orientation = {
None: None, 'forward': 1, 'reverse': 2}
if not self.retrieveslice(
reference.slice_accession, reference.slice_start,
reference.slice_stop,
cast_orientation[reference.slice_orientation]):
filename = None
elif reference.download_url:
# It was previously created by URL.
if not self.downloadrecord(reference.download_url):
filename = None
elif reference.geninfo_identifier:
# It was previously fetched from NCBI.
filename = self.fetch(reference.accession)
else:
# It was previously created by uploading.
self._output.addMessage(
__file__, 4, 'ERETR', 'Please upload this sequence again.')
filename = None
reference = self.fetch(accession, version)
# TODO: Also fetch if the sequence is not in the cach.
#if need_sequence and not sequence_in_cache(accession, version):
# reference = self.fetch(accession, version)
# TODO: should be handled in self.fetch
# elif reference.slice_accession:
# # It was previously created by slicing.
# cast_orientation = {
# None: None, 'forward': 1, 'reverse': 2}
# if not self.retrieveslice(
# reference.slice_accession, reference.slice_start,
# reference.slice_stop,
# cast_orientation[reference.slice_orientation]):
# filename = None
# elif reference.download_url:
# # It was previously created by URL.
# if not self.downloadrecord(reference.download_url):
# filename = None
# elif reference.geninfo_identifier:
# # It was previously fetched from NCBI.
# filename = self.fetch(reference.accession)
# else:
# # It was previously created by uploading.
# self._output.addMessage(
# __file__, 4, 'ERETR', 'Please upload this sequence again.')
# filename = None
# If filename is None, we could not retrieve the record.
if filename is None:
......
......@@ -152,17 +152,32 @@ class Reference(db.Base):
id = Column(Integer, primary_key=True)
#: Accession number for this reference, including the version number if
#: applicable (e.g., ``AL449423.14``, ``NM_000059.3``,
#: ``UD_138781341344``).
accession = Column(String(20), nullable=False, index=True, unique=True)
#: Accession number for this reference, not including the version number
#: (e.g., ``AL449423``, ``NM_000059``, ``LRG_23``, ``UD_138781341344``).
accession = Column(String(20), nullable=False)
#: Accession version (e.g., 3, 2). Not applicaple when `reference_type` is
#: ``lrg``.
version = Column(Integer)
#: Type of reference.
reference_type = Column(Enum('refseq', 'lrg', name='reference_type'),
nullable=False)
#: Enclosing organelle. Used to differentiate between references requiring
#: ``m.`` and ``g.`` descriptions.
organelle = Column(Enum('nucleus', 'mitochondrion', name='organelle'),
nullable=False)
#: Type of sequence.
sequence_type = Column(Enum('dna', 'rna', 'protein', name='sequence_type'),
nullable=False)
#: MD5 checksum of the reference file.
# TODO: keeping the checksum might be useful (e.g., to determine if we
# should update the database), but do we need the index and unique?
checksum = Column(String(32), nullable=False, index=True, unique=True)
#: The corresponding GI number, if available.
geninfo_identifier = Column(String(13), index=True, unique=True)
#: The accession number from which we took a slice, if available.
slice_accession = Column(String(20))
......@@ -185,12 +200,14 @@ class Reference(db.Base):
#: Date and time of creation.
added = Column(DateTime)
def __init__(self, accession, checksum, geninfo_identifier=None,
slice_accession=None, slice_start=None, slice_stop=None,
slice_orientation=None, download_url=None):
def __init__(self, accession, reference_type, organelle, sequence_type,
checksum, slice_accession=None, slice_start=None,
slice_stop=None, slice_orientation=None, download_url=None):
self.accession = accession
self.reference_type = reference_type
self.organelle = organelle
self.sequence_type = sequence_type
self.checksum = checksum
self.geninfo_identifier = geninfo_identifier
self.slice_accession = slice_accession
self.slice_start = slice_start
self.slice_stop = slice_stop
......@@ -202,12 +219,113 @@ class Reference(db.Base):
return '<Reference %r>' % self.accession
Index('reference_accession_version',
Reference.accession, Reference.version,
unique=True)
Index('reference_slice',
Reference.slice_accession, Reference.slice_start, Reference.slice_stop,
Reference.slice_orientation,
unique=True)
class Transcript(db.Base):
"""
Mapping of a gene transcript on a reference.
.. note:: All positions are ordered according to the chromosome,
irrespective of transcript orientation. For example, `start` is always
smaller than `stop`.
"""
__tablename__ = 'transcripts'
__table_args__ = {'mysql_engine': 'InnoDB', 'mysql_charset': 'utf8'}
id = Column(Integer, primary_key=True)
reference_id = Column(Integer,
ForeignKey('references.id', ondelete='CASCADE'),
nullable=False)
#: Gene symbol (e.g., ``DMD``, ``PSEN1``, ``TRNS1``).
gene = Column(String(30), nullable=False)
#: Transcript number in reference (e.g., 1, 2).
transcript = Column(Integer, nullable=False)
# Todo: Use constants and some utilities for conversion.
#: The orientation of the transcript on the chromosome.
orientation = Column(Enum('forward', 'reverse', name='orientation'),
nullable=False)
#: The start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
start = Column(Integer, nullable=False)
#: The stop position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
stop = Column(Integer, nullable=False)
#: The CDS start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
cds_start = Column(Integer)
#: The CDS stop position of the transcript on the chromosome (one-based,
#: inclusive).
cds_stop = Column(Integer)
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_starts = Column(Positions, nullable=False)
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_stops = Column(Positions, nullable=False)
#: The :class:`Assembly` this chromosome is in.
reference = relationship(
Reference,
lazy='joined',
innerjoin=True,
backref=backref('transcripts', lazy='dynamic',
cascade='all, delete-orphan',
passive_deletes=True))
def __init__(self, reference, gene, orientation, start, stop, exon_starts,
exon_stops, transcript=1, cds=None):
self.reference = reference
self.gene = gene
self.orientation = orientation
self.start = start
self.stop = stop
self.exon_starts = exon_starts
self.exon_stops = exon_stops
self.transcript = transcript
self.cds = cds
def __repr__(self):
return ('<Transcript gene=%r transcript=%r>'
% (self.gene, self.transcript))
@property
def coding(self):
"""
Set to `True` iff the transcript is coding.
"""
return self.cds_start is not None and self.cds_stop is not None
@property
def cds(self):
"""
Tuple of CDS start and stop positions on the chromosome, or `None` if
the transcript is non-coding.
"""
if self.coding:
return self.cds_start, self.cds_stop
return None
@cds.setter
def cds(self, cds):
self.cds_start, self.cds_stop = cds or (None, None)
class Assembly(db.Base):
"""
Genome assembly.
......
......@@ -510,6 +510,7 @@ class GBparser():
record.molType = 'm'
#if
# TODO: record.source seems to be unused.
fakeGene = Locus("001")
record.source.transcriptList.append(fakeGene)
fakeGene.CDS = PList()
......
......@@ -20,6 +20,7 @@ from __future__ import unicode_literals
from operator import attrgetter
import Bio.Seq
from Bio.Data import CodonTable
from Bio.Alphabet import IUPAC
from Bio.Alphabet import DNAAlphabet
......@@ -27,7 +28,7 @@ from Bio.Alphabet import ProteinAlphabet
from Bio.Alphabet import _verify_alphabet
from mutalyzer import util
from mutalyzer.db.models import Assembly
from mutalyzer.db.models import Assembly, Reference
from mutalyzer.grammar import Grammar
from mutalyzer.mutator import Mutator
from mutalyzer.mapping import Converter
......@@ -1740,7 +1741,9 @@ def check_variant(description, output):
'Indexing by protein isoform is not supported.')
retriever = Retriever.GenBankRetriever(output)
retrieved_record = retriever.loadrecord(record_id)
retrieved_record = record_from_database(record_id)
if not retrieved_record:
retrieved_record = retriever.loadrecord(record_id)
if not retrieved_record:
return
......@@ -2014,3 +2017,28 @@ def check_variant(description, output):
_add_batch_output(output)
#check_variant
def record_from_database(accession):
reference = Reference.query.filter_by(accession=accession).first()
if reference is None:
return
record = GenRecord.Record()
# TODO: shouldn't have to be a seq record
record.seq = Bio.Seq.Seq(reference.sequence, DNAAlphabet())
record.id = accession
record.source_id = accession
try:
record.source_accession, record.source_version = accession.rsplit(',', 1)
except ValueError:
record.source_accession = accession
record.source_version = '1'
record.source_gi = reference.geninfo_identifier
record.organism = 'todo' # biorecord.annotations['organism']
#record.organelle = 'mitochondrion'
record.molType = 'n' if accession.startswith('NM') else 'g' # TODO: m.
return record
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment