Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mutalyzer
mutalyzer2
Commits
8fe2acfe
Commit
8fe2acfe
authored
Mar 14, 2016
by
Vermaat
Browse files
wip
parent
c7c614f0
Changes
5
Hide whitespace changes
Inline
Side-by-side
mutalyzer/GenRecord.py
View file @
8fe2acfe
...
...
@@ -285,7 +285,7 @@ class Record(object) :
self
.
seq
=
""
self
.
mapping
=
[]
self
.
organelle
=
None
self
.
source
=
Gene
(
None
)
self
.
source
=
Gene
(
None
)
# TODO: seems to be unused
self
.
description
=
""
self
.
_sourcetype
=
None
#LRG or GB
self
.
version
=
None
...
...
mutalyzer/Retriever.py
View file @
8fe2acfe
...
...
@@ -332,7 +332,7 @@ class GenBankRetriever(Retriever):
return
out_filename
,
gi
def
fetch
(
self
,
name
):
def
fetch
(
self
,
accession
):
"""
Todo: Documentation.
...
...
@@ -739,72 +739,62 @@ class GenBankRetriever(Retriever):
return
(
self
.
write
(
raw_data
,
reference
.
accession
,
0
)
and
reference
.
accession
)
def
loadrecord
(
self
,
identifier
):
def
loadrecord
(
self
,
accession
,
version
):
"""
Load a RefSeq record and return it.
The record is found by trying the following options in order:
1. Returned from the
cach
e if it is there.
1. Returned from the
databas
e if it is there.
2. Re-created (if it was created by slicing) or re-downloaded (if it
was created by URL) if we have information on its source in the
database.
3. Fetched from the NCBI.
:arg unicode
identifier
: A RefSeq accession number
or geninfo
identifier (GI)
.
:arg unicode
accession
: A RefSeq accession number
.
:arg int version: Version of the accession number
.
:returns: A parsed RefSeq record or `None` if no record could be found
for the given identifier.
:rtype: object
"""
if
identifier
[
0
].
isdigit
():
# This is a GI number (geninfo identifier).
reference
=
Reference
.
query
\
.
filter_by
(
geninfo_identifier
=
identifier
)
\
.
first
()
else
:
# This is a RefSeq accession number.
reference
=
Reference
.
query
\
.
filter_by
(
accession
=
identifier
)
\
.
first
()
reference
=
Reference
.
query
.
filter_by
(
accession
=
accession
,
version
=
version
).
first
()
if
reference
is
None
:
# We don't know it, fetch it from NCBI.
filename
=
self
.
fetch
(
identifier
)
else
:
# We have seen it before.
filename
=
self
.
_name_to_file
(
reference
.
accession
)
if
os
.
path
.
isfile
(
filename
):
# It is still in the cache, so filename is valid.
pass
elif
reference
.
slice_accession
:
# It was previously created by slicing.
cast_orientation
=
{
None
:
None
,
'forward'
:
1
,
'reverse'
:
2
}
if
not
self
.
retrieveslice
(
reference
.
slice_accession
,
reference
.
slice_start
,
reference
.
slice_stop
,
cast_orientation
[
reference
.
slice_orientation
]):
filename
=
None
elif
reference
.
download_url
:
# It was previously created by URL.
if
not
self
.
downloadrecord
(
reference
.
download_url
):
filename
=
None
elif
reference
.
geninfo_identifier
:
# It was previously fetched from NCBI.
filename
=
self
.
fetch
(
reference
.
accession
)
else
:
# It was previously created by uploading.
self
.
_output
.
addMessage
(
__file__
,
4
,
'ERETR'
,
'Please upload this sequence again.'
)
filename
=
None
reference
=
self
.
fetch
(
accession
,
version
)
# TODO: Also fetch if the sequence is not in the cach.
#if need_sequence and not sequence_in_cache(accession, version):
# reference = self.fetch(accession, version)
# TODO: should be handled in self.fetch
# elif reference.slice_accession:
# # It was previously created by slicing.
# cast_orientation = {
# None: None, 'forward': 1, 'reverse': 2}
# if not self.retrieveslice(
# reference.slice_accession, reference.slice_start,
# reference.slice_stop,
# cast_orientation[reference.slice_orientation]):
# filename = None
# elif reference.download_url:
# # It was previously created by URL.
# if not self.downloadrecord(reference.download_url):
# filename = None
# elif reference.geninfo_identifier:
# # It was previously fetched from NCBI.
# filename = self.fetch(reference.accession)
# else:
# # It was previously created by uploading.
# self._output.addMessage(
# __file__, 4, 'ERETR', 'Please upload this sequence again.')
# filename = None
# If filename is None, we could not retrieve the record.
if
filename
is
None
:
...
...
mutalyzer/db/models.py
View file @
8fe2acfe
...
...
@@ -152,17 +152,32 @@ class Reference(db.Base):
id
=
Column
(
Integer
,
primary_key
=
True
)
#: Accession number for this reference, including the version number if
#: applicable (e.g., ``AL449423.14``, ``NM_000059.3``,
#: ``UD_138781341344``).
accession
=
Column
(
String
(
20
),
nullable
=
False
,
index
=
True
,
unique
=
True
)
#: Accession number for this reference, not including the version number
#: (e.g., ``AL449423``, ``NM_000059``, ``LRG_23``, ``UD_138781341344``).
accession
=
Column
(
String
(
20
),
nullable
=
False
)
#: Accession version (e.g., 3, 2). Not applicaple when `reference_type` is
#: ``lrg``.
version
=
Column
(
Integer
)
#: Type of reference.
reference_type
=
Column
(
Enum
(
'refseq'
,
'lrg'
,
name
=
'reference_type'
),
nullable
=
False
)
#: Enclosing organelle. Used to differentiate between references requiring
#: ``m.`` and ``g.`` descriptions.
organelle
=
Column
(
Enum
(
'nucleus'
,
'mitochondrion'
,
name
=
'organelle'
),
nullable
=
False
)
#: Type of sequence.
sequence_type
=
Column
(
Enum
(
'dna'
,
'rna'
,
'protein'
,
name
=
'sequence_type'
),
nullable
=
False
)
#: MD5 checksum of the reference file.
# TODO: keeping the checksum might be useful (e.g., to determine if we
# should update the database), but do we need the index and unique?
checksum
=
Column
(
String
(
32
),
nullable
=
False
,
index
=
True
,
unique
=
True
)
#: The corresponding GI number, if available.
geninfo_identifier
=
Column
(
String
(
13
),
index
=
True
,
unique
=
True
)
#: The accession number from which we took a slice, if available.
slice_accession
=
Column
(
String
(
20
))
...
...
@@ -185,12 +200,14 @@ class Reference(db.Base):
#: Date and time of creation.
added
=
Column
(
DateTime
)
def
__init__
(
self
,
accession
,
checksum
,
geninfo_identifier
=
Non
e
,
slice_accession
=
None
,
slice_start
=
None
,
slice_stop
=
None
,
slice_orientation
=
None
,
download_url
=
None
):
def
__init__
(
self
,
accession
,
reference_type
,
organelle
,
sequence_typ
e
,
checksum
,
slice_accession
=
None
,
slice_start
=
None
,
slice_stop
=
None
,
slice_orientation
=
None
,
download_url
=
None
):
self
.
accession
=
accession
self
.
reference_type
=
reference_type
self
.
organelle
=
organelle
self
.
sequence_type
=
sequence_type
self
.
checksum
=
checksum
self
.
geninfo_identifier
=
geninfo_identifier
self
.
slice_accession
=
slice_accession
self
.
slice_start
=
slice_start
self
.
slice_stop
=
slice_stop
...
...
@@ -202,12 +219,113 @@ class Reference(db.Base):
return
'<Reference %r>'
%
self
.
accession
Index
(
'reference_accession_version'
,
Reference
.
accession
,
Reference
.
version
,
unique
=
True
)
Index
(
'reference_slice'
,
Reference
.
slice_accession
,
Reference
.
slice_start
,
Reference
.
slice_stop
,
Reference
.
slice_orientation
,
unique
=
True
)
class
Transcript
(
db
.
Base
):
"""
Mapping of a gene transcript on a reference.
.. note:: All positions are ordered according to the chromosome,
irrespective of transcript orientation. For example, `start` is always
smaller than `stop`.
"""
__tablename__
=
'transcripts'
__table_args__
=
{
'mysql_engine'
:
'InnoDB'
,
'mysql_charset'
:
'utf8'
}
id
=
Column
(
Integer
,
primary_key
=
True
)
reference_id
=
Column
(
Integer
,
ForeignKey
(
'references.id'
,
ondelete
=
'CASCADE'
),
nullable
=
False
)
#: Gene symbol (e.g., ``DMD``, ``PSEN1``, ``TRNS1``).
gene
=
Column
(
String
(
30
),
nullable
=
False
)
#: Transcript number in reference (e.g., 1, 2).
transcript
=
Column
(
Integer
,
nullable
=
False
)
# Todo: Use constants and some utilities for conversion.
#: The orientation of the transcript on the chromosome.
orientation
=
Column
(
Enum
(
'forward'
,
'reverse'
,
name
=
'orientation'
),
nullable
=
False
)
#: The start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
start
=
Column
(
Integer
,
nullable
=
False
)
#: The stop position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
stop
=
Column
(
Integer
,
nullable
=
False
)
#: The CDS start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
cds_start
=
Column
(
Integer
)
#: The CDS stop position of the transcript on the chromosome (one-based,
#: inclusive).
cds_stop
=
Column
(
Integer
)
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_starts
=
Column
(
Positions
,
nullable
=
False
)
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_stops
=
Column
(
Positions
,
nullable
=
False
)
#: The :class:`Assembly` this chromosome is in.
reference
=
relationship
(
Reference
,
lazy
=
'joined'
,
innerjoin
=
True
,
backref
=
backref
(
'transcripts'
,
lazy
=
'dynamic'
,
cascade
=
'all, delete-orphan'
,
passive_deletes
=
True
))
def
__init__
(
self
,
reference
,
gene
,
orientation
,
start
,
stop
,
exon_starts
,
exon_stops
,
transcript
=
1
,
cds
=
None
):
self
.
reference
=
reference
self
.
gene
=
gene
self
.
orientation
=
orientation
self
.
start
=
start
self
.
stop
=
stop
self
.
exon_starts
=
exon_starts
self
.
exon_stops
=
exon_stops
self
.
transcript
=
transcript
self
.
cds
=
cds
def
__repr__
(
self
):
return
(
'<Transcript gene=%r transcript=%r>'
%
(
self
.
gene
,
self
.
transcript
))
@
property
def
coding
(
self
):
"""
Set to `True` iff the transcript is coding.
"""
return
self
.
cds_start
is
not
None
and
self
.
cds_stop
is
not
None
@
property
def
cds
(
self
):
"""
Tuple of CDS start and stop positions on the chromosome, or `None` if
the transcript is non-coding.
"""
if
self
.
coding
:
return
self
.
cds_start
,
self
.
cds_stop
return
None
@
cds
.
setter
def
cds
(
self
,
cds
):
self
.
cds_start
,
self
.
cds_stop
=
cds
or
(
None
,
None
)
class
Assembly
(
db
.
Base
):
"""
Genome assembly.
...
...
mutalyzer/parsers/genbank.py
View file @
8fe2acfe
...
...
@@ -510,6 +510,7 @@ class GBparser():
record
.
molType
=
'm'
#if
# TODO: record.source seems to be unused.
fakeGene
=
Locus
(
"001"
)
record
.
source
.
transcriptList
.
append
(
fakeGene
)
fakeGene
.
CDS
=
PList
()
...
...
mutalyzer/variantchecker.py
View file @
8fe2acfe
...
...
@@ -20,6 +20,7 @@ from __future__ import unicode_literals
from
operator
import
attrgetter
import
Bio.Seq
from
Bio.Data
import
CodonTable
from
Bio.Alphabet
import
IUPAC
from
Bio.Alphabet
import
DNAAlphabet
...
...
@@ -27,7 +28,7 @@ from Bio.Alphabet import ProteinAlphabet
from
Bio.Alphabet
import
_verify_alphabet
from
mutalyzer
import
util
from
mutalyzer.db.models
import
Assembly
from
mutalyzer.db.models
import
Assembly
,
Reference
from
mutalyzer.grammar
import
Grammar
from
mutalyzer.mutator
import
Mutator
from
mutalyzer.mapping
import
Converter
...
...
@@ -1740,7 +1741,9 @@ def check_variant(description, output):
'Indexing by protein isoform is not supported.'
)
retriever
=
Retriever
.
GenBankRetriever
(
output
)
retrieved_record
=
retriever
.
loadrecord
(
record_id
)
retrieved_record
=
record_from_database
(
record_id
)
if
not
retrieved_record
:
retrieved_record
=
retriever
.
loadrecord
(
record_id
)
if
not
retrieved_record
:
return
...
...
@@ -2014,3 +2017,28 @@ def check_variant(description, output):
_add_batch_output
(
output
)
#check_variant
def
record_from_database
(
accession
):
reference
=
Reference
.
query
.
filter_by
(
accession
=
accession
).
first
()
if
reference
is
None
:
return
record
=
GenRecord
.
Record
()
# TODO: shouldn't have to be a seq record
record
.
seq
=
Bio
.
Seq
.
Seq
(
reference
.
sequence
,
DNAAlphabet
())
record
.
id
=
accession
record
.
source_id
=
accession
try
:
record
.
source_accession
,
record
.
source_version
=
accession
.
rsplit
(
','
,
1
)
except
ValueError
:
record
.
source_accession
=
accession
record
.
source_version
=
'1'
record
.
source_gi
=
reference
.
geninfo_identifier
record
.
organism
=
'todo'
# biorecord.annotations['organism']
#record.organelle = 'mitochondrion'
record
.
molType
=
'n'
if
accession
.
startswith
(
'NM'
)
else
'g'
# TODO: m.
return
record
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment