Commit 61049722 authored by Vermaat
Browse files

Merge refactor-mutalyzer-branch r301 through r329.

git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/branches/namechecker-pdf-branch@330 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
parents 6f3696ff 7bc44ba1
......@@ -107,7 +107,8 @@ Alternatively, if you want to have a development environment, use:
sudo python setup.py develop
The development environment uses symlinks to this source directory, so you can
develop directly from here.
develop directly from here. This command should be re-issued whenever the
version number of Mutalyzer is updated.
Setup Mutalyzer
......
......@@ -73,7 +73,7 @@ Todo list:
- Check for os.path.join vulnerabilities.
- Use web.config.debug=False on production server and perhaps put this in
the configuration file.
- Add database indices to extras/post-install.sh script.
- Solution for database schema migration on version updates.
Code style guide:
- Follow PEP 8 (code) and PEP 257 (docstrings).
......
......@@ -291,7 +291,8 @@ CREATE TABLE BatchQueue (
JobID char(20) NOT NULL,
Input char(255) NOT NULL,
Flags char(20) DEFAULT NULL,
PRIMARY KEY (QueueID)
PRIMARY KEY (QueueID),
KEY JobQueue (JobID,QueueID)
);
CREATE TABLE GBInfo (
AccNo char(20) NOT NULL DEFAULT '',
......
......@@ -435,7 +435,7 @@ class Crossmap() :
return int(s)
#main2int
def int2offset(self, t) :
def int2offset(self, t, fuzzy=False):
"""
Convert a tuple of integers to offset-notation. This adds a `+',
and `u' or `d' to the offset when appropriate. The main value is
......@@ -443,17 +443,22 @@ class Crossmap() :
@arg t: A tuple of integers: (main, offset) in __STOP notation
@type t: tuple
@kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is
unknown).
@type fuzzy: bool
@return: The offset in HGVS notation
@rtype: string
"""
if t[1] > 0 : # The exon boundary is downstream.
if fuzzy: return '+?'
if t[0] >= self.__trans_end : # It is downstream of the last exon.
return "+d" + str(t[1])
return '+' + str(t[1])
#if
if t[1] < 0 : # The exon boundary is upstream.
if fuzzy: return '-?'
if t[0] <= self.__trans_start : # It is upstream of the first exon.
return "-u" + str(-t[1])
return str(t[1])
......@@ -490,32 +495,38 @@ class Crossmap() :
return int(s[1:])
#offset2int
def tuple2string(self, t) :
def tuple2string(self, t, fuzzy=False) :
"""
Convert a tuple (main, offset) in __STOP notation to I{c.} notation.
@arg t: A tuple (main, offset) in __STOP notation
@type t: tuple
@kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is
unknown).
@type fuzzy: bool
@return: The position in HGVS notation
@rtype: string
"""
return str(self.int2main(t[0])) + str(self.int2offset(t))
return str(self.int2main(t[0])) + str(self.int2offset(t, fuzzy))
#tuple2string
def g2c(self, a) :
def g2c(self, a, fuzzy=False) :
"""
Uses both g2x() and tuple2string() to translate a genomic position
via __STOP notation to I{c.} notation.
@arg a: The genomic position that must be translated
@type a: integer
@kwarg fuzzy: Denotes that the coordinate is fuzzy (i.e. offset is
unknown).
@type fuzzy: bool
@return: The position in HGVS notation
@rtype: string
"""
return self.tuple2string(self.g2x(a))
return self.tuple2string(self.g2x(a), fuzzy)
#g2c
def info(self) :
......
......@@ -110,21 +110,37 @@ class Locus(object) :
self.proteinProduct = None
#__init__
def addToDescription(self, rawVariant) :
def cancelDescription(self):
"""
Set the description on this locus to 'unknown'.
This can be used if at some point we give up creating a sensible
description on this locus. It also makes sure future additions to
the description are ignored and it keeps the 'unknown' value.
@note: This depends on the check for the unknown value in the
addToDescription method. This is not a beautiful solution.
"""
self.description = '?'
#cancelDescription
def addToDescription(self, rawVariant):
"""
Expands the DNA description with a new raw variant.
@arg rawVariant: description of a single mutation
@type rawVariant: string
"""
if self.description:
# Don't change anything if we already have an unknown value.
if self.description != '?':
self.description = "%s;%s" % (self.description, rawVariant)
else :
else:
self.description = rawVariant
#addToDescription
#Locus
class Gene(object) :
"""
A Gene object, to store a list of Locus objects and the orientation of
......@@ -602,7 +618,8 @@ class GenRecord() :
return None
#current_transcript
def name(self, start_g, stop_g, varType, arg1, arg2, roll, arg1_reverse=None):
def name(self, start_g, stop_g, varType, arg1, arg2, roll, arg1_reverse=None,
start_fuzzy=False, stop_fuzzy=False):
"""
Generate variant descriptions for all genes, transcripts, etc.
......@@ -620,6 +637,10 @@ class GenRecord() :
@type roll: tuple (integer, integer)
@kwarg arg1_reverse: argument 1 to be used on reverse strand
@type arg1_reverse: string
@kwarg start_fuzzy: Indicates if start position of variant is fuzzy.
@type start_fuzzy: bool
@kwarg stop_fuzzy: Indicates if stop position of variant is fuzzy.
@type stop_fuzzy: bool
"""
forwardStart = start_g
forwardStop = stop_g
......@@ -634,21 +655,62 @@ class GenRecord() :
if varType != "subst" :
if forwardStart != forwardStop :
self.record.addToDescription("%s_%s%s%s" % (forwardStart,
forwardStop, varType, arg1))
# Todo: Fuzzy offsets to genomic positions (see bug #38).
#
# The genomic positioning is problematic. We would like to
# have it in brackets (as fuzzy positions), like the above
# g.(34299_23232)del example.
#
# Now consider a variant c.a-?_b+18del where only the offset
# before the exon is unknown but the offset after the exon is
# exact. Now a genomic description like g.(34299)_23232del
# comes to mind, however, this notation is not allowed by the
# HGVS grammar.
#
# I think all we can do is to treat both positions as fuzzy in
# the genomic description, even if only one of them really is.
#
# Peter thinks the HGVS grammar should at some point be
# updated to allow the brackets around individual locations.
if start_fuzzy or stop_fuzzy:
self.record.addToDescription("(%s_%s)%s%s" % (
forwardStart, forwardStop, varType, arg1))
self.record.addToChromDescription("(%s_%s)%s%s" % (
self.record.toChromPos(forwardStart),
self.record.toChromPos(forwardStop), varType, arg1))
else:
self.record.addToDescription("%s_%s%s%s" % (
forwardStart, forwardStop, varType, arg1))
self.record.addToChromDescription("%s_%s%s%s" % (
self.record.toChromPos(forwardStart),
self.record.toChromPos(forwardStop), varType, arg1))
#if
else :
self.record.addToDescription("%s%s%s" % (forwardStart, varType,
arg1))
if start_fuzzy or stop_fuzzy:
# Todo: Current HGVS does not allow for () around single
# positions, only around ranges (see above and #38).
self.record.addToDescription("(%s)%s%s" % (
forwardStart, varType, arg1))
self.record.addToChromDescription("(%s)%s%s" % (
self.record.toChromPos(forwardStart), varType, arg1))
else:
self.record.addToDescription("%s%s%s" % (
forwardStart, varType, arg1))
self.record.addToChromDescription("%s%s%s" % (
self.record.toChromPos(forwardStart), varType, arg1))
#else
#if
else :
self.record.addToDescription("%s%c>%c" % (forwardStart, arg1, arg2))
if start_fuzzy or stop_fuzzy:
# Todo: Current HGVS does not allow for () around single
# positions, only around ranges (see above and #38).
self.record.addToDescription("(%s)%c>%c" % (
forwardStart, arg1, arg2))
self.record.addToChromDescription("(%s)%c>%c" % (
self.record.toChromPos(forwardStart), arg1, arg2))
else:
self.record.addToDescription("%s%c>%c" % (
forwardStart, arg1, arg2))
self.record.addToChromDescription("%s%c>%c" % (
self.record.toChromPos(forwardStart), arg1, arg2))
......@@ -683,21 +745,42 @@ class GenRecord() :
if varType != "subst" :
if orientedStart != orientedStop :
if (start_fuzzy or stop_fuzzy) and not j.current:
# Don't generate descriptions on transcripts
# other than the current in the case of fuzzy
# positions.
j.cancelDescription()
else:
j.addToDescription("%s_%s%s%s" % (
j.CM.g2c(orientedStart), j.CM.g2c(orientedStop),
j.CM.g2c(orientedStart, start_fuzzy),
j.CM.g2c(orientedStop, stop_fuzzy),
varType, self.__maybeInvert(i, arg1, arg1_reverse)))
self.checkIntron(i, j, orientedStart)
self.checkIntron(i, j, orientedStop)
#if
else :
if start_fuzzy and not j.current:
# Don't generate descriptions on transcripts
# other than the current in the case of fuzzy
# positions.
j.cancelDescription()
else:
j.addToDescription("%s%s%s" % (
j.CM.g2c(orientedStart), varType,
j.CM.g2c(orientedStart, start_fuzzy),
varType,
self.__maybeInvert(i, arg1, arg1_reverse)))
self.checkIntron(i, j, orientedStart)
#else
#if
else :
j.addToDescription("%s%c>%c" % (j.CM.g2c(orientedStart),
if start_fuzzy and not j.current:
# Don't generate descriptions on transcripts
# other than the current in the case of fuzzy
# positions.
j.cancelDescription()
else:
j.addToDescription("%s%c>%c" % (
j.CM.g2c(orientedStart, start_fuzzy),
self.__maybeInvert(i, arg1, arg1_reverse),
self.__maybeInvert(i, arg2)))
self.checkIntron(i, j, orientedStart)
......
......@@ -20,8 +20,8 @@ import os
RELEASE = False
__version_info__ = ('2', '0', 'beta-10', 'dev')
__date__ = '27 Jun 2011'
__version_info__ = ('2', '0', 'beta-11', 'dev')
__date__ = '21 Jul 2011'
__version__ = '.'.join(__version_info__)
......
......@@ -138,14 +138,37 @@ class Config():
# We don't remove these after the tests, since they might be
# useful for debugging.
if mutalyzer.is_test():
handle, filename = tempfile.mkstemp(suffix='.log',
prefix='mutalyzer-tests-')
os.close(handle)
self.Output.log = filename
dirname = tempfile.mkdtemp(suffix='.cache',
prefix='mutalyzer-tests-')
self.Retriever.cache = dirname
self.Scheduler.resultsDir = dirname
# Todo:
#
# This needs some refactoring. The problem with the temporary
# file and dir names is that they will not be used by the
# (running) batch daemon, which will thus save its results to
# the 'normal' directory.
# Furthermore, subsequent web requests from a unit test will
# use different configuration instantiations, so might not
# see results from previous requests.
#
# We need a more robust solution for different configurations,
# depending on the running user/setting (e.g. unit tests).
#
# Idea: Don't create a local instance of the website in the
# unit tests, but only use running instances of all servers
# (website, webservice, batch daemon). They will use their
# own 'normal' configuration.
# All other parts of the unit tests will use temporary test
# configuration values. We might even decorate the tests
# needing server access as such and provide the option of
# skipping these.
#handle, filename = tempfile.mkstemp(suffix='.log',
# prefix='mutalyzer-tests-')
#os.close(handle)
#self.Output.log = filename
#dirname = tempfile.mkdtemp(suffix='.cache',
# prefix='mutalyzer-tests-')
#self.Retriever.cache = dirname
#self.Scheduler.resultsDir = dirname
pass
except KeyError as e:
raise ConfigurationError('Missing configuration value: %s' % e)
......
......@@ -95,9 +95,10 @@
<b>Affected transcripts:</b><br>
<br>
<tt tal:repeat = "i descriptions">
<a tal:content = "i/0"
<a tal:condition = "i/1" tal:content = "i/0"
tal:attributes =
"href string:checkForward?mutationName=${i/1}"></a><br>
"href string:checkForward?mutationName=${i/1}"></a><tal
tal:condition = "not:i/1" tal:replace = "i/0"></tal><br>
</tt>
<br>
<br>
......
......@@ -468,7 +468,7 @@
<td colspan="2">
<a id="page_external_oldmut"
onclick="swapActive('external_oldmut');"
href="http://www.mutalyzer.nl/1.0.4_old/"
href="http://132.229.137.14/1.0.4_old/"
class="vertnavsub">Mutalyzer 1.0.4</a>
</td>
</tr>
......
......@@ -36,6 +36,7 @@ class _NotDNAError(_RawVariantError): pass
class _PositionsNotConsecutiveError(_RawVariantError): pass
class _LengthMismatchError(_RawVariantError): pass
class _ReferenceMismatchError(_RawVariantError): pass
class _RangeInsertionError(_RawVariantError): pass
class _OffsetSignError(_RawVariantError):
def __init__(self, main, offset, acceptor):
self.main = main
......@@ -136,8 +137,8 @@ def _check_argument(argument, reference, first, last, output):
@raise _LengthMismatchError: The argument is a length, but it does not
match the given range length.
@raise NotDNAError: The argument should be DNA, but it is not.
@raise ReferenceMismatchError: The argument is DNA, but it does not
@raise _NotDNAError: The argument should be DNA, but it is not.
@raise _ReferenceMismatchError: The argument is DNA, but it does not
match the given reference.
"""
if not argument:
......@@ -301,7 +302,8 @@ def apply_substitution(position, original, substitute, mutator, record, O):
#apply_substitution
def apply_deletion_duplication(first, last, type, mutator, record, O):
def apply_deletion_duplication(first, last, type, mutator, record, O,
first_fuzzy=False, last_fuzzy=False):
"""
Do a semantic check for a deletion or duplication, do the actual
deletion/duplication and give it a name.
......@@ -318,6 +320,13 @@ def apply_deletion_duplication(first, last, type, mutator, record, O):
@type record: Modules.GenRecord.GenRecord
@arg O: The Output object.
@type O: Modules.Output.Output
@kwarg first_fuzzy: Denotes that the start position is fuzzy (e.g. in the
case of an unknown offset in c. notation).
@type first_fuzzy: bool
@kwarg last_fuzzy: Denotes that the end position is fuzzy (e.g. in the
case of an unknown offset in c. notation).
@type last_fuzzy: bool
"""
reverse_roll, forward_roll = util.roll(mutator.orig, first, last)
......@@ -327,7 +336,7 @@ def apply_deletion_duplication(first, last, type, mutator, record, O):
# We only have to consider the forward roll, since RNA reference
# sequences are always orientated in correspondence with the transcript.
original_forward_roll = forward_roll
if record.record.molType == 'n':
if record.record.molType != 'g':
# Todo: Do we assume .geneList[0].transcriptList[0] is the selected
# transcript here?? Why not use record.current_transcript?
splice_sites = record.record.geneList[0].transcriptList[0] \
......@@ -390,7 +399,9 @@ def apply_deletion_duplication(first, last, type, mutator, record, O):
else:
mutator.dupM(first, last)
record.name(first, last, type, '', '', (reverse_roll, forward_roll))
record.name(first, last, type, '', '', (reverse_roll, forward_roll),
start_fuzzy=first_fuzzy,
stop_fuzzy=last_fuzzy)
#apply_deletion_duplication
......@@ -498,7 +509,7 @@ def apply_insertion(before, after, s, mutator, record, O):
# We only have to consider the forward roll, since RNA reference
# sequences are always orientated in correspondence with the transcript.
original_forward_roll = forward_roll
if record.record.molType == 'n' :
if record.record.molType != 'g' :
splice_sites = record.record.geneList[0].transcriptList[0] \
.mRNA.positionList
for acceptor, donor in util.grouper(splice_sites):
......@@ -643,20 +654,70 @@ def apply_delins(first, last, delete, insert, mutator, record, output):
#apply_delins
def _get_offset(location):
def _get_offset(location, main_genomic, sites, output):
"""
Convert the offset coordinate in a location (from the Parser) to an
integer.
@arg location: A location.
@type location: pyparsing.ParseResults
@arg main_genomic: Genomic main position to which the offset belongs.
@type main_genomic: int
@arg sites: List of splice sites.
@type sites: list
@arg output: The Output object.
@type output: Modules.Output.Output
@return: Integer representation of the offset coordinate.
@rtype: int
"""
if location.Offset :
if location.Offset == '?' : # This is highly debatable.
return 0
if location.Offset == '?' :
try:
# Todo: If it removes CDS start, don't do protein translation.
# Todo: Wrt orientation, perhaps always go to splice site
# locations via the crossmapper...
# Todo: Also check if +? and -? are correctly used.
# Todo: Exactly centering might not be so nice, since the center
# might be closer to a neighbouring exon, making a+xxx from b-?
# and vice versa. This might not be fixed directly by doing a
# center +/- 1 because there might be rolling. Ideally we
# disable rolling entirely for these positions...
#
# Note that the code below might be a bit confusing, especially
# considering reverse strand transcripts. Magically, it works
# for both orientations.
i = sites.index(main_genomic)
if i == 0:
# Before first exon (or last on the reverse strand).
offset = main_genomic / 2
elif i == len(sites) - 1:
# After last exon (or first on the reverse strand).
# Todo: Get length of reference, and calculate a sensible
# offset.
#
# We now use that 2000 is the default downstream length,
# but of course this is bogus on the reverse strand and
# just a hack anyway.
offset = 1000
elif i % 2 == 0:
# Acceptor site (or donor on the reverse strand).
offset = abs(main_genomic - sites[i - 1]) / 2 - 1
else:
# Donor site (or acceptor on the reverse strand).
offset = abs(sites[i + 1] - main_genomic) / 2 - 1
# Todo: We would like to use the c. position in this message.
output.addMessage(__file__, 1, "IUNKNOWNOFFSET", "Unknown offset " \
"relative to %s interpreted as middle of " \
"flanking intron." % main_genomic)
except ValueError:
# Todo: This means we don't get an error if the main position
# was not on an exon boundary. We should return something else
# than 0 I guess.
#return 0 # This is highly debatable.
# Any non-zero value will do.
return 1
else:
offset = int(location.Offset)
if location.OffSgn == '-' :
return -offset
......@@ -760,7 +821,7 @@ def _genomic_to_genomic(first_location, last_location):
return first, last
def _coding_to_genomic(first_location, last_location, transcript):
def _coding_to_genomic(first_location, last_location, transcript, output):
"""
Get genomic range from parsed c. location.
......@@ -770,6 +831,8 @@ def _coding_to_genomic(first_location, last_location, transcript):
@type last_location: pyparsing.ParseResults
@arg transcript: todo
@type transcript: todo
@arg output: The Output object.
@type output: Modules.Output.Output
@return: A tuple of:
- first: Genomic start location represented by given location.
......@@ -791,11 +854,15 @@ def _coding_to_genomic(first_location, last_location, transcript):
first_main = transcript.CM.main2int(first_location.MainSgn + \
first_location.Main)
first_offset = _get_offset(first_location)
first_main_genomic = transcript.CM.x2g(first_main, 0)
first_offset = _get_offset(first_location, first_main_genomic,
transcript.CM.RNA, output)
last_main = transcript.CM.main2int(last_location.MainSgn + \
last_location.Main)
last_offset = _get_offset(last_location)
last_main_genomic = transcript.CM.x2g(last_main, 0)
last_offset = _get_offset(last_location, last_main_genomic,
transcript.CM.RNA, output)
# These raise _RawVariantError exceptions on invalid positions.
_check_intronic_position(first_main, first_offset, transcript)
......@@ -911,7 +978,8 @@ def process_raw_variant(mutator, variant, record, transcript, output):
try:
if transcript:
# Coding positioning.
first, last = _coding_to_genomic(first_location, last_location, transcript)
first, last = _coding_to_genomic(first_location, last_location,
transcript, output)
else:
# Genomic positioning.
first, last = _genomic_to_genomic(first_location, last_location)
......@@ -978,9 +1046,32 @@ def process_raw_variant(mutator, variant, record, transcript, output):
if transcript and variant.MutationType == 'del':
removed_sites = []
for acceptor, donor in util.grouper(transcript.CM.RNA):
if first <= acceptor <= last + 1:
# If we have introns, we match splice sites in a fuzzy way. This
# means that in the case of
#
# a b
# ===========------------=============
#
# with splice sites a and b, a deletion a+1_b-1 of the entire
# intron gets treated as a deletion of both splice sites.
#
# We don't want this behaviour on e.g. RNA, where we only have
# exons. In the case of
#
# a b c d
# ========== ============= ===========
#
# with splice sites a b c d, a deletion b_c of the middle exon
# should only remove splice sites b and c, not a and d.
if record.record.molType == 'g':
fuzzy = 1
else:
fuzzy = 0
if first <= acceptor <= last + fuzzy:
removed_sites.append(acceptor)
if first - 1 <= donor <= last:
if first - fuzzy <= donor <= last:
removed_sites.append(donor)
if len(removed_sites) and not len(removed_sites) % 2:
......@@ -1004,6 +1095,8 @@ def process_raw_variant(mutator, variant, record, transcript, output):
output.addMessage(__file__, 1, 'IDELSPLICE',
'Removed %i splice sites from selected ' \
'transcript.' % len(removed_sites))
# This is primarily for use in unittests.
output.addOutput('removedSpliceSites', len(removed_sites))
# If splice_abort is set, this basically means WOVERSPLICE was called and
# IDELSPLICE was not called.
......@@ -1028,8 +1121,12 @@ def process_raw_variant(mutator, variant, record, transcript, output):
# Deletion or duplication.
if variant.MutationType in ['del', 'dup']:
# The fuzzy flags are to support deletions of the form c.a-?_b+?del.
first_fuzzy = variant.StartLoc.PtLoc.Offset == '?'
last_fuzzy = variant.EndLoc and variant.EndLoc.PtLoc.Offset == '?'
apply_deletion_duplication(first, last, variant.MutationType, mutator,
record, output)
record, output, first_fuzzy=first_fuzzy,
last_fuzzy=last_fuzzy)
# Inversion.
if variant.MutationType == 'inv':
......@@ -1037,10 +1134,22 @@ def process_raw_variant(mutator, variant, record, transcript, output):
# Insertion.
if variant.MutationType == 'ins':
# Check if the inserted sequence is not a range.
# Todo: Implement this feature.
if not argument:
output.addMessage(__file__, 4, 'ENOTIMPLEMENTED',
'Insertion of a range is not implemented yet.')
raise _RangeInsertionError()
apply_insertion(first, last, argument, mutator, record, output)