diff --git a/.travis.yml b/.travis.yml index 771b6f7ae2c3e61e334805c0be0c81ac73b850b8..bfd46701c8be6d8c70d9746dcbdf16e77c9fcfde 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,8 @@ language: python python: - "2.7" before_install: + - sudo apt-get update -qq + - sudo apt-get install -y swig - pip install -r requirements.txt install: - pip install . diff --git a/doc/install.rst b/doc/install.rst index ab615afb40260ca9457d79d60c29cb2a222f23a2..3db1005f8040c33a16bebb9ae0d384778f66570e 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -100,7 +100,7 @@ covered here. Assuming you created and activated a virtual environment for Mutalyzer, install all required Python packages:: - $ sudo apt-get install python-dev libmysqlclient-dev libxml2-dev libxslt-dev + $ sudo apt-get install python-dev libmysqlclient-dev libxml2-dev libxslt-dev swig $ pip install -r requirements.txt Install Mutalyzer:: diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py index d81254c39aeed1febbb7b5545ae48b82e3dfc7cb..4700201b98bfb05d9ef269aa1e3b6bc578ff6144 100644 --- a/mutalyzer/describe.py +++ b/mutalyzer/describe.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - """ Prototype of a module that can generate a HGVS description of the variant(s) leading from one sequence to an other. @@ -7,6 +5,7 @@ leading from one sequence to an other. @requires: Bio.Seq """ + from __future__ import unicode_literals import collections @@ -17,151 +16,8 @@ from mutalyzer.util import longest_common_prefix, longest_common_suffix from mutalyzer.util import palinsnoop, roll, reverse_complement from mutalyzer import models +from extractor import extractor -# Maximum size of the LCS matrix -MAX_MATRIX_SIZE = 8000000 - - -class LCS(object): - """ - Class that calculates a Longest Common Substring matrix once and provides - a function to find the LCS of a submatrix. - """ - - __delim = ' ' - - def __init__(self, s1, s2, lcp, s1_end, s2_end, DNA=False): - """ - Initialise the class. - - @arg s1: A string. - @type s1: unicode - @arg s2: A string. - @type s2: unicode - @arg lcp: The length of the longest common prefix of {s1} and {s2}. - @type lcp: int - @arg s1_end: End of the substring in {s1}. - @type s1_end: - @arg s2_end: End of the substring in {s2}. - @type s2_end: int - @arg DNA: - @type DNA: bool - """ - self.__lcp = lcp - self.__s1 = s1[self.__lcp:s1_end] - self.__s2 = s2[self.__lcp:s2_end] - self.__s2_len = s2_end - lcp - self.__matrix = self.LCSMatrix(self.__s1, self.__s2) - - self.__s2_rc = None - self.__matrix_rc = None - if DNA: - self.__s2_rc = reverse_complement(s2[self.__lcp:s2_end]) - self.__matrix_rc = self.LCSMatrix(self.__s1, self.__s2_rc) - #if - #__init__ - - def __unicode__(self): - """ - Return a graphical representation of the LCS matrix, mainly for - debugging. - - @returns: A graphical representation of the LCS matrix. - @rtype: unicode - """ - return self.visMatrix((0, len(self.__s1)), (0, len(self.__s2))) - #__unicode__ - - def visMatrix(self, r1, r2, rc=False): - """ - Return a graphical representation of the LCS matrix, mainly for - debugging. - - @returns: A graphical representation of the LCS matrix. - @rtype: unicode - """ - nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp - nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp - - M = self.__matrix - s2 = self.__s2 - if rc: - M = self.__matrix_rc - s2 = self.__s2_rc - - out = self.__delim.join(self.__delim + '-' + s2[nr2[0]:nr2[1]]) + '\n' - for i in range(nr1[0], nr1[1] + 1): - out += (('-' + self.__s1)[i] + self.__delim + - self.__delim.join(map(lambda x: unicode(M[i][x]), - range(nr2[0], nr2[1] + 1))) + '\n') - - return out - #visMatrix - - def LCSMatrix(self, s1, s2): - """ - Calculate the Longest Common Substring matrix. - - @arg s1: A string. - @type s1: unicode - @arg s2: A string. - @type s2: unicode - - @returns: A matrix with the LCS of {s1}[i], {s2}[j] at position i, j. - @rval: list[list[int]] - """ - y_max = len(s1) + 1 - x_max = len(s2) + 1 - M = [[0] * x_max for i in range(y_max)] - - for x in range(1, y_max): - for y in range(1, x_max): - if s1[x - 1] == s2[y - 1]: - M[x][y] = M[x - 1][y - 1] + 1 - - return M - #LCSMatrix - - def findMax(self, r1, r2, rc=False): - """ - Find the LCS in a submatrix of {M}, given by the ranges {r1} and {r2}. - - @arg r1: A range for the first dimension of M. - @type r1: tuple(int, int) - @arg r2: A range for the second dimension of M. - @type r2: tuple(int, int) - @arg rc: Use the reverse complement matrix. - @type rc: bool - - @returns: - @rtype: tuple(int, int, int) - """ - longest, x_longest, y_longest = 0, 0, 0 - nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp - nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp - - M = self.__matrix - if rc: - M = self.__matrix_rc - nr2 = self.__s2_len - nr2[1], self.__s2_len - nr2[0] - #if - - for i in range(nr1[0], nr1[1] + 1): - x_relative = i - nr1[0] - - for j in range(nr2[0], nr2[1] + 1): - y_relative = j - nr2[0] - realVal = min(M[i][j], x_relative, y_relative) - - if realVal > longest: - longest, x_longest, y_longest = (realVal, x_relative, - y_relative) - #for - #for - - return x_longest, y_longest, longest - #findMax -#LCS def makeFSTables(table_id): """ @@ -272,7 +128,39 @@ def fitFS(peptide, altPeptide, FS): return True #fitFS -class DescribeRawVar(models.RawVar): +def findFS(peptide, altPeptide, FS): + """ + Find the longest part of {altPeptide} that fits in {peptide} in a certain + frame given by {FS}. + + @arg peptide: Original peptide sequence. + @type peptide: unicode + @arg altPeptide: Observed peptide sequence. + @type altPeptide: unicode + @arg FS: Frame shift table. + @type FS: dict + + @returns: The length and the offset in {peptide} of the largest frameshift. + @rtype: tuple(int, int) + """ + pList = __makeOverlaps(peptide) + maxFS = 0 + fsStart = 0 + + for i in range(len(pList))[::-1]: + for j in range(min(i + 1, len(altPeptide))): + if not altPeptide[::-1][j] in FS[pList[i - j]]: + break + if j >= maxFS: + maxFS = j + fsStart = i - j + 2 + #if + #for + + return maxFS - 1, fsStart +#findFS + +class RawVar(models.RawVar): """ Container for a raw variant. @@ -325,8 +213,9 @@ class DescribeRawVar(models.RawVar): self.startAA = startAA self.endAA = endAA self.term = term - self.hgvs = self.description() - self.hgvsLength = self.descriptionLength() + self.update() + #self.hgvs = self.description() + #self.hgvsLength = self.descriptionLength() #__init__ def __DNADescription(self): @@ -385,7 +274,7 @@ class DescribeRawVar(models.RawVar): descr += "%i" % self.start if self.end: descr += "_%s%i" % (seq3(self.endAA), self.end) - if self.type not in ["subst", "stop", "ext", "fs"]: + if self.type not in ["subst", "stop", "ext", "fs"]: # fs is not a type descr += self.type if self.inserted: descr += "%s" % seq3(self.inserted) @@ -393,7 +282,7 @@ class DescribeRawVar(models.RawVar): if self.type == "stop": return descr + '*' if self.term: - return descr + "%s*%i" % (self.type, self.term) + return descr + "fs*%i" % self.term return descr #__proteinDescription @@ -409,12 +298,12 @@ class DescribeRawVar(models.RawVar): variant stored in this class. @rtype: int """ - if not self.start : # `=' or `?' + if not self.start: # `=' or `?' return 1 descrLen = 1 # Start position. - if self.end : # '_' and end position. + if self.end: # '_' and end position. descrLen += 2 if self.type != "subst": @@ -461,6 +350,13 @@ class DescribeRawVar(models.RawVar): return descrLen #__proteinDescriptionLength + def update(self): + """ + """ + self.hgvs = self.description() + self.hgvsLength = self.descriptionLength() + #update + def description(self): """ """ @@ -484,7 +380,7 @@ class DescribeRawVar(models.RawVar): #descriptionLength #DescribeRawVar -def alleleDescription(allele): +def allele_description(allele): """ Convert a list of raw variants to an HGVS allele description. @@ -495,9 +391,9 @@ def alleleDescription(allele): @rval: unicode """ if len(allele) > 1: - return "[%s]" % ';'.join(map(lambda x : x.hgvs, allele)) + return "[%s]" % ';'.join(map(lambda x: x.hgvs, allele)) return allele[0].hgvs -#alleleDescription +#allele_description def alleleDescriptionLength(allele): """ @@ -510,10 +406,10 @@ def alleleDescriptionLength(allele): @rval: int """ # NOTE: Do we need to count the ; and [] ? - return sum(map(lambda x : x.hgvsLength, allele)) + return sum(map(lambda x: x.hgvsLength, allele)) #alleleDescriptionLength -def printpos(s, start, end, fill = 0): +def printpos(s, start, end, fill=0): """ For debugging purposes. """ @@ -525,313 +421,142 @@ def printpos(s, start, end, fill = 0): s[end:end + fs]) #printpos -def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): +def var2RawVar(s1, s2, var, DNA=True): """ - Give an allele description of the change from {s1} to {s2} in the range - {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. - - arg s1: Sequence 1. - type s1: unicode - arg s2: Sequence 2. - type s2: unicode - arg s1_start: Start of the range on {s1}. - type s1_start: int - arg s1_end: End of the range on {s1}. - type s1_end: int - arg s2_start: Start of the range on {s2}. - type s2_start: int - arg s2_end: End of the range on {s2}. - type s2_end: int - - @returns: A list of DescribeRawVar objects, representing the allele. - @rval: list(DescribeRawVar) """ - # TODO: Instead of copying this function and adjusting it to make it work - # for proteins, consider disabling parts like the inversion. - # TODO: Think about frameshift descriptions. - - # Nothing happened. - if s1 == s2: - return [DescribeRawVar()] + # Unknown. + if s1 == '?' or s2 == '?': + return [RawVar(DNA=DNA, type="unknown")] # Insertion / Duplication. - if s1_start == s1_end: - ins_length = s2_end - s2_start - shift5, shift3 = roll(s2, s2_start + 1, s2_end) + if var.reference_start == var.reference_end: + ins_length = var.sample_end - var.sample_start + shift5, shift3 = roll(s2, var.sample_start + 1, var.sample_end) shift = shift5 + shift3 - s1_start += shift3 - s1_end += shift3 - s2_start += shift3 - s2_end += shift3 + var.reference_start += shift3 + var.reference_end += shift3 + var.sample_start += shift3 + var.sample_end += shift3 - if s2_start - ins_length >= 0 and \ - s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end]: + if (var.sample_start - ins_length >= 0 and + s1[var.reference_start - ins_length:var.reference_start] == + s2[var.sample_start:var.sample_end]): if ins_length == 1: - return [DescribeRawVar(start=s1_start, type="dup", shift=shift)] - return [DescribeRawVar(start=s1_start - ins_length + 1, end=s1_end, - type="dup", shift=shift)] + return RawVar(DNA=DNA, start=var.reference_start, type="dup", + shift=shift) + return RawVar(DNA=DNA, start=var.reference_start - ins_length + 1, + end=var.reference_end, type="dup", shift=shift) #if - return [DescribeRawVar(start=s1_start, end=s1_start + 1, - inserted=s2[s2_start:s2_end], type="ins", shift=shift)] + return RawVar(DNA=DNA, start=var.reference_start, + end=var.reference_start + 1, + inserted=s2[var.sample_start:var.sample_end], type="ins", + shift=shift) #if # Deletion. - if s2_start == s2_end: - shift5, shift3 = roll(s1, s1_start + 1, s1_end) + if var.sample_start == var.sample_end: + shift5, shift3 = roll(s1, var.reference_start + 1, var.reference_end) shift = shift5 + shift3 - s1_start += shift3 + 1 - s1_end += shift3 + var.reference_start += shift3 + 1 + var.reference_end += shift3 - if s1_start == s1_end: - return [DescribeRawVar(start=s1_start, type="del", shift=shift)] - return [DescribeRawVar(start=s1_start, end=s1_end, type="del", shift=shift)] + if var.reference_start == var.reference_end: + return RawVar(DNA=DNA, start=var.reference_start, type="del", + shift=shift) + return RawVar(DNA=DNA, start=var.reference_start, + end=var.reference_end, type="del", shift=shift) #if # Substitution. - if s1_start + 1 == s1_end and s2_start + 1 == s2_end: - return [DescribeRawVar(start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], type="subst")] - - # Simple InDel. - if s1_start + 1 == s1_end: - return [DescribeRawVar(start=s1_start + 1, inserted=s2[s2_start:s2_end], - type="delins")] - - # TODO: Refactor the code after this point. - - # At this stage, we either have an inversion, an indel or a Compound - # variant. - s1_end_f, s2_end_f, lcs_f_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end)) - s1_end_r, s2_end_r, lcs_r_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end), rc=True) + if (var.reference_start + 1 == var.reference_end and + var.sample_start + 1 == var.sample_end): - # Palindrome snooping. - trim = palinsnoop(s1[s1_start + s1_end_r - lcs_r_len:s1_start + s1_end_r]) - if trim == -1 : # Full palindrome. - lcs_r_len = 0 # s1_end_r and s2_end_r should not be used after this. - - # Inversion or Compound variant. - default = [DescribeRawVar(start=s1_start + 1, end=s1_end, - inserted=s2[s2_start:s2_end], type="delins")] + return RawVar(DNA=DNA, start=var.reference_start + 1, + deleted=s1[var.reference_start], inserted=s2[var.sample_start], + type="subst") + #if - if not (lcs_f_len or lcs_r_len) : # Optimisation, not really needed. - return default + # Simple InDel. + if var.reference_start + 1 == var.reference_end: + return RawVar(DNA=DNA, start=var.reference_start + 1, + inserted=s2[var.sample_start:var.sample_end], type="delins") # Inversion. - if lcs_f_len <= lcs_r_len: - if trim > 0 : # Partial palindrome. - s1_end_r -= trim - s2_end_r -= trim - lcs_r_len -= 2 * trim - #if + if var.type == extractor.VARIANT_REVERSE_COMPLEMENT: + trim = palinsnoop(s1[var.reference_start:var.reference_end]) - # Simple Inversion. - if s2_end - s2_start == lcs_r_len and s1_end - s1_start == lcs_r_len: - return [DescribeRawVar(start=s1_start + 1, end=s1_end, type="inv")] - - r1_len = s1_end_r - lcs_r_len - r2_len = s1_end - s1_start - s1_end_r - m1_len = s2_end_r - lcs_r_len - m2_len = s2_end - s2_start - s2_end_r - - # The flanks of the inversion (but not both) can be empty, so we - # generate descriptions conditionally. - leftRv = [] - rightRv = [] - if r1_len or m2_len: - lcs = len(longest_common_suffix(s1[s1_start:s1_start + r1_len], - s2[s2_start:s2_start + m2_len])) - leftRv = DNA_description(M, s1, s2, - s1_start, s1_start + r1_len - lcs, - s2_start, s2_start + m2_len - lcs) - #if - if r2_len or m1_len: - lcp = len(longest_common_prefix(s1[s1_end - r2_len:s1_end], - s2[s2_end - m1_len:s2_end])) - rightRv = DNA_description(M, s1, s2, - s1_end - r2_len + lcp, s1_end, s2_end - m1_len + lcp, s2_end) + if trim > 0: # Partial palindrome. + var.reference_end -= trim + var.sample_end -= trim #if - partial = leftRv + [DescribeRawVar(start=s1_start + r1_len + 1, - end=s1_end - r2_len, type="inv")] + rightRv + return RawVar(DNA=DNA, start=var.reference_start + 1, + end=var.reference_end, type="inv") #if - # Compound variant. - else: - r1_len = s1_end_f - lcs_f_len - r2_len = s1_end - s1_start - s1_end_f - m1_len = s2_end_f - lcs_f_len - m2_len = s2_end - s2_start - s2_end_f - - partial = DNA_description(M, s1, s2, s1_start, s1_start + r1_len, - s2_start, s2_start + m1_len) + DNA_description(M, s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end) - #else - - if alleleDescriptionLength(partial) <= alleleDescriptionLength(default): - return partial - return default -#DNA_description - -def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): + # InDel. + return RawVar(DNA=DNA, start=var.reference_start + 1, + end=var.reference_end, inserted=s2[var.sample_start:var.sample_end], + type="delins") +#var2RawVar + +def describe(s1, s2, DNA=True): """ - Give an allele description of the change from {s1} to {s2} in the range - {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. + Give an allele description of the change from {s1} to {s2}. arg s1: Sequence 1. type s1: unicode arg s2: Sequence 2. type s2: unicode - arg s1_start: Start of the range on {s1}. - type s1_start: int - arg s1_end: End of the range on {s1}. - type s1_end: int - arg s2_start: Start of the range on {s2}. - type s2_start: int - arg s2_end: End of the range on {s2}. - type s2_end: int @returns: A list of DescribeRawVar objects, representing the allele. @rval: list(DescribeRawVar) """ - if s1 == '?' or s2 == '?': - return [DescribeRawVar(DNA=False, type="unknown")] - - # One of the sequences is missing. - if not (s1 and s2): - return [DescribeRawVar(DNA=False)] - - # Nothing happened. - if s1 == s2: - return [DescribeRawVar(DNA=False)] - - # Substitution. - if s1_start + 1 == s1_end and s2_start + 1 == s2_end: - return [DescribeRawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], type="subst")] - - # Insertion / Duplication / Extention. - if s1_start == s1_end: - if len(s1) == s1_start + 1: - return [DescribeRawVar(DNA=False, start=s1_start + 1, - inserted=s2[s2_start], term=abs(len(s1) - len(s2)), - type="ext")] - ins_length = s2_end - s2_start - shift5, shift3 = roll(s2, s2_start + 1, s2_end) - shift = shift5 + shift3 - - s1_start += shift3 - s1_end += shift3 - s2_start += shift3 - s2_end += shift3 + description = [] - if s2_start - ins_length >= 0 and \ - s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end]: - - if ins_length == 1: - return [DescribeRawVar(DNA=False, start=s1_start, - startAA=s1[s1_start - 1], type="dup", shift=shift)] - return [DescribeRawVar(DNA=False, start=s1_start - ins_length + 1, - startAA=s1[s1_start - ins_length], end=s1_end, - endAA=s1[s1_end - 1], type="dup", shift=shift)] + if not DNA: + FS1, FS2 = makeFSTables(1) + longestFSf = max(findFS(s1, s2, FS1), findFS(s1, s2, FS2)) + longestFSr = max(findFS(s2, s1, FS1), findFS(s2, s1, FS2)) + + if longestFSf > longestFSr: + print s1[:longestFSf[1]], s1[longestFSf[1]:] + print s2[:len(s2) - longestFSf[0]], s2[len(s2) - longestFSf[0]:] + s1_part = s1[:longestFSf[1]] + s2_part = s2[:len(s2) - longestFSf[0]] + term = longestFSf[0] + #if + else: + print s1[:len(s1) - longestFSr[0]], s1[len(s1) - longestFSr[0]:] + print s2[:longestFSr[1]], s2[longestFSr[1]:] + s1_part = s1[:len(s1) - longestFSr[0]] + s2_part = s2[:longestFSr[1]] + term = len(s2) - longestFSr[1] + #else + + s1_part = s1 + s2_part = s2 + for variant in extractor.extract(unicode(s1_part), len(s1_part), + unicode(s2_part), len(s2_part), 1): + description.append(var2RawVar(s1, s2, variant, DNA=DNA)) + + if description: + description[-1].term = term + 2 + description[-1].update() #if - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - end=s1_start + 1, endAA=s1[s1_end], inserted=s2[s2_start:s2_end], - type="ins", shift=shift)] - #if - - # Deletion / Inframe stop. - if s2_start == s2_end: - if len(s2) == s2_end + 1: - return [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - type="stop")] - shift5, shift3 = roll(s1, s1_start + 1, s1_end) - shift = shift5 + shift3 - - s1_start += shift3 + 1 - s1_end += shift3 - - if s1_start == s1_end: - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - type="del", shift=shift)] - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - end=s1_end, endAA=s1[s1_end - 1], type="del", shift=shift)] - #if - - # Simple InDel. - if s1_start + 1 == s1_end: - return [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - inserted=s2[s2_start:s2_end], type="delins")] - - # Frameshift. - if len(s1) == s1_end + 1 and len(s2) == s2_end + 1: - # TODO: the FS tables should be made for all coding tables in advance. - FS1, FS2 = makeFSTables(1) # Standard coding table. - - if (fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS1) or - fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS2) or - fitFS(s2[s2_start + 1:], s1[s1_start + 2:], FS1) or - fitFS(s2[s2_start + 1:], s1[s1_start + 2:], FS2)): - return [DescribeRawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], term=len(s2) - s2_start, type="fs")] #if + else: + for variant in extractor.extract(unicode(s1), len(s1), unicode(s2), len(s2), + 0): + if variant.type != extractor.VARIANT_IDENTITY: + description.append(var2RawVar(s1, s2, variant, DNA=DNA)) - # At this point, we have an indel. - s1_end_f, s2_end_f, lcs_f_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end)) - - # InDel. - default = [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - end=s1_end, endAA=s1[s1_end], inserted=s2[s2_start:s2_end], - type="delins")] - - if not lcs_f_len : # Optimisation, not really needed. - return default - - r1_len = s1_end_f - lcs_f_len - r2_len = s1_end - s1_start - s1_end_f - m1_len = s2_end_f - lcs_f_len - m2_len = s2_end - s2_start - s2_end_f - - partial = protein_description(M, s1, s2, s1_start, s1_start + r1_len, - s2_start, s2_start + m1_len) + protein_description(M, s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end) - - if alleleDescriptionLength(partial) <= alleleDescriptionLength(default): - return partial - return default -#protein_description - -def describe(original, mutated, DNA=True): - """ - Convenience function for DNA_description(). - - @arg original: - @type original: unicode - @arg mutated: - @type mutated: unicode - - @returns: A list of DescribeRawVar objects, representing the allele. - @rval: list(DescribeRawVar) - """ - s1 = original - s2 = mutated - lcp = len(longest_common_prefix(s1, s2)) - lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) - s1_end = len(s1) - lcs - s2_end = len(s2) - lcs - - if (s1_end - lcp) * (s2_end - lcp) > MAX_MATRIX_SIZE: - return + # Nothing happened. + if not description: + return [RawVar(DNA=DNA)] - if not DNA: - M = LCS(s1, s2, lcp, s1_end, s2_end) - return protein_description(M, s1, s2, lcp, s1_end, lcp, s2_end) - #if - M = LCS(s1, s2, lcp, s1_end, s2_end, DNA=True) - return DNA_description(M, s1, s2, lcp, s1_end, lcp, s2_end) + return description #describe diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py index 6717161d1d4795c923f70cfa6846358ace2972c8..b9af312f6a3346c00be98cccbbb1bd1192faff38 100644 --- a/mutalyzer/entrypoints/mutalyzer.py +++ b/mutalyzer/entrypoints/mutalyzer.py @@ -93,21 +93,24 @@ def check_name(description): for i in O.getOutput("legends") : print i - allele = describe.describe(O.getIndexedOutput("original", 0), + extracted_allele = describe.describe( + O.getIndexedOutput("original", 0), O.getIndexedOutput("mutated", 0)) - prot_allele = describe.describe(O.getIndexedOutput("oldprotein", 0), - O.getIndexedOutput("newprotein", 0, default=""), DNA=False) + extracted_protein_allele = describe.describe( + O.getIndexedOutput("oldprotein", 0), + O.getIndexedOutput("newprotein", 0, default=""), + DNA=False) - extracted = extractedProt = '(skipped)' + extracted = extracted_protein = '(skipped)' - if allele: - extracted = describe.alleleDescription(allele) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + if extracted_allele: + extracted = describe.allele_description(extracted_allele) + if extracted_protein_allele: + extracted_protein = describe.allele_description(extracted_protein_allele) print "\nExperimental services:" print extracted - print extractedProt + print extracted_protein #print "+++ %s" % O.getOutput("myTranscriptDescription") diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 7e57d76be8a7bb5310f905b4265996d07ab37d5a..c296ee6dba0303616e724fa3bafc5b51aefe256a 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -1241,7 +1241,7 @@ class MutalyzerService(ServiceBase): result = Allele() result.allele = describe.describe(reference, observed) - result.description = describe.alleleDescription(result.allele) + result.description = describe.allele_description(result.allele) output.addMessage(__file__, -1, 'INFO', 'Finished processing descriptionExtract') diff --git a/mutalyzer/website/templates/description-extractor.html b/mutalyzer/website/templates/description-extractor.html index 2f2cb17d774dfee13a1f228a04deb27f8eac7a01..d1d8917a4b82ff92d7145e05a2248ecbd0b6ab6b 100644 --- a/mutalyzer/website/templates/description-extractor.html +++ b/mutalyzer/website/templates/description-extractor.html @@ -94,7 +94,7 @@ Please supply a reference sequence and an observed sequence. <td>{{ raw_var.deleted }}</td> <td>{{ raw_var.inserted }}</td> <td>{{ raw_var.shift }}</td> - <td>{{ raw_var.hgvs }}</td> + <td>{{ raw_var }}</td> </tr> {% endfor %} </tbody> diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 992e84a502ba6d46c310f65f0e50b9611b6ac412..c1d761c4b00d76812cf607a18b30fcca9bd27d19 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -274,15 +274,9 @@ def name_checker(): allele = describe.describe(output.getIndexedOutput('original', 0), output.getIndexedOutput('mutated', 0)) if allele: - extracted = describe.alleleDescription(allele) - - if output.getIndexedOutput('oldprotein', 0): - prot_allele = describe.describe( - output.getIndexedOutput('oldprotein', 0), - output.getIndexedOutput('newprotein', 0, default=''), - DNA=False) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + extracted = describe.allele_description(allele) + if prot_allele: + extractedProt = describe.allele_description(prot_allele) else: extracted = extractedProt = '' @@ -702,7 +696,7 @@ def description_extractor(): 'Variant sequence is not DNA.') raw_vars = describe.describe(reference_sequence, variant_sequence) - description = describe.alleleDescription(raw_vars) + description = describe.allele_description(raw_vars) errors, warnings, summary = output.Summary() messages = map(util.message_info, output.getMessages()) diff --git a/requirements.txt b/requirements.txt index 99065b96b5b3b80ecdffc6fec3d1c9ed1682dd9c..c825efb1b43e59ad7929bb97d0541d10ea1d7410 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -e git+https://github.com/LUMC/magic-python.git#egg=Magic_file_extensions +-e git+https://git.lumc.nl/mutalyzer/extractor.git@3148fad6b081c36e9675494f4896ba60f14707a9#egg=extractor Flask==0.10.1 Jinja2==2.7.3 MySQL-python==1.2.5 diff --git a/tests/test_describe.py b/tests/test_describe.py index e81c7ce45bf6dbb5776326d75e3f7f410179db6d..76ef43b9874fce37bcb7baec04a76c5cc68d2c71 100644 --- a/tests/test_describe.py +++ b/tests/test_describe.py @@ -25,7 +25,7 @@ class TestDescribe(MutalyzerTest): result = describe.describe( 'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA', 'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA') - description = describe.alleleDescription(result) + description = describe.allele_description(result) assert description == '[5_6insTT;17del;26A>C;35dup]' def test2(self): @@ -35,7 +35,7 @@ class TestDescribe(MutalyzerTest): result = describe.describe( 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTACTGTG', 'TAAGCACCAGGAGTCCATGAAGAAGCTGGATCCTCCCATGGAATCCCCTACTCTACTGTG') - description = describe.alleleDescription(result) + description = describe.allele_description(result) assert description == '[26A>C;30C>A;35G>C]' def test3(self): @@ -45,7 +45,7 @@ class TestDescribe(MutalyzerTest): result = describe.describe( 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA', 'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGGAATCCCCTACTCTA') - description = describe.alleleDescription(result) + description = describe.allele_description(result) assert description == '[26_29inv;30C>G]' def test4(self): @@ -55,5 +55,5 @@ class TestDescribe(MutalyzerTest): result = describe.describe( 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA', 'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGAATCCCCTACTCTA') - description = describe.alleleDescription(result) + description = describe.allele_description(result) assert description == '[26_29inv;30C>G;41del]'