diff --git a/.travis.yml b/.travis.yml index 771b6f7ae2c3e61e334805c0be0c81ac73b850b8..bfd46701c8be6d8c70d9746dcbdf16e77c9fcfde 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,8 @@ language: python python: - "2.7" before_install: + - sudo apt-get update -qq + - sudo apt-get install -y swig - pip install -r requirements.txt install: - pip install . diff --git a/doc/install.rst b/doc/install.rst index ab615afb40260ca9457d79d60c29cb2a222f23a2..3db1005f8040c33a16bebb9ae0d384778f66570e 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -100,7 +100,7 @@ covered here. Assuming you created and activated a virtual environment for Mutalyzer, install all required Python packages:: - $ sudo apt-get install python-dev libmysqlclient-dev libxml2-dev libxslt-dev + $ sudo apt-get install python-dev libmysqlclient-dev libxml2-dev libxslt-dev swig $ pip install -r requirements.txt Install Mutalyzer:: diff --git a/mutalyzer/describe.py b/mutalyzer/describe.py deleted file mode 100644 index d81254c39aeed1febbb7b5545ae48b82e3dfc7cb..0000000000000000000000000000000000000000 --- a/mutalyzer/describe.py +++ /dev/null @@ -1,837 +0,0 @@ -#!/usr/bin/python - -""" -Prototype of a module that can generate a HGVS description of the variant(s) -leading from one sequence to an other. - -@requires: Bio.Seq -""" - -from __future__ import unicode_literals - -import collections -from Bio.SeqUtils import seq3 -from Bio.Data import CodonTable - -from mutalyzer.util import longest_common_prefix, longest_common_suffix -from mutalyzer.util import palinsnoop, roll, reverse_complement -from mutalyzer import models - - -# Maximum size of the LCS matrix -MAX_MATRIX_SIZE = 8000000 - - -class LCS(object): - """ - Class that calculates a Longest Common Substring matrix once and provides - a function to find the LCS of a submatrix. - """ - - __delim = ' ' - - def __init__(self, s1, s2, lcp, s1_end, s2_end, DNA=False): - """ - Initialise the class. - - @arg s1: A string. - @type s1: unicode - @arg s2: A string. - @type s2: unicode - @arg lcp: The length of the longest common prefix of {s1} and {s2}. - @type lcp: int - @arg s1_end: End of the substring in {s1}. - @type s1_end: - @arg s2_end: End of the substring in {s2}. - @type s2_end: int - @arg DNA: - @type DNA: bool - """ - self.__lcp = lcp - self.__s1 = s1[self.__lcp:s1_end] - self.__s2 = s2[self.__lcp:s2_end] - self.__s2_len = s2_end - lcp - self.__matrix = self.LCSMatrix(self.__s1, self.__s2) - - self.__s2_rc = None - self.__matrix_rc = None - if DNA: - self.__s2_rc = reverse_complement(s2[self.__lcp:s2_end]) - self.__matrix_rc = self.LCSMatrix(self.__s1, self.__s2_rc) - #if - #__init__ - - def __unicode__(self): - """ - Return a graphical representation of the LCS matrix, mainly for - debugging. - - @returns: A graphical representation of the LCS matrix. - @rtype: unicode - """ - return self.visMatrix((0, len(self.__s1)), (0, len(self.__s2))) - #__unicode__ - - def visMatrix(self, r1, r2, rc=False): - """ - Return a graphical representation of the LCS matrix, mainly for - debugging. - - @returns: A graphical representation of the LCS matrix. - @rtype: unicode - """ - nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp - nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp - - M = self.__matrix - s2 = self.__s2 - if rc: - M = self.__matrix_rc - s2 = self.__s2_rc - - out = self.__delim.join(self.__delim + '-' + s2[nr2[0]:nr2[1]]) + '\n' - for i in range(nr1[0], nr1[1] + 1): - out += (('-' + self.__s1)[i] + self.__delim + - self.__delim.join(map(lambda x: unicode(M[i][x]), - range(nr2[0], nr2[1] + 1))) + '\n') - - return out - #visMatrix - - def LCSMatrix(self, s1, s2): - """ - Calculate the Longest Common Substring matrix. - - @arg s1: A string. - @type s1: unicode - @arg s2: A string. - @type s2: unicode - - @returns: A matrix with the LCS of {s1}[i], {s2}[j] at position i, j. - @rval: list[list[int]] - """ - y_max = len(s1) + 1 - x_max = len(s2) + 1 - M = [[0] * x_max for i in range(y_max)] - - for x in range(1, y_max): - for y in range(1, x_max): - if s1[x - 1] == s2[y - 1]: - M[x][y] = M[x - 1][y - 1] + 1 - - return M - #LCSMatrix - - def findMax(self, r1, r2, rc=False): - """ - Find the LCS in a submatrix of {M}, given by the ranges {r1} and {r2}. - - @arg r1: A range for the first dimension of M. - @type r1: tuple(int, int) - @arg r2: A range for the second dimension of M. - @type r2: tuple(int, int) - @arg rc: Use the reverse complement matrix. - @type rc: bool - - @returns: - @rtype: tuple(int, int, int) - """ - longest, x_longest, y_longest = 0, 0, 0 - nr1 = r1[0] - self.__lcp, r1[1] - self.__lcp - nr2 = r2[0] - self.__lcp, r2[1] - self.__lcp - - M = self.__matrix - if rc: - M = self.__matrix_rc - nr2 = self.__s2_len - nr2[1], self.__s2_len - nr2[0] - #if - - for i in range(nr1[0], nr1[1] + 1): - x_relative = i - nr1[0] - - for j in range(nr2[0], nr2[1] + 1): - y_relative = j - nr2[0] - realVal = min(M[i][j], x_relative, y_relative) - - if realVal > longest: - longest, x_longest, y_longest = (realVal, x_relative, - y_relative) - #for - #for - - return x_longest, y_longest, longest - #findMax -#LCS - -def makeFSTables(table_id): - """ - For every pair of amino acids, calculate the set of possible amino acids in - a different reading frame. Do this for both alternative reading frames (+1 - and +2). - - @arg table_id: Coding table ID. - @type table_id: int - @returns: Two dictionaries for the two alternative reading frames. - @rtype: tuple(dict, dict) - """ - # Make the forward translation table. - table = dict(CodonTable.unambiguous_dna_by_id[table_id].forward_table) - for i in CodonTable.unambiguous_dna_by_id[table_id].stop_codons: - table[i] = '*' - - # Make the reverse translation table. - reverse_table = collections.defaultdict(list) - for i in table: - reverse_table[table[i]].append(i) - - # Make the frame shift tables. - FS1 = collections.defaultdict(set) - FS2 = collections.defaultdict(set) - for AA_i in reverse_table: - for AA_j in reverse_table: - for codon_i in reverse_table[AA_i]: - for codon_j in reverse_table[AA_j]: - FS1[AA_i + AA_j].add(table[(codon_i + codon_j)[1:4]]) # +1. - FS2[AA_i + AA_j].add(table[(codon_i + codon_j)[2:5]]) # +2. - #for - return FS1, FS2 -#makeFSTables - -def __makeOverlaps(peptide): - """ - Make a list of overlapping 2-mers of {peptide} in order of appearance. - - @arg peptide: A peptide sequence. - @type peptide: unicode - @returns: All 2-mers of {peptide} in order of appearance. - @rtype: list(unicode) - """ - return map(lambda x: peptide[x:x+2], range(len(peptide) - 1)) -#__makeOverlaps - -def __options(pList, peptidePrefix, FS, output): - """ - Enumerate all peptides that could result from a frame shift. - - @arg pList: List of overlapping 2-mers of a peptide. - @type pList: list(unicode) - @arg peptidePrefix: Prefix of a peptide in the alternative reading frame. - @type peptidePrefix: unicode - @arg FS: Frame shift table. - @type FS: dict - @arg output: List of peptides, should be empty initially. - @type output: list(unicode) - """ - if not pList: - output.append(peptidePrefix) - return - #if - for i in FS[pList[0]]: - __options(pList[1:], peptidePrefix + i, FS, output) -#__options - -def enumFS(peptide, FS): - """ - Enumerate all peptides that could result from a frame shift. - - @arg peptide: Original peptide sequence. - @type peptide: unicode - @arg FS: Frame shift table. - @type FS: dict - """ - output = [] - - __options(__makeOverlaps(peptide), "", FS, output) - return output -#enumFS - -def fitFS(peptide, altPeptide, FS): - """ - Check whether peptide {altPeptide} is a possible frame shift of peptide - {peptide}. - - @arg peptide: Original peptide sequence. - @type peptide: unicode - @arg altPeptide: Observed peptide sequence. - @type altPeptide: unicode - @arg FS: Frame shift table. - @type FS: dict - """ - # Todo: This is a temporary fix to prevent crashing on frameshift - # detection (I think bug #124). - return False - - if len(peptide) < len(altPeptide): - return False - - pList = __makeOverlaps(peptide) - - for i in range(len(altPeptide)): - if not altPeptide[i] in FS[pList[i]]: - return False - return True -#fitFS - -class DescribeRawVar(models.RawVar): - """ - Container for a raw variant. - - To use this class correctly, do not supply more than the minimum amount of - data. The {description()} function may not work properly if too much - information is given. - - Example: if {end} is initialised for a substitution, a range will be - retuned, resulting in a description like: 100_100A>T - - Note: As a workaround for a classname conflict in Spyne, this class is - named `DescribeRawVar` instead of just `RawVar`. We might want to reconsider - this name at some point. - """ - - def __init__(self, DNA=True, start=0, start_offset=0, end=0, end_offset=0, - type="none", deleted="", inserted="", shift=0, startAA="", endAA="", - term=0): - """ - Initialise the class with the appropriate values. - - @arg start: Start position. - @type start: int - @arg start_offset: - @type start_offset: int - @arg end: End position. - @type end: int - @arg end_offset: - @type end_offset: int - @arg type: Variant type. - @type type: unicode - @arg deleted: Deleted part of the reference sequence. - @type deleted: unicode - @arg inserted: Inserted part. - @type inserted: unicode - @arg shift: Amount of freedom. - @type shift: int - """ - # TODO: Will this container be used for all variants, or only genomic? - # start_offset and end_offset may be never used. - self.DNA = DNA - self.start = start - self.start_offset = start_offset - self.end = end - self.end_offset = end_offset - self.type = type - self.deleted = deleted - self.inserted = inserted - self.shift = shift - self.startAA = startAA - self.endAA = endAA - self.term = term - self.hgvs = self.description() - self.hgvsLength = self.descriptionLength() - #__init__ - - def __DNADescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: unicode - """ - if not self.start: - return "=" - - descr = "%i" % self.start - - if self.end: - descr += "_%i" % self.end - - if self.type != "subst": - descr += "%s" % self.type - - if self.inserted: - return descr + "%s" % self.inserted - return descr - #if - - return descr + "%s>%s" % (self.deleted, self.inserted) - #__DNADescription - - def __proteinDescription(self): - """ - Give the HGVS description of the raw variant stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The HGVS description of the raw variant stored in this class. - @rtype: unicode - """ - if self.type == "unknown": - return "?" - if not self.start: - return "=" - - descr = "" - if not self.deleted: - if self.type == "ext": - descr += '*' - else: - descr += "%s" % seq3(self.startAA) - #if - else: - descr += "%s" % seq3(self.deleted) - descr += "%i" % self.start - if self.end: - descr += "_%s%i" % (seq3(self.endAA), self.end) - if self.type not in ["subst", "stop", "ext", "fs"]: - descr += self.type - if self.inserted: - descr += "%s" % seq3(self.inserted) - - if self.type == "stop": - return descr + '*' - if self.term: - return descr + "%s*%i" % (self.type, self.term) - return descr - #__proteinDescription - - def __DNADescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start : # `=' or `?' - return 1 - - descrLen = 1 # Start position. - - if self.end : # '_' and end position. - descrLen += 2 - - if self.type != "subst": - descrLen += len(self.type) - - if self.inserted: - return descrLen + len(self.inserted) - return descrLen - #if - - return 4 # Start position, '>' and end position. - #__DNAdescriptionLength - - def __proteinDescriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - Note that this function relies on the absence of values to make the - correct description. Also see the comment in the class definition. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if not self.start: # = - return 1 - - descrLen = 1 # Start position. - if not self.deleted and self.type == "ext": - descrLen += 1 # * - else: - descrLen += 3 # One amino acid. - if self.end: - descrLen += 5 # `_' + one amino acid + end position. - if self.type not in ["subst", "stop", "ext", "fs"]: - descrLen += len(self.type) - if self.inserted: - descrLen += 3 * len(self.inserted) - if self.type == "stop": - return descrLen + 1 # * - if self.term: - return descrLen + len(self.type) + 2 # `*' + length until stop. - return descrLen - #__proteinDescriptionLength - - def description(self): - """ - """ - if self.DNA: - return self.__DNADescription() - return self.__proteinDescription() - #description - - def descriptionLength(self): - """ - Give the standardised length of the HGVS description of the raw variant - stored in this class. - - @returns: The standardised length of the HGVS description of the raw - variant stored in this class. - @rtype: int - """ - if self.DNA: - return self.__DNADescriptionLength() - return self.__proteinDescriptionLength() - #descriptionLength -#DescribeRawVar - -def alleleDescription(allele): - """ - Convert a list of raw variants to an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(DescribeRawVar) - - @returns: The HGVS description of {allele}. - @rval: unicode - """ - if len(allele) > 1: - return "[%s]" % ';'.join(map(lambda x : x.hgvs, allele)) - return allele[0].hgvs -#alleleDescription - -def alleleDescriptionLength(allele): - """ - Calculate the standardised length of an HGVS allele description. - - @arg allele: A list of raw variants representing an allele description. - @type allele: list(DescribeRawVar) - - @returns: The standardised length of the HGVS description of {allele}. - @rval: int - """ - # NOTE: Do we need to count the ; and [] ? - return sum(map(lambda x : x.hgvsLength, allele)) -#alleleDescriptionLength - -def printpos(s, start, end, fill = 0): - """ - For debugging purposes. - """ - # TODO: See if this can partially replace or be merged with the - # visualisation in the __mutate() function of mutator.py - fs = 10 # Flank size. - - return "%s %s%s %s" % (s[start - fs:start], s[start:end], '-' * fill, - s[end:end + fs]) -#printpos - -def DNA_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): - """ - Give an allele description of the change from {s1} to {s2} in the range - {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. - - arg s1: Sequence 1. - type s1: unicode - arg s2: Sequence 2. - type s2: unicode - arg s1_start: Start of the range on {s1}. - type s1_start: int - arg s1_end: End of the range on {s1}. - type s1_end: int - arg s2_start: Start of the range on {s2}. - type s2_start: int - arg s2_end: End of the range on {s2}. - type s2_end: int - - @returns: A list of DescribeRawVar objects, representing the allele. - @rval: list(DescribeRawVar) - """ - # TODO: Instead of copying this function and adjusting it to make it work - # for proteins, consider disabling parts like the inversion. - # TODO: Think about frameshift descriptions. - - # Nothing happened. - if s1 == s2: - return [DescribeRawVar()] - - # Insertion / Duplication. - if s1_start == s1_end: - ins_length = s2_end - s2_start - shift5, shift3 = roll(s2, s2_start + 1, s2_end) - shift = shift5 + shift3 - - s1_start += shift3 - s1_end += shift3 - s2_start += shift3 - s2_end += shift3 - - if s2_start - ins_length >= 0 and \ - s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end]: - - if ins_length == 1: - return [DescribeRawVar(start=s1_start, type="dup", shift=shift)] - return [DescribeRawVar(start=s1_start - ins_length + 1, end=s1_end, - type="dup", shift=shift)] - #if - return [DescribeRawVar(start=s1_start, end=s1_start + 1, - inserted=s2[s2_start:s2_end], type="ins", shift=shift)] - #if - - # Deletion. - if s2_start == s2_end: - shift5, shift3 = roll(s1, s1_start + 1, s1_end) - shift = shift5 + shift3 - - s1_start += shift3 + 1 - s1_end += shift3 - - if s1_start == s1_end: - return [DescribeRawVar(start=s1_start, type="del", shift=shift)] - return [DescribeRawVar(start=s1_start, end=s1_end, type="del", shift=shift)] - #if - - # Substitution. - if s1_start + 1 == s1_end and s2_start + 1 == s2_end: - return [DescribeRawVar(start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], type="subst")] - - # Simple InDel. - if s1_start + 1 == s1_end: - return [DescribeRawVar(start=s1_start + 1, inserted=s2[s2_start:s2_end], - type="delins")] - - # TODO: Refactor the code after this point. - - # At this stage, we either have an inversion, an indel or a Compound - # variant. - s1_end_f, s2_end_f, lcs_f_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end)) - s1_end_r, s2_end_r, lcs_r_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end), rc=True) - - # Palindrome snooping. - trim = palinsnoop(s1[s1_start + s1_end_r - lcs_r_len:s1_start + s1_end_r]) - if trim == -1 : # Full palindrome. - lcs_r_len = 0 # s1_end_r and s2_end_r should not be used after this. - - # Inversion or Compound variant. - default = [DescribeRawVar(start=s1_start + 1, end=s1_end, - inserted=s2[s2_start:s2_end], type="delins")] - - if not (lcs_f_len or lcs_r_len) : # Optimisation, not really needed. - return default - - # Inversion. - if lcs_f_len <= lcs_r_len: - if trim > 0 : # Partial palindrome. - s1_end_r -= trim - s2_end_r -= trim - lcs_r_len -= 2 * trim - #if - - # Simple Inversion. - if s2_end - s2_start == lcs_r_len and s1_end - s1_start == lcs_r_len: - return [DescribeRawVar(start=s1_start + 1, end=s1_end, type="inv")] - - r1_len = s1_end_r - lcs_r_len - r2_len = s1_end - s1_start - s1_end_r - m1_len = s2_end_r - lcs_r_len - m2_len = s2_end - s2_start - s2_end_r - - # The flanks of the inversion (but not both) can be empty, so we - # generate descriptions conditionally. - leftRv = [] - rightRv = [] - if r1_len or m2_len: - lcs = len(longest_common_suffix(s1[s1_start:s1_start + r1_len], - s2[s2_start:s2_start + m2_len])) - leftRv = DNA_description(M, s1, s2, - s1_start, s1_start + r1_len - lcs, - s2_start, s2_start + m2_len - lcs) - #if - if r2_len or m1_len: - lcp = len(longest_common_prefix(s1[s1_end - r2_len:s1_end], - s2[s2_end - m1_len:s2_end])) - rightRv = DNA_description(M, s1, s2, - s1_end - r2_len + lcp, s1_end, s2_end - m1_len + lcp, s2_end) - #if - - partial = leftRv + [DescribeRawVar(start=s1_start + r1_len + 1, - end=s1_end - r2_len, type="inv")] + rightRv - #if - - # Compound variant. - else: - r1_len = s1_end_f - lcs_f_len - r2_len = s1_end - s1_start - s1_end_f - m1_len = s2_end_f - lcs_f_len - m2_len = s2_end - s2_start - s2_end_f - - partial = DNA_description(M, s1, s2, s1_start, s1_start + r1_len, - s2_start, s2_start + m1_len) + DNA_description(M, s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end) - #else - - if alleleDescriptionLength(partial) <= alleleDescriptionLength(default): - return partial - return default -#DNA_description - -def protein_description(M, s1, s2, s1_start, s1_end, s2_start, s2_end): - """ - Give an allele description of the change from {s1} to {s2} in the range - {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. - - arg s1: Sequence 1. - type s1: unicode - arg s2: Sequence 2. - type s2: unicode - arg s1_start: Start of the range on {s1}. - type s1_start: int - arg s1_end: End of the range on {s1}. - type s1_end: int - arg s2_start: Start of the range on {s2}. - type s2_start: int - arg s2_end: End of the range on {s2}. - type s2_end: int - - @returns: A list of DescribeRawVar objects, representing the allele. - @rval: list(DescribeRawVar) - """ - if s1 == '?' or s2 == '?': - return [DescribeRawVar(DNA=False, type="unknown")] - - # One of the sequences is missing. - if not (s1 and s2): - return [DescribeRawVar(DNA=False)] - - # Nothing happened. - if s1 == s2: - return [DescribeRawVar(DNA=False)] - - # Substitution. - if s1_start + 1 == s1_end and s2_start + 1 == s2_end: - return [DescribeRawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], type="subst")] - - # Insertion / Duplication / Extention. - if s1_start == s1_end: - if len(s1) == s1_start + 1: - return [DescribeRawVar(DNA=False, start=s1_start + 1, - inserted=s2[s2_start], term=abs(len(s1) - len(s2)), - type="ext")] - ins_length = s2_end - s2_start - shift5, shift3 = roll(s2, s2_start + 1, s2_end) - shift = shift5 + shift3 - - s1_start += shift3 - s1_end += shift3 - s2_start += shift3 - s2_end += shift3 - - if s2_start - ins_length >= 0 and \ - s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end]: - - if ins_length == 1: - return [DescribeRawVar(DNA=False, start=s1_start, - startAA=s1[s1_start - 1], type="dup", shift=shift)] - return [DescribeRawVar(DNA=False, start=s1_start - ins_length + 1, - startAA=s1[s1_start - ins_length], end=s1_end, - endAA=s1[s1_end - 1], type="dup", shift=shift)] - #if - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - end=s1_start + 1, endAA=s1[s1_end], inserted=s2[s2_start:s2_end], - type="ins", shift=shift)] - #if - - # Deletion / Inframe stop. - if s2_start == s2_end: - if len(s2) == s2_end + 1: - return [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - type="stop")] - shift5, shift3 = roll(s1, s1_start + 1, s1_end) - shift = shift5 + shift3 - - s1_start += shift3 + 1 - s1_end += shift3 - - if s1_start == s1_end: - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - type="del", shift=shift)] - return [DescribeRawVar(DNA=False, start=s1_start, startAA=s1[s1_start - 1], - end=s1_end, endAA=s1[s1_end - 1], type="del", shift=shift)] - #if - - # Simple InDel. - if s1_start + 1 == s1_end: - return [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - inserted=s2[s2_start:s2_end], type="delins")] - - # Frameshift. - if len(s1) == s1_end + 1 and len(s2) == s2_end + 1: - # TODO: the FS tables should be made for all coding tables in advance. - FS1, FS2 = makeFSTables(1) # Standard coding table. - - if (fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS1) or - fitFS(s1[s1_start + 1:], s2[s2_start + 1:], FS2) or - fitFS(s2[s2_start + 1:], s1[s1_start + 2:], FS1) or - fitFS(s2[s2_start + 1:], s1[s1_start + 2:], FS2)): - return [DescribeRawVar(DNA=False, start=s1_start + 1, deleted=s1[s1_start], - inserted=s2[s2_start], term=len(s2) - s2_start, type="fs")] - #if - - # At this point, we have an indel. - s1_end_f, s2_end_f, lcs_f_len = M.findMax((s1_start, s1_end), - (s2_start, s2_end)) - - # InDel. - default = [DescribeRawVar(DNA=False, start=s1_start + 1, startAA=s1[s1_start], - end=s1_end, endAA=s1[s1_end], inserted=s2[s2_start:s2_end], - type="delins")] - - if not lcs_f_len : # Optimisation, not really needed. - return default - - r1_len = s1_end_f - lcs_f_len - r2_len = s1_end - s1_start - s1_end_f - m1_len = s2_end_f - lcs_f_len - m2_len = s2_end - s2_start - s2_end_f - - partial = protein_description(M, s1, s2, s1_start, s1_start + r1_len, - s2_start, s2_start + m1_len) + protein_description(M, s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end) - - if alleleDescriptionLength(partial) <= alleleDescriptionLength(default): - return partial - return default -#protein_description - -def describe(original, mutated, DNA=True): - """ - Convenience function for DNA_description(). - - @arg original: - @type original: unicode - @arg mutated: - @type mutated: unicode - - @returns: A list of DescribeRawVar objects, representing the allele. - @rval: list(DescribeRawVar) - """ - s1 = original - s2 = mutated - lcp = len(longest_common_prefix(s1, s2)) - lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:])) - s1_end = len(s1) - lcs - s2_end = len(s2) - lcs - - if (s1_end - lcp) * (s2_end - lcp) > MAX_MATRIX_SIZE: - return - - if not DNA: - M = LCS(s1, s2, lcp, s1_end, s2_end) - return protein_description(M, s1, s2, lcp, s1_end, lcp, s2_end) - #if - M = LCS(s1, s2, lcp, s1_end, s2_end, DNA=True) - return DNA_description(M, s1, s2, lcp, s1_end, lcp, s2_end) -#describe diff --git a/mutalyzer/entrypoints/mutalyzer.py b/mutalyzer/entrypoints/mutalyzer.py index 6717161d1d4795c923f70cfa6846358ace2972c8..c455ae1ed8e290c8982d7883880c9950c613e0e0 100644 --- a/mutalyzer/entrypoints/mutalyzer.py +++ b/mutalyzer/entrypoints/mutalyzer.py @@ -8,14 +8,24 @@ Mutalyzer command-line name checker. from __future__ import unicode_literals import argparse -import sys +import json + +import extractor from . import _cli_string -from .. import describe from .. import output from .. import variantchecker +class MyEncoder(json.JSONEncoder): + def default(self, o): + json_object = o.__dict__ + json_object.update({"hgvs": unicode(o), "weight": o.weight()}) + + return json_object + #default +#MyEncoder + def check_name(description): """ Run the name checker. @@ -93,22 +103,32 @@ def check_name(description): for i in O.getOutput("legends") : print i - allele = describe.describe(O.getIndexedOutput("original", 0), - O.getIndexedOutput("mutated", 0)) - prot_allele = describe.describe(O.getIndexedOutput("oldprotein", 0), - O.getIndexedOutput("newprotein", 0, default=""), DNA=False) + reference_sequence = O.getIndexedOutput("original", 0) + sample_sequence = O.getIndexedOutput("mutated", 0) + + described_allele = extractor.describe_dna(reference_sequence, + sample_sequence) + #described_protein_allele = describe.describe( + # O.getIndexedOutput("oldprotein", 0), + # O.getIndexedOutput("newprotein", 0, default=""), + # DNA=False) + described_protein_allele = "" - extracted = extractedProt = '(skipped)' + described = described_protein = '(skipped)' - if allele: - extracted = describe.alleleDescription(allele) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + if described_allele: + described = described_allele + if described_protein_allele: + described_protein = described_protein_allele print "\nExperimental services:" - print extracted - print extractedProt + print described + print described_protein #print "+++ %s" % O.getOutput("myTranscriptDescription") + print json.dumps({ + #"reference_sequence": reference_sequence, + #"sample_sequence": sample_sequence, + "allele_description": described_allele}, cls=MyEncoder) def main(): diff --git a/mutalyzer/extractor_loader.py b/mutalyzer/extractor_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8b097e50518f008daafbde2bd7357da648bf37 --- /dev/null +++ b/mutalyzer/extractor_loader.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +import sys + +import json + +import describe + +class MyEncoder(json.JSONEncoder): + def default(self, o): + return o.__dict__ + +def main(): + if len(sys.argv) < 3: + print "usage: " + sys.argv[0] + " reference sample" + exit() + #if + + f = open(sys.argv[1], "r") + ref = f.read() + f.close() + f = open(sys.argv[2], "r") + alt = f.read() + f.close() + + extracted_allele = describe.describe_dna(ref, alt) + + #print "Description Extractor Version " + describe.extractor.VERSION + print extracted_allele + #print "JSON: " + json.dumps({"reference_sequence": ref, "sample_sequence": alt, "allele_description": extracted_allele}, cls=MyEncoder) + +#main + +if __name__ == "__main__": + main() diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 18d934f7dbf6f3ba39fc06eaad9bc0320a67fb03..7ebade7f8fa429b92845cac6f3f13a784474bb69 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -105,23 +105,73 @@ class RawVar(ComplexModel): """ __namespace__ = SOAP_NAMESPACE - DNA = Mandatory.Boolean start = Mandatory.Integer start_offset = Mandatory.Integer end = Mandatory.Integer end_offset = Mandatory.Integer + sample_start = Mandatory.Integer + sample_start_offset = Mandatory.Integer + sample_end = Mandatory.Integer + sample_end_offset = Mandatory.Integer type = Mandatory.Unicode deleted = Mandatory.Unicode inserted = Mandatory.Unicode + weight = Mandatory.Integer shift = Mandatory.Integer - startAA = Mandatory.Unicode - endAA = Mandatory.Unicode - term = Mandatory.Integer - hgvs = Mandatory.Unicode - hgvsLength = Mandatory.Integer + description = Mandatory.Unicode #RawVar +class _RawVar(ComplexModel): + """ + Used in MutalyzerOutput data type. + """ + __namespace__ = SOAP_NAMESPACE + + start = Mandatory.Integer + end = Mandatory.Integer + sample_start = Mandatory.Integer + sample_end = Mandatory.Integer + type = Mandatory.Unicode + deleted = Mandatory.Unicode + inserted = Mandatory.Unicode + shift = Mandatory.Integer + hgvs = Mandatory.Unicode + weight = Mandatory.Integer +#_RawVar + + +class DNAVar(_RawVar): + """ + Used in MutalyzerOutput data type. + """ + __namespace__ = SOAP_NAMESPACE + + start_offset = Mandatory.Integer + end_offset = Mandatory.Integer + sample_start_offset = Mandatory.Integer + sample_end_offset = Mandatory.Integer +#DNAVar + + +class RNAVar(DNAVar): + """ + Used in MutalyzerOutput data type. + """ + __namespace__ = SOAP_NAMESPACE +#RNAVar + + +class ProteinVar(_RawVar): + """ + Used in MutalyzerOutput data type. + """ + __namespace__ = SOAP_NAMESPACE + + term = Mandatory.Integer +#ProteinVar + + class Allele(ComplexModel): """ Used in MutalyzerOutput data type. diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 7e57d76be8a7bb5310f905b4265996d07ab37d5a..770271128c6012bedb2139eff9cd58ce5e7a6edb 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -23,6 +23,8 @@ from operator import attrgetter from sqlalchemy.orm.exc import NoResultFound from sqlalchemy.sql import func +import extractor + import mutalyzer from mutalyzer.config import settings from mutalyzer.db import session @@ -40,7 +42,6 @@ from mutalyzer import Retriever from mutalyzer import GenRecord from mutalyzer import Scheduler from mutalyzer.models import * -from mutalyzer import describe def create_rpc_fault(output): @@ -1239,9 +1240,28 @@ class MutalyzerService(ServiceBase): output.addMessage(__file__, -1, 'INFO', 'Received request descriptionExtract') + allele = extractor.describe_dna(reference, observed) + result = Allele() - result.allele = describe.describe(reference, observed) - result.description = describe.alleleDescription(result.allele) + result.allele = [] + for variant in allele: + raw_var = RawVar() + raw_var.start = variant.start + raw_var.start_offset = variant.start_offset + raw_var.end = variant.end + raw_var.end_offset = variant.end_offset + raw_var.sample_start = variant.sample_start + raw_var.sample_start_offset = variant.sample_start_offset + raw_var.sample_end = variant.sample_end + raw_var.sample_end_offset = variant.sample_end_offset + raw_var.type = variant.type + raw_var.deleted = unicode(variant.deleted) + raw_var.inserted = unicode(variant.inserted) + raw_var.weight = variant.weight() + raw_var.shift = variant.shift + raw_var.description = unicode(variant) + result.allele.append(raw_var) + result.description = unicode(allele) output.addMessage(__file__, -1, 'INFO', 'Finished processing descriptionExtract') diff --git a/mutalyzer/test.py b/mutalyzer/test.py new file mode 100644 index 0000000000000000000000000000000000000000..f733aec814afeda4f4c22ef9b8cabfaecb1b2221 --- /dev/null +++ b/mutalyzer/test.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +import json + +import describe + +class MyEncoder(json.JSONEncoder): + def default(self, o): + return o.__dict__ + +def main(): + ref = "ACGTCGATTCGCTAGCTTCGGGGGATAGATAGAGATATAGAGATATTTTT" + alt = "ACGTCGGTTCGCTAGCTTCGGGGGATAGATAGATATATAGAGATATTTTT" + + extracted_allele = describe.describe_dna(ref, alt) + + print extracted_allele + print json.dumps({"reference_sequence": ref, "sample_sequence": alt, + "allele_description": extracted_allele}, cls=MyEncoder) +#main + +if __name__ == "__main__": + main() diff --git a/mutalyzer/util.py b/mutalyzer/util.py index 6b7987b31c8f9a7bed62507572f0c417589d6c4a..4bf06e3effef5fe56f507e7dd0a82f2594aab8b4 100644 --- a/mutalyzer/util.py +++ b/mutalyzer/util.py @@ -31,66 +31,8 @@ import time from Bio.SeqUtils import seq3 - -# Taken from BioPython. -AMBIGUOUS_DNA_COMPLEMENT = { - 'A': 'T', - 'C': 'G', - 'G': 'C', - 'T': 'A', - 'M': 'K', - 'R': 'Y', - 'W': 'W', - 'S': 'S', - 'Y': 'R', - 'K': 'M', - 'V': 'B', - 'H': 'D', - 'D': 'H', - 'B': 'V', - 'X': 'X', - 'N': 'N'} -AMBIGUOUS_RNA_COMPLEMENT = { - 'A': 'U', - 'C': 'G', - 'G': 'C', - 'U': 'A', - 'M': 'K', - 'R': 'Y', - 'W': 'W', - 'S': 'S', - 'Y': 'R', - 'K': 'M', - 'V': 'B', - 'H': 'D', - 'D': 'H', - 'B': 'V', - 'X': 'X', - 'N': 'N'} - - -def _make_translation_table(complement_mapping): - before = complement_mapping.keys() - before += [b.lower() for b in before] - after = complement_mapping.values() - after += [b.lower() for b in after] - return {ord(k): v for k, v in zip(before, after)} - - -_dna_complement_table = _make_translation_table(AMBIGUOUS_DNA_COMPLEMENT) -_rna_complement_table = _make_translation_table(AMBIGUOUS_RNA_COMPLEMENT) - - -def reverse_complement(sequence): - """ - Reverse complement of a sequence represented as unicode string. - """ - if 'U' in sequence or 'u' in sequence: - table = _rna_complement_table - else: - table = _dna_complement_table - - return ''.join(reversed(sequence.translate(table))) +# NOTE: This is a temporary fix. +from extractor.describe import reverse_complement, palinsnoop, roll def is_utf8_alias(encoding): @@ -309,87 +251,6 @@ def roll_(s, start, end) : #roll -def roll(s, first, last): - """ - Determine the variability of a variant by looking at cyclic - permutations. Not all cyclic permutations are tested at each time, it - is sufficient to check ``aW'' if ``Wa'' matches (with ``a'' a letter, - ``W'' a word) when rolling to the left for example. - - >>> roll('abbabbabbabb', 4, 6) - (3, 6) - >>> roll('abbabbabbabb', 5, 5) - (0, 1) - >>> roll('abcccccde', 4, 4) - (1, 3) - - @arg s: A reference sequence. - @type s: any sequence type - @arg first: First position of the pattern in the reference sequence. - @type first: int - @arg last: Last position of the pattern in the reference sequence. - @type last: int - - @return: tuple: - - left ; Amount of positions that the pattern can be shifted to - the left. - - right ; Amount of positions that the pattern can be shifted to - the right. - @rtype: tuple(int, int) - """ - pattern = s[first - 1:last] # Extract the pattern - pattern_length = len(pattern) - - # Keep rolling to the left as long as a cyclic permutation matches. - minimum = first - 2 - j = pattern_length - 1 - while minimum > -1 and s[minimum] == pattern[j % pattern_length]: - j -= 1 - minimum -= 1 - - # Keep rolling to the right as long as a cyclic permutation matches. - maximum = last - j = 0 - while maximum < len(s) and s[maximum] == pattern[j % pattern_length]: - j += 1 - maximum += 1 - - return first - minimum - 2, maximum - last -#roll - - -def palinsnoop(s): - """ - Check a sequence for a reverse-complement-palindromic prefix (and - suffix). If one is detected, return the length of this prefix. If the - string equals its reverse complement, return -1. - - >>> palinsnoop('TACGCTA') - 2 - >>> palinsnoop('TACGTA') - -1 - >>> palinsnoop('TACGCTT') - 0 - - @arg s: A nucleotide sequence. - @type s: unicode - - @return: The number of elements that are palindromic or -1 if the string - is a 'palindrome'. - @rtype: int - """ - s_revcomp = reverse_complement(s) - - for i in range(int(math.ceil(len(s) / 2.0))): - if s[i] != s_revcomp[i]: - # The first i elements are 'palindromic'. - return i - - # Perfect 'palindrome'. - return -1 -#palinsnoop - - def longest_common_prefix(s1, s2): """ Calculate the longest common prefix of two strings. @@ -434,7 +295,7 @@ def longest_common_suffix(s1, s2): @type s2: unicode @return: The longest common suffix of s1 and s2. - @rtype: string + @rtype: unicode """ return longest_common_prefix(s1[::-1], s2[::-1])[::-1] #longest_common_suffix @@ -680,7 +541,7 @@ def visualise_sequence(sequence, max_length=25, flank_size=6): @type flank_size: int @return: Either the original sequence, or an abbreviation of it. - @rtype: str + @rtype: unicode """ if len(sequence) > max_length: return '%s [%ibp] %s' % (sequence[:flank_size], diff --git a/mutalyzer/website/templates/description-extractor.html b/mutalyzer/website/templates/description-extractor.html index 2f2cb17d774dfee13a1f228a04deb27f8eac7a01..d1d8917a4b82ff92d7145e05a2248ecbd0b6ab6b 100644 --- a/mutalyzer/website/templates/description-extractor.html +++ b/mutalyzer/website/templates/description-extractor.html @@ -94,7 +94,7 @@ Please supply a reference sequence and an observed sequence. <td>{{ raw_var.deleted }}</td> <td>{{ raw_var.inserted }}</td> <td>{{ raw_var.shift }}</td> - <td>{{ raw_var.hgvs }}</td> + <td>{{ raw_var }}</td> </tr> {% endfor %} </tbody> diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 992e84a502ba6d46c310f65f0e50b9611b6ac412..36327e1836c4589c610bd7d1c8bd062261fd05f4 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -19,8 +19,10 @@ from lxml import etree from spyne.server.http import HttpBase from sqlalchemy.orm.exc import NoResultFound +import extractor + import mutalyzer -from mutalyzer import (announce, describe, File, Retriever, Scheduler, stats, +from mutalyzer import (announce, File, Retriever, Scheduler, stats, util, variantchecker) from mutalyzer.config import settings from mutalyzer.db.models import BATCH_JOB_TYPES @@ -269,20 +271,18 @@ def name_checker(): # Experimental description extractor. if (output.getIndexedOutput('original', 0) and output.getIndexedOutput('mutated', 0)): - extracted = extractedProt = '(skipped)' + allele = extractor.describe_dna(output.getIndexedOutput('original', 0), + output.getIndexedOutput('mutated', 0)) + #prot_allele = describe.describe_protein( + # output.getIndexedOutput('oldprotein', 0), + # output.getIndexedOutput('newprotein', 0, default='')) + prot_allele = '' - allele = describe.describe(output.getIndexedOutput('original', 0), - output.getIndexedOutput('mutated', 0)) + extracted = extractedProt = '(skipped)' if allele: - extracted = describe.alleleDescription(allele) - - if output.getIndexedOutput('oldprotein', 0): - prot_allele = describe.describe( - output.getIndexedOutput('oldprotein', 0), - output.getIndexedOutput('newprotein', 0, default=''), - DNA=False) - if prot_allele: - extractedProt = describe.alleleDescription(prot_allele) + extracted = unicode(allele) #describe.allele_description(allele) + if prot_allele: + extractedProt = unicode(prot_allele) #describe.allele_description(prot_allele) else: extracted = extractedProt = '' @@ -701,8 +701,8 @@ def description_extractor(): output.addMessage(__file__, 3, 'ENODNA', 'Variant sequence is not DNA.') - raw_vars = describe.describe(reference_sequence, variant_sequence) - description = describe.alleleDescription(raw_vars) + raw_vars = extractor.describe_dna(reference_sequence, variant_sequence) + description = unicode(raw_vars) #describe.allele_description(raw_vars) errors, warnings, summary = output.Summary() messages = map(util.message_info, output.getMessages()) diff --git a/requirements.txt b/requirements.txt index 99065b96b5b3b80ecdffc6fec3d1c9ed1682dd9c..3942920811f24a6c0c015255333e09fa7fcc29ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -e git+https://github.com/LUMC/magic-python.git#egg=Magic_file_extensions +-e git+https://github.com/LUMC/extractor.git@f270ef101e909895d58e47420935f2bbfdb28a3b#egg=extractor Flask==0.10.1 Jinja2==2.7.3 MySQL-python==1.2.5 diff --git a/tests/test_describe.py b/tests/test_describe.py deleted file mode 100644 index e81c7ce45bf6dbb5776326d75e3f7f410179db6d..0000000000000000000000000000000000000000 --- a/tests/test_describe.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Tests for the mutalyzer.describe module. -""" - - -from __future__ import unicode_literals - -#import logging; logging.basicConfig() -import os - -import mutalyzer -from mutalyzer import describe - -from utils import MutalyzerTest - - -class TestDescribe(MutalyzerTest): - """ - Test the mytalyzer.describe module. - """ - def test1(self): - """ - Test 1. - """ - result = describe.describe( - 'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA', - 'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA') - description = describe.alleleDescription(result) - assert description == '[5_6insTT;17del;26A>C;35dup]' - - def test2(self): - """ - Test 2. - """ - result = describe.describe( - 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTACTGTG', - 'TAAGCACCAGGAGTCCATGAAGAAGCTGGATCCTCCCATGGAATCCCCTACTCTACTGTG') - description = describe.alleleDescription(result) - assert description == '[26A>C;30C>A;35G>C]' - - def test3(self): - """ - Test 3. - """ - result = describe.describe( - 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA', - 'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGGAATCCCCTACTCTA') - description = describe.alleleDescription(result) - assert description == '[26_29inv;30C>G]' - - def test4(self): - """ - Test 4. - """ - result = describe.describe( - 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA', - 'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGAATCCCCTACTCTA') - description = describe.alleleDescription(result) - assert description == '[26_29inv;30C>G;41del]' diff --git a/tests/test_services_json.py b/tests/test_services_json.py index c405422b8beafb70f5ffefc7c21a2a4380267bf8..843ca8ea5c3340dc268c440ab0d0075188e95c4d 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -241,60 +241,60 @@ class TestServicesJson(MutalyzerTest): r = self._call('descriptionExtract', 'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA', 'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA') - assert r == {'allele': [{'term': 0, - 'end': 6, - 'startAA': '', + assert r == {'allele': [{'end': 6, 'deleted': '', - 'hgvsLength': 8, + 'weight': 8, 'inserted': 'TT', 'start_offset': 0, 'start': 5, - 'hgvs': '5_6insTT', + 'description': '5_6insTT', 'shift': 1, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'ins'}, - {'term': 0, - 'end': 0, - 'startAA': '', - 'deleted': '', - 'hgvsLength': 4, + 'type': 'ins', + 'sample_start': 6, + 'sample_end': 7, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 17, + 'deleted': 'G', + 'weight': 7, 'inserted': '', 'start_offset': 0, 'start': 17, - 'hgvs': '17del', + 'description': '17del', 'shift': 0, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'del'}, - {'term': 0, - 'end': 0, - 'startAA': '', + 'type': 'del', + 'sample_start': 18, + 'sample_end': 19, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 26, 'deleted': 'A', - 'hgvsLength': 4, + 'weight': 3, 'inserted': 'C', 'start_offset': 0, 'start': 26, - 'hgvs': '26A>C', + 'description': '26A>C', 'shift': 0, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'subst'}, - {'term': 0, - 'end': 0, - 'startAA': '', + 'type': 'subst', + 'sample_start': 27, + 'sample_end': 27, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 35, 'deleted': '', - 'hgvsLength': 4, - 'inserted': '', + 'weight': 5, + 'inserted': 'G', 'start_offset': 0, 'start': 35, - 'hgvs': '35dup', + 'description': '35dup', 'shift': 1, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'dup'}], + 'type': 'dup', + 'sample_start': 37, + 'sample_end': 37, + 'sample_start_offset': 0, + 'sample_end_offset': 0}], 'description': '[5_6insTT;17del;26A>C;35dup]'} diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index df21a98fcfc0743524d17327f7e43c89942f8b15..e60470ad074841d4bc2522e3a9045f4e5c6f297c 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -760,59 +760,59 @@ facilisi.""" assert all(all((v_real[k] == v_expected[k]) or not(v_real[k] or v_expected[k]) for k in v_expected) for v_real, v_expected in - zip(r['allele'].RawVar, [{'term': 0, - 'end': 6, - 'startAA': '', + zip(r['allele'].RawVar, [{'end': 6, 'deleted': '', - 'hgvsLength': 8, + 'weight': 8, 'inserted': 'TT', 'start_offset': 0, 'start': 5, - 'hgvs': '5_6insTT', + 'description': '5_6insTT', 'shift': 1, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'ins'}, - {'term': 0, - 'end': 0, - 'startAA': '', - 'deleted': '', - 'hgvsLength': 4, + 'type': 'ins', + 'sample_start': 6, + 'sample_end': 7, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 17, + 'deleted': 'G', + 'weight': 7, 'inserted': '', 'start_offset': 0, 'start': 17, - 'hgvs': '17del', + 'description': '17del', 'shift': 0, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'del'}, - {'term': 0, - 'end': 0, - 'startAA': '', + 'type': 'del', + 'sample_start': 18, + 'sample_end': 19, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 26, 'deleted': 'A', - 'hgvsLength': 4, + 'weight': 3, 'inserted': 'C', 'start_offset': 0, 'start': 26, - 'hgvs': '26A>C', + 'description': '26A>C', 'shift': 0, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'subst'}, - {'term': 0, - 'end': 0, - 'startAA': '', + 'type': 'subst', + 'sample_start': 27, + 'sample_end': 27, + 'sample_start_offset': 0, + 'sample_end_offset': 0}, + {'end': 35, 'deleted': '', - 'hgvsLength': 4, - 'inserted': '', + 'weight': 5, + 'inserted': 'G', 'start_offset': 0, 'start': 35, - 'hgvs': '35dup', + 'description': '35dup', 'shift': 1, - 'endAA': '', - 'DNA': True, 'end_offset': 0, - 'type': 'dup'}])) + 'type': 'dup', + 'sample_start': 37, + 'sample_end': 37, + 'sample_start_offset': 0, + 'sample_end_offset': 0}]))