From 70b4ff98a1313707a32ee0b1eee45aae5a33dbcf Mon Sep 17 00:00:00 2001 From: "J.F.J. Laros" <j.f.j.laros@lumc.nl> Date: Sat, 28 Jan 2012 00:05:18 +0000 Subject: [PATCH] Found a good criterium for the recursion, started adding an allele description as a list of objects. git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@464 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- extras/soap-tools/describe.py | 227 +++++++++++++++++++++++----------- 1 file changed, 156 insertions(+), 71 deletions(-) diff --git a/extras/soap-tools/describe.py b/extras/soap-tools/describe.py index b6f2c8bd..c6862b97 100644 --- a/extras/soap-tools/describe.py +++ b/extras/soap-tools/describe.py @@ -7,7 +7,6 @@ @requires: suds.client.Client """ - import sys import argparse import Bio.Seq @@ -19,8 +18,38 @@ from mutalyzer.util import palinsnoop, roll WSDL_LOCATION = "http://localhost/mutalyzer/services/?wsdl" +class RawVar() : + def __init__(self, start = 0, start_offset = 0, end = 0, end_offset = 0, + type = "", deleted = "", inserted = "", hgvs = "=") : + """ + """ + + self.start = start + self.start_offset = start_offset + self.end = end + self.end_offset= end_offset + self.type = type + self.deleted = deleted + self.inserted = inserted + self.hgvs = hgvs + #__init__ +#RawVar + def LongestCommonSubstring(s1, s2) : """ + Find the longest common substring between {s1} and {s2}. + + Mainly copied from: + http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/ + Longest_common_substring#Python + + @arg s1: String 1. + @type s1: str + @arg s2: String 2. + @type s2: str + + @returns: The end locations and the length of the longest common substring. + @rtype: tuple(int, int, int) """ len_s1 = len(s1) @@ -47,8 +76,26 @@ def LongestCommonSubstring(s1, s2) : return x_longest, y_longest, longest #LongestCommonSubstring -def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end) : +def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : """ + Give the HGVS description of the change from {s1} to {s2} in the range + {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. + + arg s1: Sequence 1. + type s1: str + arg s2: Sequence 2. + type s2: str + arg s1_start: Start of the range on {s1}. + type s1_start: int + arg s1_end: End of the range on {s1}. + type s1_end: int + arg s2_start: Start of the range on {s2}. + type s2_start: int + arg s2_end: End of the range on {s2}. + type s2_end: int + + @returns: HGVS description. + @rval: str """ # Nothing happened. @@ -68,11 +115,22 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end) : if s2_start - ins_length >= 0 and \ s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end] : - if ins_length == 1 : - return "%idup" % (s1_start) - return "%i_%idup" % (s1_start - ins_length + 1, s1_end) + if ins_length == 1 : # FIXME we forgot the ins of length 1 + hgvs = "%idup" % (s1_start) + allele.append(RawVar(start = s1_start, + type = "dup", hgvs = hgvs)) + return hgvs + #if + + hgvs = "%i_%idup" % (s1_start - ins_length + 1, s1_end) + allele.append(RawVar(start = s1_start - ins_length + 1, + end = s1_end, type = "dup", hgvs = hgvs)) + return hgvs #if - return "%i_%iins%s" % (s1_start, s1_start + 1, s2[s2_start:s2_end]) + hgvs = "%i_%iins%s" % (s1_start, s1_start + 1, s2[s2_start:s2_end]) + allele.append(RawVar(start = s1_start, end = s1_start + 1, + inserted = s2[s2_start:s2_end], type = "ins", hgvs = hgvs)) + return hgvs #if # Deletion. @@ -80,17 +138,32 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end) : dummy, shift = roll(s1, s1_start + 1, s1_end) if s1_start + 1 == s1_end : - return "%idel" % (s1_start + shift + 1) - return "%i_%idel" % (s1_start + shift + 1, s1_end + shift) + hgvs = "%idel" % (s1_start + shift + 1) + allele.append(RawVar(start = s1_start + shift + 1, type = "del", + hgvs = hgvs)) + return hgvs + #if + hgvs = "%i_%idel" % (s1_start + shift + 1, s1_end + shift) + allele.append(RawVar(start = s1_start + shift + 1, + s1_end = s1_end + shift, type = "del", hgvs = hgvs)) + return hgvs #if # Substitution. if s1_start + 1 == s1_end and s2_start + 1 == s2_end : - return "%i%s>%s" % (s1_start + 1, s1[s1_start], s2[s2_start]) + hgvs = "%i%s>%s" % (s1_start + 1, s1[s1_start], s2[s2_start]) + allele.append(RawVar(start = s1_start + 1, deleted = s1[s1_start], + inserted = s2[s2_start], type = "subst", hgvs = hgvs)) + return hgvs + #if # Simple InDel. if s1_start + 1 == s1_end : - return "%idelins%s" % (s1_start + 1, s2[s2_start:s2_end]) + hgvs = "%idelins%s" % (s1_start + 1, s2[s2_start:s2_end]) + allele.append(RawVar(start = s1_start + 1, + inserted = s2[s2_start:s2_end], type = "delins", hgvs = hgvs)) + return hgvs + #if # At this stage, we either have an inversion, an indel or a Compound # variant. @@ -105,69 +178,76 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end) : lcs_r_len = 0 # s1_end_r and s2_end_r should not be used after this. # Inversion or Compound variant. - if max(lcs_f_len, lcs_r_len) > 3 : # TODO: This is not a good criterium. + default = "%i_%idelins%s" % (s1_start + 1, s1_end, s2[s2_start:s2_end]) - # Inversion. - if lcs_f_len <= lcs_r_len : + if not max(lcs_f_len, lcs_r_len) : # Optimisation, not really needed. + allele.append(RawVar(start = s1_start + 1, end = s1_end, + inserted = s2[s2_start:s2_end], type = "delins", hgvs = hgvs)) + return default - if trim > 0 : # Partial palindrome. - s1_end_r -= trim - s2_end_r -= trim - lcs_r_len -= 2 * trim - #if + # Inversion. + if lcs_f_len <= lcs_r_len : - # Simple Inversion. - if s2_end - s2_start == lcs_r_len and \ - s1_end - s1_start == lcs_r_len : - return "%i_%iinv" % (s1_start + 1, s1_end) - - r1_len = s1_end_r - lcs_r_len - r2_len = s1_end - s1_start - s1_end_r - m1_len = s2_end_r - lcs_r_len - m2_len = s2_end - s2_start - s2_end_r - - # The flanks of the inversion (but not both) can be empty, so we - # generate descriptions conditionally. - leftVariant = "" - rightVariant = "" - if r1_len or m2_len : - lcs = len(longest_common_suffix(s1[s1_start:s1_start + r1_len], - s2[s2_start:s2_start + m2_len])) - leftVariant = "%s;" % DNA_description(s1, s2, - s1_start, s1_start + r1_len - lcs, - s2_start, s2_start + m2_len - lcs) - #if - if r2_len or m1_len : - lcp = len(longest_common_prefix(s1[s1_end - r2_len:s1_end], - s2[s2_end - m1_len:s2_end])) - rightVariant = ";%s" % DNA_description(s1, s2, - s1_end - r2_len + lcp, s1_end, - s2_end - m1_len + lcp, s2_end) - #if + if trim > 0 : # Partial palindrome. + s1_end_r -= trim + s2_end_r -= trim + lcs_r_len -= 2 * trim + #if + + # Simple Inversion. + if s2_end - s2_start == lcs_r_len and s1_end - s1_start == lcs_r_len : + hgvs = "%i_%iinv" % (s1_start + 1, s1_end) + allele.append(RawVar(start = s1_start + 1, end = s1_end, + type = "inv", hgvs = hgvs)) + return hgvs + #if - return "%s%i_%iinv%s" % (leftVariant, - s1_start + r1_len + 1, s1_end - r2_len, rightVariant) + r1_len = s1_end_r - lcs_r_len + r2_len = s1_end - s1_start - s1_end_r + m1_len = s2_end_r - lcs_r_len + m2_len = s2_end - s2_start - s2_end_r + + # The flanks of the inversion (but not both) can be empty, so we + # generate descriptions conditionally. + leftVariant = "" + rightVariant = "" + if r1_len or m2_len : + lcs = len(longest_common_suffix(s1[s1_start:s1_start + r1_len], + s2[s2_start:s2_start + m2_len])) + leftVariant = "%s;" % DNA_description(s1, s2, + s1_start, s1_start + r1_len - lcs, + s2_start, s2_start + m2_len - lcs, allele) + #if + if r2_len or m1_len : + lcp = len(longest_common_prefix(s1[s1_end - r2_len:s1_end], + s2[s2_end - m1_len:s2_end])) + rightVariant = ";%s" % DNA_description(s1, s2, + s1_end - r2_len + lcp, s1_end, s2_end - m1_len + lcp, s2_end, + allele) #if - # Compound variant. - else : - # Determine the position of the longest common substring in both - # the reference and the mutated sequence. - r1_len = s1_end_f - lcs_f_len - r2_len = s1_end - s1_start - s1_end_f - m1_len = s2_end_f - lcs_f_len - m2_len = s2_end - s2_start - s2_end_f - - return "%s;%s" % ( - DNA_description(s1, s2, - s1_start, s1_start + r1_len, s2_start, s2_start + m1_len), - DNA_description(s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end)) - #else + partial = "%s%i_%iinv%s" % (leftVariant, s1_start + r1_len + 1, + s1_end - r2_len, rightVariant) #if - # Default InDel. - return "%i_%idelins%s" % (s1_start + 1, s1_end, s2[s2_start:s2_end]) + # Compound variant. + else : + r1_len = s1_end_f - lcs_f_len + r2_len = s1_end - s1_start - s1_end_f + m1_len = s2_end_f - lcs_f_len + m2_len = s2_end - s2_start - s2_end_f + + partial = "%s;%s" % ( + DNA_description(s1, s2, + s1_start, s1_start + r1_len, s2_start, s2_start + m1_len, + allele), + DNA_description(s1, s2, + s1_end - r2_len, s1_end, s2_end - m2_len, s2_end, allele)) + #else + + if len(partial) <= len(default) : + return partial + return default #DNA_description def describe(description) : @@ -183,13 +263,18 @@ def describe(description) : s1_end = len(s1) - lcs s2_end = len(s2) - lcs - for i in result.rawVariants.RawVariant : - print i.description - print i.visualisation - print - #for + if result.rawVariants : + for i in result.rawVariants.RawVariant : + print i.description + print i.visualisation + print + #for print(result.genomicDescription) - print(DNA_description(s1, s2, lcp, s1_end, lcp, s2_end)) + + allele = [] + print(DNA_description(s1, s2, lcp, s1_end, lcp, s2_end, allele)) + for i in allele : + print i.hgvs #describe def main() : -- GitLab