From f7108427da6e470f0dd0ecc46a9ab07c698ca750 Mon Sep 17 00:00:00 2001 From: "J.F.J. Laros" <j.f.j.laros@lumc.nl> Date: Sun, 29 Jan 2012 19:02:09 +0000 Subject: [PATCH] Finished generating the allele description. git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@465 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1 --- extras/soap-tools/describe.py | 91 +++++++++++++++++------------------ 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/extras/soap-tools/describe.py b/extras/soap-tools/describe.py index c6862b97..2480c49e 100644 --- a/extras/soap-tools/describe.py +++ b/extras/soap-tools/describe.py @@ -76,7 +76,7 @@ def LongestCommonSubstring(s1, s2) : return x_longest, y_longest, longest #LongestCommonSubstring -def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : +def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end) : """ Give the HGVS description of the change from {s1} to {s2} in the range {s1_start}..{s1_end} on {s1} and {s2_start}..{s2_end} on {s2}. @@ -100,13 +100,16 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : # Nothing happened. if s1 == s2: - return "=" + return "=", None # Insertion / Duplication. if s1_start == s1_end : ins_length = s2_end - s2_start dummy, shift = roll(s2, s2_start + 1, s2_end + 1) + # FIXME This needs to be checked. + # NM_002001.2:c.2_3insAT goes wrong. + # NM_002001.2:c.3dup too. s1_start += shift + 1 s1_end += shift + 1 s2_start += shift + 1 @@ -115,22 +118,19 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : if s2_start - ins_length >= 0 and \ s1[s1_start - ins_length:s1_start] == s2[s2_start:s2_end] : - if ins_length == 1 : # FIXME we forgot the ins of length 1 + if ins_length == 1 : hgvs = "%idup" % (s1_start) - allele.append(RawVar(start = s1_start, - type = "dup", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start, type = "dup", + hgvs = hgvs)] #if hgvs = "%i_%idup" % (s1_start - ins_length + 1, s1_end) - allele.append(RawVar(start = s1_start - ins_length + 1, - end = s1_end, type = "dup", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start - ins_length + 1, + end = s1_end, type = "dup", hgvs = hgvs)] #if hgvs = "%i_%iins%s" % (s1_start, s1_start + 1, s2[s2_start:s2_end]) - allele.append(RawVar(start = s1_start, end = s1_start + 1, - inserted = s2[s2_start:s2_end], type = "ins", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start, end = s1_start + 1, + inserted = s2[s2_start:s2_end], type = "ins", hgvs = hgvs)] #if # Deletion. @@ -139,30 +139,26 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : if s1_start + 1 == s1_end : hgvs = "%idel" % (s1_start + shift + 1) - allele.append(RawVar(start = s1_start + shift + 1, type = "del", - hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start + shift + 1, type = "del", + hgvs = hgvs)] #if hgvs = "%i_%idel" % (s1_start + shift + 1, s1_end + shift) - allele.append(RawVar(start = s1_start + shift + 1, - s1_end = s1_end + shift, type = "del", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start + shift + 1, + s1_end = s1_end + shift, type = "del", hgvs = hgvs)] #if # Substitution. if s1_start + 1 == s1_end and s2_start + 1 == s2_end : hgvs = "%i%s>%s" % (s1_start + 1, s1[s1_start], s2[s2_start]) - allele.append(RawVar(start = s1_start + 1, deleted = s1[s1_start], - inserted = s2[s2_start], type = "subst", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start + 1, deleted = s1[s1_start], + inserted = s2[s2_start], type = "subst", hgvs = hgvs)] #if # Simple InDel. if s1_start + 1 == s1_end : hgvs = "%idelins%s" % (s1_start + 1, s2[s2_start:s2_end]) - allele.append(RawVar(start = s1_start + 1, - inserted = s2[s2_start:s2_end], type = "delins", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start + 1, + inserted = s2[s2_start:s2_end], type = "delins", hgvs = hgvs)] #if # At this stage, we either have an inversion, an indel or a Compound @@ -181,9 +177,8 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : default = "%i_%idelins%s" % (s1_start + 1, s1_end, s2[s2_start:s2_end]) if not max(lcs_f_len, lcs_r_len) : # Optimisation, not really needed. - allele.append(RawVar(start = s1_start + 1, end = s1_end, - inserted = s2[s2_start:s2_end], type = "delins", hgvs = hgvs)) - return default + return default, [RawVar(start = s1_start + 1, end = s1_end, + inserted = s2[s2_start:s2_end], type = "delins", hgvs = default)] # Inversion. if lcs_f_len <= lcs_r_len : @@ -197,9 +192,8 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : # Simple Inversion. if s2_end - s2_start == lcs_r_len and s1_end - s1_start == lcs_r_len : hgvs = "%i_%iinv" % (s1_start + 1, s1_end) - allele.append(RawVar(start = s1_start + 1, end = s1_end, - type = "inv", hgvs = hgvs)) - return hgvs + return hgvs, [RawVar(start = s1_start + 1, end = s1_end, + type = "inv", hgvs = hgvs)] #if r1_len = s1_end_r - lcs_r_len @@ -214,20 +208,23 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : if r1_len or m2_len : lcs = len(longest_common_suffix(s1[s1_start:s1_start + r1_len], s2[s2_start:s2_start + m2_len])) - leftVariant = "%s;" % DNA_description(s1, s2, + leftHgvs, leftRv = DNA_description(s1, s2, s1_start, s1_start + r1_len - lcs, - s2_start, s2_start + m2_len - lcs, allele) + s2_start, s2_start + m2_len - lcs) + leftVariant = "%s;" % leftHgvs #if if r2_len or m1_len : lcp = len(longest_common_prefix(s1[s1_end - r2_len:s1_end], s2[s2_end - m1_len:s2_end])) - rightVariant = ";%s" % DNA_description(s1, s2, - s1_end - r2_len + lcp, s1_end, s2_end - m1_len + lcp, s2_end, - allele) + rightHgvs, rightRv = DNA_description(s1, s2, + s1_end - r2_len + lcp, s1_end, s2_end - m1_len + lcp, s2_end) + rightVariant = ";%s" % rightHgvs #if partial = "%s%i_%iinv%s" % (leftVariant, s1_start + r1_len + 1, s1_end - r2_len, rightVariant) + partialRv = leftRv + [RawVar(start = s1_start + r1_len + 1, + end = s1_end - r2_len, type = "inv", hgvs = hgvs)] + rightRv #if # Compound variant. @@ -237,17 +234,18 @@ def DNA_description(s1, s2, s1_start, s1_end, s2_start, s2_end, allele) : m1_len = s2_end_f - lcs_f_len m2_len = s2_end - s2_start - s2_end_f - partial = "%s;%s" % ( - DNA_description(s1, s2, - s1_start, s1_start + r1_len, s2_start, s2_start + m1_len, - allele), - DNA_description(s1, s2, - s1_end - r2_len, s1_end, s2_end - m2_len, s2_end, allele)) + leftHgvs, leftRv = DNA_description(s1, s2, + s1_start, s1_start + r1_len, s2_start, s2_start + m1_len) + rightHgvs, rightRv = DNA_description(s1, s2, + s1_end - r2_len, s1_end, s2_end - m2_len, s2_end) + partial = "%s;%s" % (leftHgvs, rightHgvs) + partialRv = leftRv + rightRv #else if len(partial) <= len(default) : - return partial - return default + return partial, partialRv + return default, [RawVar(start = s1_start + 1, end = s1_end, + inserted = s2[s2_start:s2_end], type = "delins", hgvs = default)] #DNA_description def describe(description) : @@ -271,10 +269,11 @@ def describe(description) : #for print(result.genomicDescription) - allele = [] - print(DNA_description(s1, s2, lcp, s1_end, lcp, s2_end, allele)) + hgvs, allele = DNA_description(s1, s2, lcp, s1_end, lcp, s2_end) + print(hgvs) for i in allele : - print i.hgvs + print(i.type) + print(';'.join(map(lambda x : x.hgvs, allele))) #describe def main() : -- GitLab