Commit 82e1ba61 authored by jkvis's avatar jkvis

Added repeat extractor

parent f8c3a266
......@@ -26,4 +26,5 @@ __homepage__ = 'https://github.com/mutalyzer/description-extractor'
describe_dna = describe.describe_dna
describe_protein = describe.describe_protein
describe_repeats = describe.describe_repeats
extract = extractor.extract
......@@ -26,7 +26,7 @@ using namespace std;
// Entry point.
int main(int argc, char* argv[])
{
/*
if (argc < 3)
{
fprintf(stderr, "usage: %s reference sample\n", argv[0]);
......@@ -64,42 +64,53 @@ int main(int argc, char* argv[])
size_t const alt_length = fread(sample, sizeof(char_t), sample_length, file);
static_cast<void>(alt_length);
fclose(file);
*/
string header[4305];
string protein[4305];
/*
int const N = 359;
string header[N];
string protein[N];
for (int i = 0; i < 4305; ++i)
fprintf(stdout, "\t");
for (int i = 0; i < N; ++i)
{
getline(cin, header[i]);
getline(cin, protein[i]);
cin >> header[i] >> protein[i];
fprintf(stdout, "%s\t", header[i].c_str());
} // for
fprintf(stdout, "\n");
for (int i = 0; i < 4305; ++i)
for (int i = 0; i < N; ++i)
{
cerr << i << endl;
double best = 1.f;
for (int j = i + 1; j < 4305; ++j)
fprintf(stdout, "%s\t", header[i].c_str());
for (int j = 0; j < N; ++j)
{
vector<Variant> variant;
extract(variant, protein[i].c_str(), protein[i].length(), protein[j].c_str(), protein[j].length(), TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
if (i == j)
{
if (it->type >= FRAME_SHIFT && best > it->probability)
fprintf(stdout, "%.10e\t", 0.);
continue;
} // if
vector<Variant> variants;
extract(variants, protein[i].c_str(), protein[i].length(), protein[j].c_str(), protein[j].length(), TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
size_t best = 0;
for (std::vector<Variant>::iterator it = variants.begin(); it != variants.end(); ++it)
{
if (it->type >= FRAME_SHIFT && it->reference_end - it->reference_start > best)
{
best = it->probability;
fprintf(stdout, "%.9s %.9s %ld--%ld, %ld--%ld, %d, %.10e\n", header[i].c_str(), header[j].c_str(), it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, 1.f - it->probability);
best = it->reference_end - it->reference_start;
} // if
} // for
size_t const length = min(protein[i].length(), protein[j].length());
fprintf(stdout, "%.10e\t", 1. - static_cast<double>(best) / static_cast<double>(length));
} // for
fprintf(stdout, "\n");
} // for
*/
/*
// The actual extraction.
std::vector<Variant> variant;
size_t const weight = extract(variant, reference, reference_length - 1, sample, sample_length - 1, TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
size_t const weight = extract(variant, reference, reference_length - 1, sample, sample_length - 1, TYPE_DNA);
// Printing the variants.
......@@ -109,6 +120,18 @@ int main(int argc, char* argv[])
if (it->type >= FRAME_SHIFT)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %lf, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, 1.f - it->probability, it->transposition_start, it->transposition_end);
char_t ref_DNA[(it->reference_end - it->reference_start) * 3];
char_t alt_DNA[(it->reference_end - it->reference_start) * 3];
backtranslation(ref_DNA, alt_DNA, reference, it->reference_start, sample, it->sample_start, it->reference_end - it->reference_start, it->type);
fprintf(stdout, "ref_DNA: ");
fwrite(ref_DNA, sizeof(char_t), (it->reference_end - it->reference_start) * 3, stdout);
fprintf(stdout, "\nref_pro: ");
fwrite(reference + it->reference_start, sizeof(char_t), (it->reference_end - it->reference_start), stdout);
fprintf(stdout , "\nalt_DNA: ");
fwrite(alt_DNA, sizeof(char_t), (it->reference_end - it->reference_start) * 3, stdout);
fprintf(stdout, "\nalt_pro: ");
fwrite(sample + it->sample_start, sizeof(char_t), (it->reference_end - it->reference_start), stdout);
fprintf(stdout , "\n");
} // if
else
{
......@@ -120,7 +143,7 @@ int main(int argc, char* argv[])
// Cleaning up.
delete[] reference;
delete[] sample;
*/
return 0;
} // main
......@@ -103,7 +103,7 @@ def var_to_dna_var(s1, s2, var, seq_list=[], weight_position=1):
:arg unicode s2: Sample sequence.
:arg str var: Variant from the extractor module.
:arg str seq_list: Container for an inserted sequence.
:arg str weight_position: Weight of a position.
:arg int weight_position: Weight of a position.
"""
# Unknown.
if s1 == '?' or s2 == '?':
......@@ -120,7 +120,10 @@ def var_to_dna_var(s1, s2, var, seq_list=[], weight_position=1):
var.sample_start += shift3
var.sample_end += shift3
if (var.sample_start - ins_length >= 0 and
'$' not in s1[var.reference_start - ins_length:var.reference_start] and
'$' not in s2[var.sample_start:var.sample_end] and
s1[var.reference_start - ins_length:var.reference_start] ==
s2[var.sample_start:var.sample_end]):
# NOTE: We may want to omit the inserted / deleted sequence and
......@@ -339,6 +342,90 @@ def describe_dna(s1, s2):
return description
def mask_string(string, units):
MASK = '$'
for unit in units:
found = string.find(unit)
if found != -1:
string = string.replace(unit, MASK * len(unit))
return string
def get_repeats(string, unit):
repeats = []
found = string.find(unit, 0)
while found != -1:
last = len(repeats) - 1
if last > -1 and repeats[last]['start'] + repeats[last]['count'] * len(unit) == found:
repeats[last]['count'] += 1
else:
repeats.append({'start': found, 'count': 1, 'unit': unit})
found = string.find(unit, found + 1)
return repeats
def describe_repeats(reference, sample, units):
masked_ref = mask_string(reference, units)
masked_alt = mask_string(sample, units)
repeats = []
for unit in units:
repeats += get_repeats(sample, unit)
repeats = sorted(repeats, key=lambda repeat: repeat['start'])
ref_swig = util.swig_str(masked_ref)
alt_swig = util.swig_str(masked_alt)
extracted = extractor.extract(ref_swig[0], ref_swig[1],
alt_swig[0], alt_swig[1], extractor.TYPE_DNA)
in_transposition = 0
index = 0
repeat = 0
description = Allele()
seq_list = ISeqList()
for variant in extracted.variants:
while variant.sample_start > index:
seq_list.append(DNAVar(type='repeat',inserted=repeats[repeat]['unit'],count=repeats[repeat]['count']))
index = repeats[repeat]['start'] + repeats[repeat]['count'] * len(repeats[repeat]['unit'])
repeat += 1
if variant.type & extractor.TRANSPOSITION_OPEN:
in_transposition += 1
if in_transposition:
if variant.type & extractor.IDENTITY:
seq_list.append(ISeq(start=variant.transposition_start + 1,
end=variant.transposition_end, reverse=False,
weight_position=extracted.weight_position))
elif variant.type & extractor.REVERSE_COMPLEMENT:
seq_list.append(ISeq(start=variant.transposition_start + 1,
end=variant.transposition_end, reverse=True,
weight_position=extracted.weight_position))
else:
seq_list.append(ISeq(
sequence=sample[variant.sample_start:variant.sample_end],
weight_position=extracted.weight_position))
elif variant.type & extractor.IDENTITY:
seq_list.append(ISeq(start=variant.reference_start + 1,
end=variant.reference_end, reverse=False,weight_position=extracted.weight_position))
elif variant.type & extractor.REVERSE_COMPLEMENT:
seq_list.append(ISeq(start=variant.reference_start + 1,
end=variant.reference_end, reverse=True,weight_position=extracted.weight_position))
else:
seq_list.append(ISeq(sequence=sample[variant.sample_start:variant.sample_end],
weight_position=extracted.weight_position))
if variant.type & extractor.TRANSPOSITION_CLOSE:
in_transposition -= 1
index = variant.sample_end
description.append(DNAVar(start=1,end=len(reference),sample_start=1,sample_end=len(sample),type='delins',inserted=seq_list))
print(description)
def print_var(variant):
print('({:3}, {:3}), ({:3}, {:3}), {:08b}, {}, {}'.format(variant.reference_start,
variant.reference_end, variant.sample_start, variant.sample_end,
......
......@@ -67,6 +67,10 @@ static uint64_t acid_map[128] = {0x0ull};
static double acid_frequency[128] = {.0f};
static double frame_shift_frequency[128][128][5] = {{{.0f}}};
// This character is always ignored when LCS matching and can be used for
// repeat masking
static char_t const MASK = '$';
// Only used to interface to Python: calls the C++ extract function.
Variant_List extract(char_t const* const reference,
size_t const reference_length,
......@@ -200,12 +204,40 @@ size_t extract(std::vector<Variant> &variant,
size_t extractor(std::vector<Variant> &variant,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
size_t reference_start,
size_t reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
size_t sample_start,
size_t sample_end)
{
// First do prefix and suffix matching on the MASK character
size_t i = 0;
while (reference_start + i < reference_end && reference[reference_start + i] == MASK)
{
++i;
} // while
reference_start += i;
i = 0;
while (reference_end - i - 1 > reference_start && reference[reference_end - i - 1] == MASK)
{
++i;
} // while
reference_end -= i;
i = 0;
while (sample_start + i < sample_end && sample[sample_start + i] == MASK)
{
++i;
} // while
sample_start += i;
i = 0;
while (sample_end - i - 1 > sample_start && sample[sample_end - i - 1] == MASK)
{
++i;
} // while
sample_end -= i;
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
......@@ -911,11 +943,11 @@ size_t LCS(std::vector<Substring> &substring,
// The initial k.
size_t k = reference_length > sample_length ? sample_length / 4 : reference_length / 4;
size_t k = reference_length > sample_length ? sample_length / 8 : reference_length / 8;
// Reduce k until the cut-off is reached.
while (k > 4 && k > cut_off)
while (k > 8 && k > cut_off)
{
......@@ -990,7 +1022,7 @@ size_t LCS_1(std::vector<Substring> &substring,
for (size_t j = 0; j < reference_length; ++j)
{
// A match
if (reference[reference_start + j] == sample[sample_start + i])
if (reference[reference_start + j] == sample[sample_start + i] && reference[reference_start + j] != MASK)
{
if (i == 0 || j == 0)
{
......@@ -1024,7 +1056,7 @@ size_t LCS_1(std::vector<Substring> &substring,
// If applicable check for a LCS in reverse complement space.
// The same code is used as before but the complement string is
// travesed backwards (towards the start).
if (complement != 0 && complement[reference_end - j - 1] == sample[sample_start + i])
if (complement != 0 && complement[reference_end - j - 1] == sample[sample_start + i] && complement[reference_end - j - 1] != MASK)
{
if (i == 0 || j == 0)
{
......@@ -1206,7 +1238,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the right.
{
size_t i = 0;
while (i <= k && it->reference_index + it->length + i < reference_end && it->sample_index + it->length + i < sample_end && reference[it->reference_index + it->length + i] == sample[it->sample_index + it->length + i])
while (i <= k && it->reference_index + it->length + i < reference_end && it->sample_index + it->length + i < sample_end && reference[it->reference_index + it->length + i] == sample[it->sample_index + it->length + i] && reference[it->reference_index + it->length + i] != MASK)
{
++i;
} // while
......@@ -1215,7 +1247,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the left.
{
size_t i = 0;
while (i <= k && it->reference_index - i - 1 >= reference_start && it->sample_index - i - 1 >= sample_start && reference[it->reference_index - i - 1] == sample[it->sample_index - i - 1])
while (i <= k && it->reference_index - i - 1 >= reference_start && it->sample_index - i - 1 >= sample_start && reference[it->reference_index - i - 1] == sample[it->sample_index - i - 1] && reference[it->reference_index - i - 1] != MASK)
{
++i;
} // while
......@@ -1232,7 +1264,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the right (sample orientation).
{
size_t i = 0;
while (i <= k && it->reference_index - i - 1 >= reference_start && it->sample_index + it->length + i < sample_end && complement[it->reference_index - i - 1] == sample[it->sample_index + it->length + i])
while (i <= k && it->reference_index - i - 1 >= reference_start && it->sample_index + it->length + i < sample_end && complement[it->reference_index - i - 1] == sample[it->sample_index + it->length + i] && complement[it->reference_index - i - 1] != MASK)
{
++i;
} // while
......@@ -1242,7 +1274,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the left (sample orientation).
{
size_t i = 0;
while (i <= k && it->reference_index + it->length + i < reference_end && it->sample_index - i - 1 >= sample_start && complement[it->reference_index + it->length + i] == sample[it->sample_index - i - 1])
while (i <= k && it->reference_index + it->length + i < reference_end && it->sample_index - i - 1 >= sample_start && complement[it->reference_index + it->length + i] == sample[it->sample_index - i - 1] && complement[it->reference_index + it->length + i] != MASK)
{
++i;
} // while
......@@ -1407,7 +1439,7 @@ bool string_match(char_t const* const string_1,
{
for (size_t i = 0; i < length; ++i)
{
if (string_1[i] != string_2[i])
if (string_1[i] != string_2[i] || string_1[i] == MASK)
{
return false;
} // if
......@@ -1425,7 +1457,7 @@ bool string_match_reverse(char_t const* const string_1,
{
for (size_t i = 0; i < length; ++i)
{
if (string_1[-i] != string_2[i])
if (string_1[-i] != string_2[i] || string_1[-i] == MASK)
{
return false;
} // if
......@@ -1445,7 +1477,7 @@ size_t prefix_match(char_t const* const reference,
// Traverse both strings towards the end as long as their characters
// are equal. Do NOT exceed the length of the strings.
while (i < reference_length && i < sample_length && reference[i] == sample[i])
while (i < reference_length && i < sample_length && reference[i] == sample[i] && reference[i] != MASK)
{
++i;
} // while
......@@ -1466,7 +1498,7 @@ size_t suffix_match(char_t const* const reference,
// Start at the end of both strings and traverse towards the start
// as long as their characters are equal. Do not exceed the length
// of the strings.
while (i < reference_length - prefix && i < sample_length - prefix && reference[reference_length - i - 1] == sample[sample_length - i - 1])
while (i < reference_length - prefix && i < sample_length - prefix && reference[reference_length - i - 1] == sample[sample_length - i - 1] && reference[reference_length - i - 1] != MASK)
{
++i;
} // while
......@@ -1506,8 +1538,8 @@ char_t const* IUPAC_complement(char_t const* const string,
return complement;
} // IUPAC_complement
void backtranslation(size_t reference_DNA[],
size_t sample_DNA[],
void backtranslation(char_t ref_DNA[],
char_t alt_DNA[],
char_t const* const reference,
size_t const reference_start,
char_t const* const sample,
......@@ -1515,11 +1547,14 @@ void backtranslation(size_t reference_DNA[],
size_t const length,
uint8_t const type)
{
for (size_t i = 0; i < 3 * length; ++ i)
size_t reference_DNA[3 * length];
size_t sample_DNA[3 * length];
for (size_t i = 0; i < 3 * length; i++)
{
reference_DNA[i] = 0;
sample_DNA[i] = 0;
} // for
for (size_t p = 0; p < length; ++p)
{
for (size_t i = 0; i < 64; ++i)
......@@ -1603,6 +1638,11 @@ void backtranslation(size_t reference_DNA[],
} // if
} // for
} // for
for (size_t i = 0; i < 3 * length; i++)
{
ref_DNA[i] = IUPAC_ALPHA[reference_DNA[i]];
alt_DNA[i] = IUPAC_ALPHA[sample_DNA[i]];
} // for
return;
} // backtranslation
......
......@@ -28,7 +28,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.3.3";
static char const* const VERSION = "2.3.4";
// The character type used for all strings. For now it should just be
......@@ -668,6 +668,16 @@ uint8_t frame_shift(char_t const reference_1,
char_t const sample);
void backtranslation(char_t reference_DNA[],
char_t sample_DNA[],
char_t const* const reference,
size_t const reference_start,
char_t const* const sample,
size_t const sample_start,
size_t const length,
uint8_t const type);
#if defined(__debug__)
// *******************************************************************
// Dprint_truncated function
......
......@@ -121,6 +121,9 @@ class ISeq(object):
if not (self.start or self.end):
return ''
if self.start == self.end:
return '{}'.format(self.start)
inverted = 'inv' if self.reverse else ''
return '{0}_{1}{2}'.format(self.start, self.end, inverted)
......@@ -180,7 +183,9 @@ class AISeq(object):
return self.sequence
if self.type == 'trans':
return '{}_{}'.format(self.start, self.end)
if self.start != self.end:
return '{}_{}'.format(self.start, self.end)
return '{}'.format(self.start)
return '{}_{}{}|{}'.format(
self.start, self.end, self.sequence, '|'.join(self.frames))
......@@ -193,7 +198,7 @@ class DNAVar(object):
"""
def __init__(
self, start=0, start_offset=0, end=0, end_offset=0, sample_start=0,
sample_start_offset=0, sample_end=0, sample_end_offset=0,
sample_start_offset=0, sample_end=0, sample_end_offset=0, count=0,
type='none', deleted=ISeqList([ISeq()]),
inserted=ISeqList([ISeq()]), shift=0, weight_position=1):
"""
......@@ -207,6 +212,7 @@ class DNAVar(object):
:arg int sample_start_offset:
:arg int sample_end: End position.
:arg int sample_end_offset:
:arg int count
:arg unicode type: Variant type.
:arg unicode deleted: Deleted part of the reference sequence.
:arg ISeqList inserted: Inserted part.
......@@ -222,6 +228,7 @@ class DNAVar(object):
self.sample_start_offset = sample_start_offset
self.sample_end = sample_end
self.sample_end_offset = sample_end_offset
self.count = count
self.type = type
self.deleted = deleted
self.inserted = inserted
......@@ -241,6 +248,9 @@ class DNAVar(object):
if self.type == 'none':
return '='
if self.count > 0:
return '{0}({1})'.format(self.inserted, self.count)
description = str(self.start)
if self.start != self.end:
......
#!/usr/bin/env python
from __future__ import unicode_literals
from extractor import *
ref = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
alt = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
units = ['CTTC']
print 'version:', extractor.VERSION
describe_repeats(ref, alt, units)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment