Commit 28668cf5 authored by J.K. Vis's avatar J.K. Vis
Browse files

Recursive frame shift extraction

parent a5071bc7
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.1.11
// Date: 2015/02/16
// Revision: 2.2.0
// Date: 2015/03/10
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.1.9
// Date: 2015/02/16
// Revision: 2.2.0
// Date: 2015/03/10
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.1.9";
static char const* const VERSION = "2.2.0";
// The character type used for all strings. For now it should just be
......@@ -202,7 +202,7 @@ Variant_List extract(char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
int const type = TYPE_DNA,
int const type = TYPE_DNA,
char_t const* const codon_string = 0);
// *******************************************************************
......@@ -230,7 +230,7 @@ size_t extract(std::vector<Variant> &variant,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
int const type = TYPE_DNA,
int const type = TYPE_DNA,
char_t const* const codon_string = 0);
// *******************************************************************
......@@ -597,14 +597,6 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
size_t const sample_start,
size_t const sample_end);
size_t annotate_frame_inversion(std::vector<Variant> &variant,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
#if defined(__debug__)
// *******************************************************************
......
......@@ -8,6 +8,9 @@ typedef char char_t;
typedef unsigned char uint8_t;
typedef unsigned long long uint64_t;
static unsigned int const IDENTITY = 0x01;
static unsigned int const FRAME_SHIFT = 0x20;
static uint8_t const FRAME_SHIFT_NONE = 0x00;
static uint8_t const FRAME_SHIFT_1 = 0x01;
static uint8_t const FRAME_SHIFT_2 = 0x02;
......@@ -19,25 +22,68 @@ char_t const* const CODON_STRING = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGG
static uint8_t frame_shift_map[128][128][128] = {{{FRAME_SHIFT_NONE}}};
struct FS_Substring
struct Variant
{
size_t reference_start;
size_t reference_end;
size_t sample_start;
size_t sample_end;
unsigned int type;
size_t weight;
size_t transposition_start;
size_t transposition_end;
inline Variant(size_t const reference_start,
size_t const reference_end,
size_t const sample_start,
size_t const sample_end,
unsigned int const type = IDENTITY,
size_t const weight = 0,
size_t const transposition_start = 0,
size_t const transposition_end = 0):
reference_start(reference_start),
reference_end(reference_end),
sample_start(sample_start),
sample_end(sample_end),
type(type),
weight(weight),
transposition_start(transposition_start),
transposition_end(transposition_end) { }
inline Variant(void) { }
}; // Variant
struct Substring
{
size_t reference_index;
size_t sample_index;
size_t length;
uint8_t type;
union
{
bool reverse_complement;
uint8_t type;
}; // union
inline FS_Substring(size_t const reference_index,
size_t const sample_index,
size_t const length,
uint8_t const type):
inline Substring(size_t const reference_index,
size_t const sample_index,
size_t const length,
bool const reverse_complement = false):
reference_index(reference_index),
sample_index(sample_index),
length(length),
type(type) { }
reverse_complement(reverse_complement) { }
inline FS_Substring(void): length(0) { }
}; // FS_Substring
inline Substring(size_t const reference_index,
size_t const sample_index,
size_t const length,
uint8_t const type):
reference_index(reference_index),
sample_index(sample_index),
length(length),
type(type) { }
inline Substring(void): length(0) { }
}; // Substring
uint8_t calculate_frame_shift(uint64_t const acid_map[],
size_t const reference_1,
......@@ -127,13 +173,13 @@ uint8_t frame_shift(char_t const reference_1,
return frame_shift_map[reference_1 & 0x7f][reference_2 & 0x7f][sample & 0x7f];
} // frame_shift
size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
size_t lcs_frame_shift(std::vector<Substring> &substring,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
{
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
......@@ -153,14 +199,14 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
lcs[1][i][4] = 0;
} // for
FS_Substring fs_substring[5];
Substring fs_substring[5];
for (size_t i = 0; i < sample_length; ++i)
{
uint8_t const shift_reverse = frame_shift(reference[reference_end - 1], reference[reference_end - 2], sample[sample_start + i]);
if ((shift_reverse & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
{
lcs[i % 2][0][2] = 1;
fs_substring[2] = FS_Substring(reference_start, sample_start + i, lcs[i % 2][0][2], FRAME_SHIFT_REVERSE);
fs_substring[2] = Substring(reference_start, sample_start + i, lcs[i % 2][0][2], FRAME_SHIFT_REVERSE);
} // if
for (size_t j = 1; j < reference_length; ++j)
{
......@@ -188,27 +234,27 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
} // if
if (lcs[i % 2][j][0] > fs_substring[0].length)
{
fs_substring[0] = FS_Substring(reference_start + j - lcs[i % 2][j][0], sample_start + i - lcs[i % 2][j][0] + 1, lcs[i % 2][j][0], FRAME_SHIFT_1);
fs_substring[0] = Substring(reference_start + j - lcs[i % 2][j][0], sample_start + i - lcs[i % 2][j][0] + 1, lcs[i % 2][j][0], FRAME_SHIFT_1);
} // if
if (lcs[i % 2][j][1] > fs_substring[1].length)
{
fs_substring[1] = FS_Substring(reference_start + j - lcs[i % 2][j][1], sample_start + i - lcs[i % 2][j][1] + 1, lcs[i % 2][j][1], FRAME_SHIFT_2);
fs_substring[1] = Substring(reference_start + j - lcs[i % 2][j][1], sample_start + i - lcs[i % 2][j][1] + 1, lcs[i % 2][j][1], FRAME_SHIFT_2);
} // if
if (lcs[i % 2][j][2] > fs_substring[2].length)
{
fs_substring[2] = FS_Substring(reference_start + j - lcs[i % 2][j][2], sample_start + i - lcs[i % 2][j][2], lcs[i % 2][j][2], FRAME_SHIFT_REVERSE);
fs_substring[2] = Substring(reference_start + j - lcs[i % 2][j][2], sample_start + i - lcs[i % 2][j][2], lcs[i % 2][j][2], FRAME_SHIFT_REVERSE);
} // if
if (lcs[i % 2][j][3] > fs_substring[3].length)
{
fs_substring[3] = FS_Substring(reference_start + j - lcs[i % 2][j][3], sample_start + i - lcs[i % 2][j][3] + 1, lcs[i % 2][j][3], FRAME_SHIFT_REVERSE_1);
fs_substring[3] = Substring(reference_start + j - lcs[i % 2][j][3], sample_start + i - lcs[i % 2][j][3] + 1, lcs[i % 2][j][3], FRAME_SHIFT_REVERSE_1);
} // if
if (lcs[i % 2][j][4] > fs_substring[4].length)
{
fs_substring[4] = FS_Substring(reference_start + j - lcs[i % 2][j][4], sample_start + i - lcs[i % 2][j][4] + 1, lcs[i % 2][j][4], FRAME_SHIFT_REVERSE_2);
fs_substring[4] = Substring(reference_start + j - lcs[i % 2][j][4], sample_start + i - lcs[i % 2][j][4] + 1, lcs[i % 2][j][4], FRAME_SHIFT_REVERSE_2);
} // if
} // for
} // for
substring = std::vector<FS_Substring>(1, fs_substring[0]);
substring = std::vector<Substring>(1, fs_substring[0]);
substring.push_back(fs_substring[1]);
substring.push_back(fs_substring[2]);
substring.push_back(fs_substring[3]);
......@@ -216,69 +262,89 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
return 0;
} // lcs_frame_shift
FS_Substring maximize_frame_shift(std::vector<FS_Substring> const &substring)
size_t extractor_frame_shift(std::vector<Variant> &annotation,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
{
FS_Substring fs_substring;
size_t max_length = 0;
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
// First the base cases to end the recursion.
if (reference_length <= 0 && sample_length <= 0)
{
return 0;
} // if
// Calculate the frame shift LCS of the two strings.
std::vector<Substring> substring;
lcs_frame_shift(substring, reference, reference_start, reference_end, sample, sample_start, sample_end);
// Pick the ``best fitting'' frame shift LCS, i.e., pushed as far to
// the start of the reference string as possible. Also update in
// case of compound frame shifts.
Substring lcs(0, 0, 0, FRAME_SHIFT_NONE);
for (size_t i = 0; i < 5; ++i)
{
if (substring[i].length > max_length)
if (substring[i].length > lcs.length ||
(substring[i].length == lcs.length && substring[i].reference_index < lcs.reference_index))
{
max_length = substring[i].length;
fs_substring = substring[i];
lcs = substring[i];
} // if
else if (substring[i].length == max_length)
// Update compound frame shifts, e.g.,
// FRAME_SHIFT_1 | FRAME_SHIFT_2
else if (substring[i].length == lcs.length &&
substring[i].reference_index == lcs.reference_index &&
substring[i].sample_index == lcs.sample_index)
{
if ((fs_substring.type == FRAME_SHIFT_1 && substring[i].type == FRAME_SHIFT_2) ||
(fs_substring.type == FRAME_SHIFT_2 && substring[i].type == FRAME_SHIFT_1) ||
(fs_substring.type == FRAME_SHIFT_REVERSE_1 && substring[i].type == FRAME_SHIFT_REVERSE_2) ||
(fs_substring.type == FRAME_SHIFT_REVERSE_2 && substring[i].type == FRAME_SHIFT_REVERSE_1))
{
fs_substring.type |= substring[i].type;
} // if
lcs.type |= substring[i].type;
} // if
} // for
return fs_substring;
} // maximize_frame_shift
// No LCS found: no frame shift annotation.
if (lcs.length <= 0)
{
return 0;
} // if
// Recursively apply this function to the prefixes of the strings.
std::vector<Variant> prefix;
extractor_frame_shift(prefix, reference, reference_start, lcs.reference_index, sample, sample_start, lcs.sample_index);
// Recursively apply this function to the suffixes of the strings.
std::vector<Variant> suffix;
extractor_frame_shift(suffix, reference, lcs.reference_index + lcs.length, reference_end, sample, lcs.sample_index + lcs.length, sample_end);
// Add all variants (in order) to the annotation vector.
annotation.insert(annotation.end(), prefix.begin(), prefix.end());
annotation.push_back(Variant(lcs.reference_index, lcs.reference_index + lcs.length, lcs.sample_index, lcs.sample_index + lcs.length, FRAME_SHIFT | lcs.type));
annotation.insert(annotation.end(), suffix.begin(), suffix.end());
return 0;
} // extractor_frame_shift
int main(int, char* [])
{
initialize_frame_shift_map(CODON_STRING);
char_t const* const reference = "MDYSL";
size_t const start = 1;
size_t const end = 5;
std::vector<FS_Substring> substring;
FS_Substring fs_lcs;
lcs_frame_shift(substring, reference, start, end, "MALFP", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MATIP", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MTIPW", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MLFPG", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MQGIV", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MAGNS", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MDRE*", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MRE*S", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
lcs_frame_shift(substring, reference, start, end, "MGNSP", start, end);
fs_lcs = maximize_frame_shift(substring);
printf("fs_lcs: %d @ %ld, %ld length = %ld\n", fs_lcs.type, fs_lcs.reference_index, fs_lcs.sample_index, fs_lcs.length);
std::vector<Variant> annotation;
extractor_frame_shift(annotation, "MDYSLAAALTLHGH", 1, 11, "MTIPWRSPHFHGH", 1, 10);
// Printing the variants.
fprintf(stdout, "Annotation (%ld):\n", annotation.size());
for (std::vector<Variant>::iterator it = annotation.begin(); it != annotation.end(); ++it)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
} // for
return 0;
} // main
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment