Commit a68401ab authored by J.K. Vis's avatar J.K. Vis
Browse files

Frame shift refactoring

parent 99b520a2
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.1.10
// Date: 2015/01/14
// Revision: 2.1.11
// Date: 2015/02/16
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
......@@ -152,7 +152,9 @@ size_t extract(std::vector<Variant> &variant,
{
if (it->type == SUBSTITUTION)
{
annotate_frame_shift(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
annotate_frame_inversion(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
//annotate_frame_shift(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
} // if
} // for
......@@ -1304,7 +1306,7 @@ char_t const* IUPAC_complement(char_t const* const string,
return complement;
} // IUPAC_complement
// This function calculates the frame shift A reference amino acid is
// This function calculates the frame shift. A reference amino acid is
// checked against two possible partial overlaps between every
// combination of two sample (observed) amino acids.
unsigned int frame_shift(char_t const reference, char_t const sample_1, char_t const sample_2)
......@@ -1393,6 +1395,27 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
return variant.size();
} // annotate_frame_shift
size_t annotate_frame_inversion(std::vector<Variant> &variant,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
{
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
size_t const length = (reference_length <= sample_length ? reference_length : sample_length) - 1;
#if defined(__debug__)
fprintf(stderr, "Frame inversion detection (%ld)\n", length);
#endif
return 0;
} // annotate_frame_inversion
#if defined(__debug__)
// Debug function for printing large strings in truncated form: a
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.1.8
// Date: 2015/01/12
// Revision: 2.1.9
// Date: 2015/02/16
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.1.8";
static char const* const VERSION = "2.1.9";
// The character type used for all strings. For now it should just be
......@@ -597,6 +597,14 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
size_t const sample_start,
size_t const sample_end);
size_t annotate_frame_inversion(std::vector<Variant> &variant,
char_t const* const reference,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
#if defined(__debug__)
// *******************************************************************
......
......@@ -8,186 +8,106 @@
// FILE INFORMATION:
// File: sandbox.cc
// Author: Jonathan K. Vis
// Revision: 1.1.5
// Date: 2015/02/02
// Revision: 1.0.0
// Date: 2015/03/02
// *******************************************************************
// DESCRIPTION:
// General testing playground: automatic tandem repeat detection
// based on run length encoding with variable run lengths.
// General testing playground: protein frame shift construction code.
// *******************************************************************
#include <cstdlib>
#include <vector>
#include <cstdio>
#include <cstdlib>
typedef char char_t;
static size_t const THRESHOLD = 10000;
struct Repeat
{
inline Repeat(size_t const start,
size_t const end,
size_t const count = 0):
start(start), end(end), count(count) { }
#include <map>
inline Repeat(void) { }
static unsigned int const FRAME_SHIFT_0 = 0x00;
static unsigned int const FRAME_SHIFT_1 = 0x01;
static unsigned int const FRAME_SHIFT_2 = 0x02;
static unsigned int const FRAME_SHIFT_REVERSE_IDEM = 0x04;
static unsigned int const FRAME_SHIFT_REVERSE_1 = 0x08;
static unsigned int const FRAME_SHIFT_REVERSE_2 = 0x10;
size_t start;
size_t end;
size_t count;
}; // Repeat
static char_t const* const codon_string = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
bool string_match(char_t const* const string_1,
char_t const* const string_2,
size_t const length)
static char_t const IUPAC_BASE[4] =
{
for (size_t i = 0; i < length; ++i)
{
if (string_1[i] != string_2[i])
{
return false;
} // if
} // for
return true;
} // string_match
'A',
'C',
'G',
'T'
}; // IUPAC_BASE
char_t codon_table[64] = {'\0'};
std::multimap<char_t const, unsigned int const> codon_map;
void tandem_repeat_annotation(std::vector<Repeat> &repeat,
char_t const* const string,
size_t const start,
size_t const end)
unsigned int frame_shift(char_t const reference, char_t const sample_1, char_t const sample_2)
{
size_t const length = end - start;
size_t const k_max = length > THRESHOLD ? THRESHOLD / 2 : length / 2 + 1;
std::pair<std::multimap<char_t const, unsigned int const>::const_iterator, std::multimap<char_t const, unsigned int const>::const_iterator> const range_1 = codon_map.equal_range(sample_1);
std::pair<std::multimap<char_t const, unsigned int const>::const_iterator, std::multimap<char_t const, unsigned int const>::const_iterator> const range_2 = codon_map.equal_range(sample_2);
size_t i = 0;
size_t last_repeat = i;
while (i < length)
unsigned int shift = FRAME_SHIFT_0;
for (std::multimap<char_t const, unsigned int const>::const_iterator it_1 = range_1.first; it_1 != range_1.second; ++it_1)
{
size_t max_count = 0;
size_t max_k = 1;
for (size_t k = 1; k < k_max; ++k)
for (std::multimap<char_t const, unsigned int const>::const_iterator it_2 = range_2.first; it_2 != range_2.second; ++it_2)
{
size_t count = 0;
for (size_t j = i + k; j < length - k + 1; j += k)
const unsigned int codon_1 = ((it_1->second << 0x2) | (it_2->second >> 0x4)) & 0x3f;
const unsigned int codon_2 = ((it_1->second << 0x4) | (it_2->second >> 0x2)) & 0x3f;
if (reference == codon_table[codon_1])
{
if (!string_match(string + start + i, string + start + j, k))
{
break;
} // if
++count;
} // for
if (count > 0 && count >= max_count)
shift |= FRAME_SHIFT_1;
} // if
if (reference == codon_table[codon_2])
{
max_count = count;
max_k = k;
shift |= FRAME_SHIFT_2;
} // if
} // for
if (max_count > 0)
{
if (last_repeat < i)
if (shift == (FRAME_SHIFT_1 | FRAME_SHIFT_2))
{
repeat.push_back(Repeat(start + last_repeat, start + i));
return shift;
} // if
repeat.push_back(Repeat(start + i, start + i + max_k, max_count));
last_repeat = i + max_k * (max_count + 1);
} // if
i += max_k * (max_count + 1);
} // while
if (last_repeat < i)
{
repeat.push_back(Repeat(start + last_repeat, start + i));
} // if
return;
} // tandem_repeat_annotation
printf("%d %d -> %d %d\n", it_1->second, it_2->second, codon_1, codon_2);
} // for
} // for
return shift;
} // frame_shift
int main(int argc, char* argv[])
void print_codon(unsigned int const index)
{
if (argc < 2)
{
fprintf(stderr, "usage: %s string [start] [end]\n", argv[0]);
return 1;
} // if
fprintf(stderr, "Tandem Repeat Annotator\n");
printf("%c%c%c", IUPAC_BASE[index >> 0x4], IUPAC_BASE[(index >> 0x2) & 0x3], IUPAC_BASE[index & 0x3]);
} // print_codon
FILE* file = fopen(argv[1], "r");
if (file == 0)
{
fprintf(stderr, "ERROR: could not open file `%s'\n", argv[1]);
return 1;
} // if
fseek(file, 0, SEEK_END);
size_t const length = ftell(file);
rewind(file);
char_t* string = new char_t[length];
size_t const ref_length = fread(string, sizeof(char_t), length, file);
static_cast<void>(ref_length);
fclose(file);
size_t start = 0;
if (argc > 2)
{
start = atoi(argv[2]);
if (start > length)
{
start = 0;
} // if
} // if
size_t end = length;
if (argc > 3)
{
end = atoi(argv[3]);
if (end > length)
{
end = length;
} // if
} // if
unsigned int reverse_complement(unsigned int const index)
{
return (~((index >> 0x4) | (((index >> 0x2) & 0x3) << 2) | ((index & 0x3) << 0x4)) & 0x3f);
} // reverse_complement
std::vector<Repeat> repeat;
tandem_repeat_annotation(repeat, string, start, end);
unsigned int frame_shift_reverse(char_t const reference, char_t const sample_1)
{
std::pair<std::multimap<char_t const, unsigned int const>::const_iterator, std::multimap<char_t const, unsigned int const>::const_iterator> const range_1 = codon_map.equal_range(sample_1);
for (std::vector<Repeat>::const_iterator it = repeat.begin(); it != repeat.end(); ++it)
unsigned int shift = FRAME_SHIFT_0;
for (std::multimap<char_t const, unsigned int const>::const_iterator it_1 = range_1.first; it_1 != range_1.second; ++it_1)
{
for (size_t i = it->start; i < it->end; ++i)
{
printf("%c", string[i]);
} // for
if (it->count > 0)
if (reference == codon_table[reverse_complement(it_1->second)])
{
printf("%ld", it->count + 1);
shift |= FRAME_SHIFT_REVERSE_IDEM;
} // if
printf(";");
} // for
printf("\n");
return shift;
} // frame_shift_reverse
char_t* reverse = new char_t[length];
for (size_t i = 0; i < length; ++i)
{
reverse[i] = string[length - i - 1];
} // for
std::vector<Repeat> repeat_reverse;
tandem_repeat_annotation(repeat_reverse, reverse, length - end, length - start);
for (std::vector<Repeat>::const_reverse_iterator it = repeat_reverse.rbegin(); it != repeat_reverse.rend(); ++it)
int main(int, char* [])
{
for (unsigned int i = 0; i < 64; ++i)
{
size_t const repeat_length = it->end - it->start;
for (size_t i = 0; i < repeat_length; ++i)
{
printf("%c", reverse[it->end - 1 - i]);
} // for
if (it->count > 0)
{
printf("%ld", it->count + 1);
} // if
printf(";");
codon_table[i] = codon_string[i];
codon_map.insert(std::pair<char_t const, unsigned int const>(codon_table[i], i));
} // for
printf("\n");
delete[] reverse;
delete[] string;
unsigned int const shift = frame_shift_reverse('D', 'V');
printf("%d\n", shift);
return 0;
} // main
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment