Commit 0bebab7d authored by J.K. Vis's avatar J.K. Vis
Browse files

Refactored frame shift calculation

parent 95404ef5
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: debug.cc
// Author: Jonathan K. Vis
// Revision: 2.1.8
// Date: 2015/01/12
// Revision: 2.3.0
// Date: 2015/07/31
// *******************************************************************
// DESCRIPTION:
// This source can be used to debug the Extractor library within
......@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
{
if (it->type >= FRAME_SHIFT)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight);
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %lf, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, 1.f - it->probability, it->transposition_start, it->transposition_end);
} // if
} // for
......@@ -112,10 +112,14 @@ int main(int argc, char* argv[])
fprintf(stdout, "Variants (%ld / %ld):\n", variant.size(), weight);
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
{
//if (it->type != IDENTITY)
if (it->type >= FRAME_SHIFT)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %lf, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, 1.f - it->probability, it->transposition_start, it->transposition_end);
} // if
else
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
} // else
} // for
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.2.2
// Date: 2015/07/27
// Revision: 2.3.0
// Date: 2015/07/31
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
......@@ -30,6 +30,34 @@ size_t weight_position = 1;
// This seems necessary to compute transpositions.
size_t global_reference_length = 0;
static char_t const IUPAC_ALPHA[16] =
{
'x', // 0x00
'A', // 0x01
'C', // 0x02
'M', // 0x03 A | C
'G', // 0x04
'R', // 0x05 A | G
'S', // 0x06 C | G
'V', // 0x07 A | C | G
'T', // 0x08
'W', // 0x09 A | T
'Y', // 0x0a C | T
'H', // 0x0b A | C | T
'K', // 0x0c G | T
'D', // 0x0d A | G | T
'B', // 0x0e C | G | T
'N' // 0x0f A | C | G | T
}; // IUPAC_ALPHA
static char_t const IUPAC_BASE[4] =
{
'A',
'C',
'G',
'T'
}; // IUPAC_BASE
// The actual frame shift map indexed on the lower 127 ASCII
// characters. This map should be precalculated given a codon string
// by the initialize_frame_shift_map function.
......@@ -42,6 +70,9 @@ static uint8_t frame_shift_count[128][128][5] = {{{0}}};
static uint64_t acid_map[128] = {0x0ull};
static double acid_frequency[128] = {.0f};
static double frame_shift_frequency[128][128][5] = {{{.0f}}};
// Only used to interface to Python: calls the C++ extract function.
Variant_List extract(char_t const* const reference,
size_t const reference_length,
......@@ -142,19 +173,21 @@ size_t extract(std::vector<Variant> &variant,
// Frame shift annotation starts here.
if (type == TYPE_PROTEIN)
{
std::vector<Variant> annotation;
std::vector<Variant> merged;
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
{
merged.push_back(*it);
if (it->type == SUBSTITUTION)
{
std::vector<Variant> annotation;
extractor_frame_shift(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
merged.insert(merged.end(), annotation.begin(), annotation.end());
} // if
} // for
// Add the annotations to the end of the variant list.
variant.insert(variant.end(), annotation.begin(), annotation.end());
variant = merged;
} // if
// Do NOT forget to clean up the complement string.
delete[] complement;
......@@ -818,35 +851,32 @@ void extractor_frame_shift(std::vector<Variant> &annotation,
// Calculate the frame shift probability.
size_t weight = 0;
double probability = 1.f;
for (size_t i = 0; i < lcs.length; ++i)
{
size_t weight_compound = 0;
double probability_compound = .0f;
if ((lcs.type & FRAME_SHIFT_1) == FRAME_SHIFT_1)
{
weight_compound += frame_shift_count[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][0];
probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][0];
} // if
if ((lcs.type & FRAME_SHIFT_2) == FRAME_SHIFT_2)
{
weight_compound += frame_shift_count[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][1];
probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][1];
} // if
if ((lcs.type & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
{
weight_compound += frame_shift_count[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][2];
probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i] & 0x7f][2];
} // if
if ((lcs.type & FRAME_SHIFT_REVERSE_1) == FRAME_SHIFT_REVERSE_1)
{
weight_compound += frame_shift_count[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][3];
probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][3];
} // if
if ((lcs.type & FRAME_SHIFT_REVERSE_2) == FRAME_SHIFT_REVERSE_2)
{
weight_compound += frame_shift_count[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][4];
probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][4];
} // if
// This is actually only the numerator. The denominator is
// implicitly given by the number of different amino acids to the
// power of the length of the frame shift LCS.
weight *= weight_compound;
//fprintf(stderr, " probability_compound: %lf\n", probability_compound);
probability *= probability_compound;
} // for
......@@ -862,7 +892,9 @@ void extractor_frame_shift(std::vector<Variant> &annotation,
// Add all variants (in order) to the annotation vector.
annotation.insert(annotation.end(), prefix.begin(), prefix.end());
annotation.push_back(Variant(lcs.reference_index, lcs.reference_index + lcs.length, lcs.sample_index, lcs.sample_index + lcs.length, FRAME_SHIFT | lcs.type, weight));
Variant variant(lcs.reference_index, lcs.reference_index + lcs.length, lcs.sample_index, lcs.sample_index + lcs.length, FRAME_SHIFT | lcs.type);
variant.probability = probability;
annotation.push_back(variant);
annotation.insert(annotation.end(), suffix.begin(), suffix.end());
return;
......@@ -1272,7 +1304,6 @@ void LCS_frame_shift(std::vector<Substring> &substring,
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
// Initialize LCS array to zero.
size_t lcs[2][reference_length][5];
for (size_t i = 0; i < reference_length; ++i)
{
......@@ -1289,46 +1320,62 @@ void LCS_frame_shift(std::vector<Substring> &substring,
} // for
Substring fs_substring[5];
// Filling the LCS matrix (actually only the current and the
// previous row).
for (size_t i = 0; i < sample_length; ++i)
{
// The first ``in frame'' reverse complement frame shift.
uint8_t const shift_reverse = frame_shift(reference[reference_end - 1], reference[reference_end - 2], sample[sample_start + i]);
uint8_t const shift_reverse = frame_shift(reference[reference_end - 1], reference[reference_end - 1], sample[sample_start + i]);
if ((shift_reverse & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
{
lcs[i % 2][0][2] = 1;
fs_substring[2] = Substring(reference_start, sample_start + i, lcs[i % 2][0][2], FRAME_SHIFT_REVERSE);
} // if
if (lcs[i % 2][0][2] > fs_substring[2].length)
{
fs_substring[2] = Substring(reference_start - lcs[i % 2][0][2] + 1, sample_start + i - lcs[i % 2][0][2] + 1, lcs[i % 2][0][2], FRAME_SHIFT_REVERSE);
} // if
for (size_t j = 1; j < reference_length; ++j)
{
// Calculate frame shifts in both directions.
uint8_t const shift_forward = frame_shift(reference[reference_start + j - 1], reference[reference_start + j], sample[sample_start + i]);
uint8_t const shift_reverse = frame_shift(reference[reference_end - j - 1], reference[reference_end - j], sample[sample_start + i]);
if ((shift_forward & FRAME_SHIFT_1) == FRAME_SHIFT_1)
{
lcs[i % 2][j][0] = lcs[(i + 1) % 2][j - 1][0] + 1;
} // if
else
{
lcs[i % 2][j][0] = 0;
} // else
if ((shift_forward & FRAME_SHIFT_2) == FRAME_SHIFT_2)
{
lcs[i % 2][j][1] = lcs[(i + 1) % 2][j - 1][1] + 1;
} // if
else
{
lcs[i % 2][j][1] = 0;
} // else
if ((shift_reverse & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
{
lcs[i % 2][j][2] = lcs[(i + 1) % 2][j - 1][2] + 1;
} // if
else
{
lcs[i % 2][j][2] = 0;
} // else
if ((shift_reverse & FRAME_SHIFT_REVERSE_1) == FRAME_SHIFT_REVERSE_1)
{
lcs[i % 2][j][3] = lcs[(i + 1) % 2][j - 1][3] + 1;
} // if
else
{
lcs[i % 2][j][3] = 0;
} // else
if ((shift_reverse & FRAME_SHIFT_REVERSE_2) == FRAME_SHIFT_REVERSE_2)
{
lcs[i % 2][j][4] = lcs[(i + 1) % 2][j - 1][4] + 1;
} // if
else
{
lcs[i % 2][j][4] = 0;
} // else
// Update the best solution for each frame shift.
if (lcs[i % 2][j][0] > fs_substring[0].length)
{
fs_substring[0] = Substring(reference_start + j - lcs[i % 2][j][0], sample_start + i - lcs[i % 2][j][0] + 1, lcs[i % 2][j][0], FRAME_SHIFT_1);
......@@ -1351,7 +1398,6 @@ void LCS_frame_shift(std::vector<Substring> &substring,
} // if
} // for
} // for
substring = std::vector<Substring>(1, fs_substring[0]);
substring.push_back(fs_substring[1]);
substring.push_back(fs_substring[2]);
......@@ -1467,10 +1513,139 @@ char_t const* IUPAC_complement(char_t const* const string,
return complement;
} // IUPAC_complement
void backtranslation(size_t reference_DNA[],
size_t sample_DNA[],
char_t const* const reference,
size_t const reference_start,
char_t const* const sample,
size_t const sample_start,
size_t const length,
uint8_t const type)
{
for (size_t i = 0; i < 3 * length; ++ i)
{
reference_DNA[i] = 0;
sample_DNA[i] = 0;
} // for
for (size_t p = 0; p < length; ++p)
{
for (size_t i = 0; i < 64; ++i)
{
if (((acid_map[reference[reference_start + p] & 0x7f] >> i) & 0x1ull) == 0x1ull)
{
size_t const codon_reverse = ((i >> 0x4) | (i & 0xc) | ((i & 0x3) << 0x4)) ^ 0x3f;
for (size_t k = 0; k < 64; ++k)
{
if (((acid_map[sample[sample_start + length - p - 1] & 0x7f] >> k) & 0x1ull) == 0x1ull)
{
if ((type & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE && codon_reverse == k)
{
reference_DNA[p * 3] |= 0x1 << (i >> 4);
reference_DNA[p * 3 + 1] |= 0x1 << ((i >> 2) & 0x3);
reference_DNA[p * 3 + 2] |= 0x1 << (i & 0x3);
sample_DNA[(length - p) * 3 - 3] |= 0x1 << (codon_reverse >> 4);
sample_DNA[(length - p) * 3 - 2] |= 0x1 << ((codon_reverse >> 2) & 0x3);
sample_DNA[(length - p) * 3 - 1] |= 0x1 << (codon_reverse & 0x3);
} // if
} // if
} // for
for (size_t j = 0; j < 64; ++j)
{
if (((acid_map[reference[reference_start + p + 1] & 0x7f] >> j) & 0x1ull) == 0x1ull)
{
size_t const codon_1 = ((i & 0x3) << 0x4) | ((j & 0x3c) >> 0x2);
size_t const codon_2 = ((i & 0xf) << 0x2) | (j >> 0x4);
size_t const codon_reverse_1 = (((i & 0xc) >> 0x2) | ((i & 0x3) << 0x2) | (j & 0x30)) ^ 0x3f;
size_t const codon_reverse_2 = ((i & 0x3) | ((j & 0x30) >> 0x2) | ((j & 0xc) << 0x2)) ^ 0x3f;
for (size_t k = 0; k < 64; ++k)
{
if (((acid_map[sample[sample_start + p] & 0x7f] >> k) & 0x1ull) == 0x1ull)
{
if ((type & FRAME_SHIFT_1) == FRAME_SHIFT_1 && codon_1 == k)
{
reference_DNA[p * 3] |= 0x1 << (i >> 4);
reference_DNA[p * 3 + 1] |= 0x1 << ((i >> 2) & 0x3);
reference_DNA[p * 3 + 2] |= 0x1 << (i & 0x3);
sample_DNA[p * 3] |= 0x1 << (codon_1 >> 4);
sample_DNA[p * 3 + 1] |= 0x1 << ((codon_1 >> 2) & 0x3);
sample_DNA[p * 3 + 2] |= 0x1 << (codon_1 & 0x3);
} // if
if ((type & FRAME_SHIFT_2) == FRAME_SHIFT_2 && codon_2 == k)
{
reference_DNA[p * 3] |= 0x1 << (i >> 4);
reference_DNA[p * 3 + 1] |= 0x1 << ((i >> 2) & 0x3);
reference_DNA[p * 3 + 2] |= 0x1 << (i & 0x3);
sample_DNA[p * 3] |= 0x1 << (codon_2 >> 4);
sample_DNA[p * 3 + 1] |= 0x1 << ((codon_2 >> 2) & 0x3);
sample_DNA[p * 3 + 2] |= 0x1 << (codon_2 & 0x3);
} // if
} // if
if (((acid_map[sample[sample_start + length - p - 1] & 0x7f] >> k) & 0x1ull) == 0x1ull)
{
if ((type & FRAME_SHIFT_REVERSE_1) == FRAME_SHIFT_REVERSE_1 && codon_reverse_1 == k)
{
reference_DNA[p * 3] |= 0x1 << (i >> 4);
reference_DNA[p * 3 + 1] |= 0x1 << ((i >> 2) & 0x3);
reference_DNA[p * 3 + 2] |= 0x1 << (i & 0x3);
sample_DNA[(length - p) * 3 - 3] |= 0x1 << (codon_reverse_1 >> 4);
sample_DNA[(length - p) * 3 - 2] |= 0x1 << ((codon_reverse_1 >> 2) & 0x3);
sample_DNA[(length - p) * 3 - 1] |= 0x1 << (codon_reverse_1 & 0x3);
} // if
if ((type & FRAME_SHIFT_REVERSE_2) == FRAME_SHIFT_REVERSE_2 && codon_reverse_2 == k)
{
reference_DNA[p * 3] |= 0x1 << (i >> 4);
reference_DNA[p * 3 + 1] |= 0x1 << ((i >> 2) & 0x3);
reference_DNA[p * 3 + 2] |= 0x1 << (i & 0x3);
sample_DNA[(length - p) * 3 - 3] |= 0x1 << (codon_reverse_2 >> 4);
sample_DNA[(length - p) * 3 - 2] |= 0x1 << ((codon_reverse_2 >> 2) & 0x3);
sample_DNA[(length - p) * 3 - 1] |= 0x1 << (codon_reverse_2 & 0x3);
} // if
} // if
} // for
} // if
} // for
} // if
} // for
} // for
return;
} // backtranslation
static void initialize_acid_frequency(void)
{
acid_frequency['A'] = .09515673f;
acid_frequency['C'] = .01157279f;
acid_frequency['D'] = .05151007f;
acid_frequency['E'] = .05762795f;
acid_frequency['F'] = .03890338f;
acid_frequency['G'] = .07374416f;
acid_frequency['H'] = .02266328f;
acid_frequency['I'] = .06010209f;
acid_frequency['K'] = .04406110f;
acid_frequency['L'] = .10672657f;
acid_frequency['M'] = .02819341f;
acid_frequency['N'] = .03945573f;
acid_frequency['P'] = .04425210f;
acid_frequency['Q'] = .04439959f;
acid_frequency['R'] = .05510809f;
acid_frequency['S'] = .05802322f;
acid_frequency['T'] = .05398938f;
acid_frequency['U'] = .00000221f;
acid_frequency['V'] = .07073316f;
acid_frequency['W'] = .01531018f;
acid_frequency['X'] = .00001106f;
acid_frequency['Y'] = .02845373f;
return;
} // initialize_acid_frequency
// This function precalculates the frame_shift_map and frequency count
// based on a given codon string.
void initialize_frame_shift_map(char_t const* const codon_string)
{
// FIXME:
initialize_acid_frequency();
for (size_t i = 0; i < 64; ++i)
{
acid_map[codon_string[i] & 0x7f] |= (0x1ull << i);
......@@ -1490,27 +1665,32 @@ void initialize_frame_shift_map(char_t const* const codon_string)
uint8_t const shift = calculate_frame_shift(i, j, k);
frame_shift_map[i][j][k] = shift;
// Calculate frame shift frequencies.
if ((shift & FRAME_SHIFT_1) == FRAME_SHIFT_1)
{
++frame_shift_count[i][j][0];
frame_shift_frequency[i][j][0] += acid_frequency[k];
} // if
if ((shift & FRAME_SHIFT_2) == FRAME_SHIFT_2)
{
++frame_shift_count[i][j][1];
frame_shift_frequency[i][j][1] += acid_frequency[k];
} // if
if ((shift & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
{
++frame_shift_count[i][j][2];
frame_shift_frequency[i][j][2] += acid_frequency[k];
} // if
if ((shift & FRAME_SHIFT_REVERSE_1) == FRAME_SHIFT_REVERSE_1)
{
++frame_shift_count[i][j][3];
frame_shift_frequency[i][j][3] += acid_frequency[k];
} // if
if ((shift & FRAME_SHIFT_REVERSE_2) == FRAME_SHIFT_REVERSE_2)
{
++frame_shift_count[i][j][4];
frame_shift_frequency[i][j][4] += acid_frequency[k];
} // if
} // if
} // for
} // if
......@@ -1546,7 +1726,6 @@ uint8_t calculate_frame_shift(size_t const reference_1,
{
if (((acid_map[sample] >> k) & 0x1ull) == 0x1ull)
{
shift = FRAME_SHIFT_NONE;
if (codon_1 == k)
{
shift |= FRAME_SHIFT_1;
......@@ -1604,6 +1783,14 @@ size_t Dprint_truncated(char_t const* const string,
fputs("...", stream) +
fwrite(string + end - (length / 2) + 1, sizeof(char_t), length / 2 - 1, stream);
} // Dprint_truncated
size_t Dprint_codon(size_t const index,
FILE* stream)
{
return fprintf(stream, "%c%c%c", IUPAC_BASE[index >> 0x4],
IUPAC_BASE[(index >> 0x2) & 0x3],
IUPAC_BASE[index & 0x3]);
} // Dprint_codon
#endif
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.2.1
// Date: 2015/03/11
// Revision: 2.3.0
// Date: 2015/07/31
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.2.1";
static char const* const VERSION = "2.3.0";
// The character type used for all strings. For now it should just be
......@@ -154,7 +154,11 @@ struct Variant
size_t sample_start;
size_t sample_end;
unsigned int type;
size_t weight;
union
{
size_t weight;
double probability;
}; // union
size_t transposition_start;
size_t transposition_end;
......@@ -688,6 +692,9 @@ size_t Dprint_truncated(char_t const* const string,
size_t const end,
size_t const length = 40,
FILE* stream = stderr);
size_t Dprint_codon(size_t const index,
FILE* stream = stderr);
#endif
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment