Commit 04a27b81 authored by J.K. Vis's avatar J.K. Vis
Browse files

Debugging in progress on frameshift extraction

parent 1da85b13
......@@ -20,7 +20,7 @@ TARGET=_extractor.so
DEBUG=debug.cc
CXX=g++
CFLAGS=-c -fpic -Wall -Wextra -O3 #-D__debug__
CFLAGS=-c -fpic -Wall -Wextra -O3 -D__debug__
LDFLAGS=-Wall -O3 -shared
SWIG=swig
......
......@@ -23,6 +23,12 @@ using namespace mutalyzer;
#include <cstdio>
/*
#include <iostream>
#include <string>
using namespace std;
*/
// Entry point.
int main(int argc, char* argv[])
{
......@@ -66,13 +72,44 @@ int main(int argc, char* argv[])
fclose(file);
/*
string header[4305];
string protein[4305];
for (int i = 0; i < 4305; ++i)
{
getline(cin, header[i]);
getline(cin, protein[i]);
} // for
for (int i = 0; i < 10; ++i)
{
cerr << i << endl;
for (int j = i + 1; j < 10; ++j)
{
vector<Variant> variant;
extract(variant, protein[i].c_str(), protein[i].length(), protein[j].c_str(), protein[j].length(), TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
fprintf(stdout, "%s %s\n", header[i].c_str(), header[j].c_str());
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
{
if (it->type >= FRAME_SHIFT)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight);
} // if
} // for
} // for
} // for
*/
// The actual extraction.
std::vector<Variant> variant;
size_t const weight = extract(variant, reference, reference_length, sample, sample_length, TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
size_t const weight = extract(variant, reference, reference_length - 1, sample, sample_length - 1, TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
// Printing the variants.
fprintf(stdout, "\nVariants (%ld / %ld):\n", variant.size(), weight);
fprintf(stdout, "Variants (%ld / %ld):\n", variant.size(), weight);
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
{
//if (it->type != IDENTITY)
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.2.1
// Date: 2015/06/30
// Revision: 2.2.2
// Date: 2015/07/27
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
......@@ -33,12 +33,14 @@ size_t global_reference_length = 0;
// The actual frame shift map indexed on the lower 127 ASCII
// characters. This map should be precalculated given a codon string
// by the initialize_frame_shift_map function.
uint8_t frame_shift_map[128][128][128] = {{{FRAME_SHIFT_NONE}}};
static uint8_t frame_shift_map[128][128][128] = {{{FRAME_SHIFT_NONE}}};
// A frequency count of all possible frame shifts (5) for all
// combinations of two amino acids (indexed by the lower 127 ASCII
// characters). Used to calculate the frame shift probability.
uint8_t frame_shift_count[128][128][5] = {{{0}}};
static uint8_t frame_shift_count[128][128][5] = {{{0}}};
static uint64_t acid_map[128] = {0x0ull};
// Only used to interface to Python: calls the C++ extract function.
Variant_List extract(char_t const* const reference,
......@@ -816,7 +818,7 @@ void extractor_frame_shift(std::vector<Variant> &annotation,
// Calculate the frame shift probability.
size_t weight = 1;
size_t weight = 0;
for (size_t i = 0; i < lcs.length; ++i)
{
size_t weight_compound = 0;
......@@ -1298,8 +1300,8 @@ void LCS_frame_shift(std::vector<Substring> &substring,
{
lcs[i % 2][0][2] = 1;
fs_substring[2] = Substring(reference_start, sample_start + i, lcs[i % 2][0][2], FRAME_SHIFT_REVERSE);
} // if
for (size_t j = 1; j < reference_length; ++j)
{
// Calculate frame shifts in both directions.
......@@ -1469,24 +1471,23 @@ char_t const* IUPAC_complement(char_t const* const string,
// based on a given codon string.
void initialize_frame_shift_map(char_t const* const codon_string)
{
uint64_t acid_map[128] = {0x0};
for (size_t i = 0; i < 64; ++i)
{
acid_map[codon_string[i] & 0x7f] |= (0x1ll << i);
acid_map[codon_string[i] & 0x7f] |= (0x1ull << i);
} // for
for (size_t i = 0; i < 128; ++i)
{
if (acid_map[i] != 0x0)
if (acid_map[i] != 0x0ull)
{
for (size_t j = 0; j < 128; ++j)
{
if (acid_map[j] != 0x0)
if (acid_map[j] != 0x0ull)
{
for (size_t k = 0; k < 128; ++k)
{
if (acid_map[k] != 0x0)
if (acid_map[k] != 0x0ull)
{
uint8_t const shift = calculate_frame_shift(acid_map, i, j, k);
uint8_t const shift = calculate_frame_shift(i, j, k);
frame_shift_map[i][j][k] = shift;
// Calculate frame shift frequencies.
......@@ -1523,20 +1524,19 @@ void initialize_frame_shift_map(char_t const* const codon_string)
// combinations of two reference amino acids the corresponding DNA
// sequence and the (partial) overlap between all possible DNA
// sequences of the sample amico acid.
uint8_t calculate_frame_shift(uint64_t const acid_map[],
size_t const reference_1,
size_t const reference_2,
size_t const sample)
uint8_t calculate_frame_shift(size_t const reference_1,
size_t const reference_2,
size_t const sample)
{
uint8_t shift = FRAME_SHIFT_NONE;
for (size_t i = 0; i < 64; ++i)
{
if (((acid_map[reference_1] >> i) & 0x1) == 0x1)
if (((acid_map[reference_1] >> i) & 0x1ull) == 0x1ull)
{
size_t const codon_reverse = ((i >> 0x4) | (i & 0xc) | ((i & 0x3) << 0x4)) ^ 0x3f;
for (size_t j = 0; j < 64; ++j)
{
if (((acid_map[reference_2] >> j) & 0x1) == 0x1)
if (((acid_map[reference_2] >> j) & 0x1ull) == 0x1ull)
{
size_t const codon_1 = ((i & 0x3) << 0x4) | ((j & 0x3c) >> 0x2);
size_t const codon_2 = ((i & 0xf) << 0x2) | (j >> 0x4);
......@@ -1544,8 +1544,9 @@ uint8_t calculate_frame_shift(uint64_t const acid_map[],
size_t const codon_reverse_2 = ((i & 0x3) | ((j & 0x30) >> 0x2) | ((j & 0xc) << 0x2)) ^ 0x3f;
for (size_t k = 0; k < 64; ++k)
{
if (((acid_map[sample] >> k) & 0x1) == 0x1)
if (((acid_map[sample] >> k) & 0x1ull) == 0x1ull)
{
shift = FRAME_SHIFT_NONE;
if (codon_1 == k)
{
shift |= FRAME_SHIFT_1;
......
......@@ -126,16 +126,6 @@ static double const TRANSPOSITION_CUT_OFF = 0.1;
extern size_t global_reference_length;
// The actual frame shift map indexed by the lower 127 ASCII
// characters. This map should be precalculated given a codon string
// by the initialize_frame_shift_map function.
extern uint8_t frame_shift_map[128][128][128];
// A frequency count of all possible frame shifts (5) for all
// combinations of two amino acids (indexed by the lower 127 ASCII
// characters). Used to calculate the frame shift probability.
extern uint8_t frame_shift_count[128][128][5];
// *******************************************************************
// Variant structure
// This structure describes a variant (region of change).
......@@ -654,17 +644,14 @@ void initialize_frame_shift_map(char_t const* const codon_string);
// sequence and the (partial) overlap between all possible DNA
// sequences of the sample amico acid.
//
// @arg acid_map: maps amino acids (coded as the lower 127 ASCII
// characters) to DNA codons
// @arg reference_1: first reference amino acid
// @arg reference_2: second reference amino acid
// @arg sample: sample amino acid
// @return: frame shift
// *******************************************************************
uint8_t calculate_frame_shift(uint64_t const acid_map[],
size_t const reference_1,
size_t const reference_2,
size_t const sample);
uint8_t calculate_frame_shift(size_t const reference_1,
size_t const reference_2,
size_t const sample);
// *******************************************************************
// frame_shift function
......
......@@ -51,7 +51,7 @@ char_t const* const CODON_STRING = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGG
static uint8_t frame_shift_map[128][128][128] = {{{FRAME_SHIFT_NONE}}};
static uint8_t frame_shift_count[128][128][5] = {{{0}}};
static uint64_t acid_map[128] = {0x0};
static uint64_t acid_map[128] = {0x0ull};
struct Variant
{
......@@ -116,19 +116,27 @@ struct Substring
inline Substring(void): length(0) { }
}; // Substring
uint8_t calculate_frame_shift(size_t const reference_1,
size_t const reference_2,
size_t const sample)
void print_codon(FILE* stream, size_t const index)
{
fprintf(stream, "%c%c%c", IUPAC_BASE[index >> 0x4],
IUPAC_BASE[(index >> 0x2) & 0x3],
IUPAC_BASE[index & 0x3]);
return;
} // print_codon
uint8_t calculate_frame_shift(size_t const reference_1,
size_t const reference_2,
size_t const sample)
{
uint8_t shift = FRAME_SHIFT_NONE;
for (size_t i = 0; i < 64; ++i)
{
if (((acid_map[reference_1] >> i) & 0x1) == 0x1)
if (((acid_map[reference_1] >> i) & 0x1ull) == 0x1ull)
{
size_t const codon_reverse = ((i >> 0x4) | (i & 0xc) | ((i & 0x3) << 0x4)) ^ 0x3f;
for (size_t j = 0; j < 64; ++j)
{
if (((acid_map[reference_2] >> j) & 0x1) == 0x1)
if (((acid_map[reference_2] >> j) & 0x1ull) == 0x1ull)
{
size_t const codon_1 = ((i & 0x3) << 0x4) | ((j & 0x3c) >> 0x2);
size_t const codon_2 = ((i & 0xf) << 0x2) | (j >> 0x4);
......@@ -136,8 +144,9 @@ uint8_t calculate_frame_shift(size_t const reference_1,
size_t const codon_reverse_2 = ((i & 0x3) | ((j & 0x30) >> 0x2) | ((j & 0xc) << 0x2)) ^ 0x3f;
for (size_t k = 0; k < 64; ++k)
{
if (((acid_map[sample] >> k) & 0x1) == 0x1)
if (((acid_map[sample] >> k) & 0x1ull) == 0x1ull)
{
shift = FRAME_SHIFT_NONE;
if (codon_1 == k)
{
shift |= FRAME_SHIFT_1;
......@@ -158,6 +167,7 @@ uint8_t calculate_frame_shift(size_t const reference_1,
{
shift |= FRAME_SHIFT_REVERSE_2;
} // if
//printf("0x%x\n", shift);
} // if
} // for
} // if
......@@ -171,19 +181,20 @@ void initialize_frame_shift_map(char_t const* const codon_string)
{
for (size_t i = 0; i < 64; ++i)
{
acid_map[codon_string[i] & 0x7f] |= (0x1ll << i);
acid_map[codon_string[i] & 0x7f] |= (0x1ull << i);
} // for
for (size_t i = 0; i < 128; ++i)
{
if (acid_map[i] != 0x0)
if (acid_map[i] != 0x0ull)
{
for (size_t j = 0; j < 128; ++j)
{
if (acid_map[j] != 0x0)
if (acid_map[j] != 0x0ull)
{
for (size_t k = 0; k < 128; ++k)
{
if (acid_map[k] != 0x0)
if (acid_map[k] != 0x0ull)
{
uint8_t const shift = calculate_frame_shift(i, j, k);
frame_shift_map[i][j][k] = shift;
......@@ -292,7 +303,7 @@ void LCS_frame_shift(std::vector<Substring> &substring,
} // if
if (lcs[i % 2][j][2] > fs_substring[2].length)
{
fs_substring[2] = Substring(reference_start + j - lcs[i % 2][j][2], sample_start + i - lcs[i % 2][j][2], lcs[i % 2][j][2], FRAME_SHIFT_REVERSE);
fs_substring[2] = Substring(reference_start + j - lcs[i % 2][j][2] + 1, sample_start + i - lcs[i % 2][j][2], lcs[i % 2][j][2], FRAME_SHIFT_REVERSE);
} // if
if (lcs[i % 2][j][3] > fs_substring[3].length)
{
......@@ -381,7 +392,7 @@ void extractor_frame_shift(std::vector<Variant> &annotation,
size_t const sample_length = sample_end - sample_start;
// First the base cases to end the recursion.
if (reference_length <= 0 && sample_length <= 0)
if (reference_length <= 0 || sample_length <= 0)
{
return;
} // if
......@@ -494,7 +505,8 @@ int main(int, char* [])
initialize_frame_shift_map(CODON_STRING);
std::vector<Variant> annotation;
extractor_frame_shift(annotation, "MDYSLAAALTLHGH", 1, 11, "MTIPWRSPHFHGH", 1, 10);
extractor_frame_shift(annotation, "MLGNMNVFMAVLGIILFSGFLAAYFSHKWDD", 1, 32,
"MVGRYRFEFILIILILCALITARFYLS", 1, 28);
// Printing the variants.
fprintf(stdout, "Annotation (%ld):\n", annotation.size());
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment