Commit 774a59db authored by J.K. Vis's avatar J.K. Vis
Browse files

Updated cut-off scheme

parent 1172e248
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: debug.cc
// Author: Jonathan K. Vis
// Revision: 2.1.0
// Date: 2014/08/06
// Revision: 2.1.1
// Date: 2014/08/21
// *******************************************************************
// DESCRIPTION:
// This source can be used to debug the Extractor library within
......@@ -46,7 +46,8 @@ int main(int argc, char* argv[])
size_t const reference_length = ftell(file);
rewind(file);
char_t* reference = new char_t[reference_length];
fread(reference, sizeof(char_t), reference_length, file);
size_t const ref_length = fread(reference, sizeof(char_t), reference_length, file);
static_cast<void>(ref_length);
fclose(file);
file = fopen(argv[2], "r");
......@@ -60,7 +61,8 @@ int main(int argc, char* argv[])
size_t const sample_length = ftell(file);
rewind(file);
char_t* sample = new char_t[sample_length];
fread(sample, sizeof(char_t), sample_length, file);
size_t const alt_length = fread(sample, sizeof(char_t), sample_length, file);
static_cast<void>(alt_length);
fclose(file);
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.1.2
// Date: 2014/08/13
// Revision: 2.1.4
// Date: 2014/08/21
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -217,8 +217,9 @@ size_t extractor(std::vector<Variant> &variant,
// Calculate the LCS (possibly in reverse complement) of the two
// strings.
size_t const cut_off = reference_length < THRESHOLD_CUT_OFF ? 1 : EXTRACTION_CUT_OFF;
std::vector<Substring> substring;
size_t const length = LCS(substring, reference, complement, reference_start, reference_end, sample, sample_start, sample_end);
size_t const length = LCS(substring, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, cut_off);
// No LCS found: this is a transposition or a deletion/insertion.
......@@ -456,8 +457,9 @@ size_t extractor_transposition(std::vector<Variant> &variant,
// Extract the LCS (from the whole reference string).
size_t const cut_off = static_cast<size_t>(TRANSPOSITION_CUT_OFF * static_cast<double>(sample_length));
std::vector<Substring> substring;
size_t const length = LCS(substring, reference, complement, 0, global_reference_length, sample, sample_start, sample_end);
size_t const length = LCS(substring, reference, complement, 0, global_reference_length, sample, sample_start, sample_end, cut_off);
// No LCS found: this is a deletion/insertion.
......@@ -557,18 +559,12 @@ size_t LCS(std::vector<Substring> &substring,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end)
size_t const sample_end,
size_t const cut_off)
{
size_t const reference_length = reference_end - reference_start;
size_t const sample_length = sample_end - sample_start;
// Always fully explore strings smaller than this threshold.
static size_t const THRESHOLD = 16000;
// A dynamic cut-off for unevenly matched string lengths.
double const a = reference_length >= sample_length ? reference_length : sample_length;
double const b = reference_length >= sample_length ? sample_length : reference_length;
size_t const cut_off = (reference_length > THRESHOLD ? ceil((1.0 - b / (a + 0.1 * b)) * b) / 8 : 0) + 1;
// The initial k.
size_t k = reference_length > sample_length ? sample_length / 4 : reference_length / 4;
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.1.2
// Date: 2014/08/13
// Revision: 2.1.4
// Date: 2014/08/21
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.1.2";
static char const* const VERSION = "2.1.4";
// The character type used for all strings. For now it should just be
......@@ -90,6 +90,15 @@ static size_t const WEIGHT_SEPARATOR = 1; // i.e., _, [, ], ;
static size_t const WEIGHT_SUBSTITUTION = 1; // i.e., >
// Cut-off constants. For normal extraction use the threshold to
// specify the maximum reference length without any cut-off. Otherwise
// the extraction cut-off is used.
// The cut-off for transpositions is set at a fraction of the sample length.
static size_t const THRESHOLD_CUT_OFF = 16000;
static size_t const EXTRACTION_CUT_OFF = 500;
static double const TRANSPOSITION_CUT_OFF = 0.3;
// This global variable is used to have access to the whole reference
// string at any point in the extraction process. Commonly used in
// transposition extraction.
......@@ -317,6 +326,7 @@ struct Substring
// @arg sample: sample string
// @arg sample_start: starting position in the sample string
// @arg sample_end: ending position in the sample string
// @arg cut_off: optional cut-off value for the k in LCS_k
// @return: length of the LCS
// *******************************************************************
size_t LCS(std::vector<Substring> &substring,
......@@ -326,7 +336,8 @@ size_t LCS(std::vector<Substring> &substring,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
size_t const sample_end,
size_t const cut_off = 1);
// *******************************************************************
// LCS_1 function
......
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.i (SWIG interface file)
// Author: Jonathan K. Vis
// Revision: 2.1.2
// Date: 2014/08/13
// Revision: 2.1.4
// Date: 2014/08/21
// *******************************************************************
// DESCRIPTION:
// Defines the SWIG interface for the Extractor library for use in
......@@ -32,7 +32,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static char const* const VERSION = "2.1.2";
static char const* const VERSION = "2.1.4";
// The character type used for all strings. For now it should just be
// a char.
......@@ -85,8 +85,6 @@ static size_t const WEIGHT_SUBSTITUTION = 1; // i.e., >
// sample string
// @member type: type of the variant described using the
// constants above
// @member weight: weight of the variant according to the weight
// constants above (used internally)
// @member transposition_start: starting position of a transposition
// within the reference string
// @member transposition_end: ending position of a transposition
......@@ -99,7 +97,6 @@ struct Variant
size_t sample_start;
size_t sample_end;
unsigned int type;
size_t weight;
size_t transposition_start;
size_t transposition_end;
}; // Variant
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment