extractor.cc 67.1 KB
Newer Older
jkvis's avatar
jkvis committed
1
2
3
4
5
6
7
8
9
10
11
12
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
//   File:     extractor.cc (depends on extractor.h)
//   Author:   Jonathan K. Vis
// *******************************************************************
// DESCRIPTION:
//   This library can be used to generete HGVS variant descriptions as
//   accepted by the Mutalyzer Name Checker.
// *******************************************************************

jkvis's avatar
jkvis committed
13
14
#include "extractor.h"

J.K. Vis's avatar
J.K. Vis committed
15
16
17
namespace mutalyzer
{

J.K. Vis's avatar
J.K. Vis committed
18
19
20
21
// The (average) description length of a position. Depends on the
// reference string length: ceil(log10(|reference| / 4)).
size_t weight_position = 1;

22
23
24
// This global variable is a dirty trick used to always have access to
// the complete reference strings even when deep into the recursion.
// This seems necessary to compute transpositions.
J.K. Vis's avatar
J.K. Vis committed
25
26
size_t global_reference_length = 0;

J.K. Vis's avatar
J.K. Vis committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
static char_t const IUPAC_ALPHA[16] =
{
  'x',  // 0x00
  'A',  // 0x01
  'C',  // 0x02
  'M',  // 0x03  A | C
  'G',  // 0x04
  'R',  // 0x05  A | G
  'S',  // 0x06  C | G
  'V',  // 0x07  A | C | G
  'T',  // 0x08
  'W',  // 0x09  A | T
  'Y',  // 0x0a  C | T
  'H',  // 0x0b  A | C | T
  'K',  // 0x0c  G | T
  'D',  // 0x0d  A | G | T
  'B',  // 0x0e  C | G | T
  'N'   // 0x0f  A | C | G | T
}; // IUPAC_ALPHA

static char_t const IUPAC_BASE[4] =
{
  'A',
  'C',
  'G',
  'T'
}; // IUPAC_BASE

55
56
57
// The actual frame shift map indexed on the lower 127 ASCII
// characters. This map should be precalculated given a codon string
// by the initialize_frame_shift_map function.
58
static uint8_t frame_shift_map[128][128][128] = {{{FRAME_SHIFT_NONE}}};
J.K. Vis's avatar
J.K. Vis committed
59

60
61
62
// A frequency count of all possible frame shifts (5) for all
// combinations of two amino acids (indexed by the lower 127 ASCII
// characters). Used to calculate the frame shift probability.
63
64
65
static uint8_t frame_shift_count[128][128][5] = {{{0}}};

static uint64_t acid_map[128] = {0x0ull};
J.K. Vis's avatar
J.K. Vis committed
66

J.K. Vis's avatar
J.K. Vis committed
67
68
69
static double acid_frequency[128] = {.0f};
static double frame_shift_frequency[128][128][5] = {{{.0f}}};

70
// Only used to interface to Python: calls the C++ extract function.
71
72
73
74
Variant_List extract(char_t const* const reference,
                     size_t const        reference_length,
                     char_t const* const sample,
                     size_t const        sample_length,
J.K. Vis's avatar
J.K. Vis committed
75
76
                     int const           type,
                     char_t const* const codon_string)
J.K. Vis's avatar
J.K. Vis committed
77
{
78
  Variant_List variant_list;
J.K. Vis's avatar
J.K. Vis committed
79
  extract(variant_list.variants, reference, reference_length, sample, sample_length, type, codon_string);
80
81
  variant_list.weight_position = weight_position;
  return variant_list;
J.K. Vis's avatar
J.K. Vis committed
82
83
} // extract

84
85
// The main library function. Extract all variants (regions of change)
// from the given strings.
J.K. Vis's avatar
J.K. Vis committed
86
87
88
89
90
size_t extract(std::vector<Variant> &variant,
               char_t const* const   reference,
               size_t const          reference_length,
               char_t const* const   sample,
               size_t const          sample_length,
J.K. Vis's avatar
J.K. Vis committed
91
92
               int const             type,
               char_t const* const   codon_string)
J.K. Vis's avatar
J.K. Vis committed
93
{
94
  // The global variables are set for this extraction run.
J.K. Vis's avatar
J.K. Vis committed
95
96
  global_reference_length = reference_length;
  weight_position = ceil(log10(reference_length / 4));
jkvis's avatar
jkvis committed
97
98
99
100
  if (weight_position <= 0)
  {
    weight_position = 1;
  } // if
J.K. Vis's avatar
J.K. Vis committed
101

102
  // Common prefix and suffix snooping.
J.K. Vis's avatar
J.K. Vis committed
103
104
105
  size_t const prefix = prefix_match(reference, reference_length, sample, sample_length);
  size_t const suffix = suffix_match(reference, reference_length, sample, sample_length, prefix);

jkvis's avatar
jkvis committed
106

jkvis's avatar
jkvis committed
107
#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
108
109
110
111
112
113
114
115
116
117
118
119
  fputs("reference: ", stderr);
  Dprint_truncated(reference, 0, reference_length);
  fprintf(stderr, " (%ld)\n", reference_length);
  fputs("sample:    ", stderr);
  Dprint_truncated(sample, 0, sample_length);
  fprintf(stderr, " (%ld)\n", sample_length);
  fputs("prefix: ", stderr);
  Dprint_truncated(reference, 0, prefix);
  fprintf(stderr, " (%ld)\n", prefix);
  fputs("suffix: ", stderr);
  Dprint_truncated(reference, reference_length - suffix, reference_length);
  fprintf(stderr, " (%ld)\n", suffix);
J.K. Vis's avatar
J.K. Vis committed
120
121
122
123
  if (type == TYPE_DNA)
  {
    fputs("Construction IUPAC complement\n", stderr);
  } // if
jkvis's avatar
jkvis committed
124
#endif
jkvis's avatar
jkvis committed
125
126


127
128
  // Do NOT construct a complement string for protein strings. All
  // other string types default to protein strings.
J.K. Vis's avatar
J.K. Vis committed
129
  char_t const* complement = type == TYPE_DNA ? IUPAC_complement(reference, reference_length) : 0;
130

jkvis's avatar
jkvis committed
131

J.K. Vis's avatar
J.K. Vis committed
132
#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
133
134
135
136
137
138
  if (type == TYPE_DNA)
  {
    fputs("complement: ", stderr);
    Dprint_truncated(complement, 0, reference_length);
    fprintf(stderr, " (%ld)\n", reference_length);
  } // if
J.K. Vis's avatar
J.K. Vis committed
139
140
  fprintf(stderr, "position weight: %ld\n", weight_position);
#endif
jkvis's avatar
jkvis committed
141
142


J.K. Vis's avatar
J.K. Vis committed
143
  if (prefix > 0)
jkvis's avatar
jkvis committed
144
  {
J.K. Vis's avatar
J.K. Vis committed
145
146
    variant.push_back(Variant(0, prefix, 0, prefix));
  } // if
jkvis's avatar
jkvis committed
147

148
  // The actual extraction process starts here.
J.K. Vis's avatar
J.K. Vis committed
149
150
151
  size_t weight;
  if (type == TYPE_PROTEIN)
  {
152
    initialize_frame_shift_map(codon_string);
J.K. Vis's avatar
J.K. Vis committed
153

J.K. Vis's avatar
J.K. Vis committed
154
155
156
157
158
159
    weight = extractor_protein(variant, reference, prefix, reference_length - suffix, sample, prefix, sample_length - suffix);
  } // if
  else
  {
    weight = extractor(variant, reference, complement, prefix, reference_length - suffix, sample, prefix, sample_length - suffix);
  } // else
J.K. Vis's avatar
J.K. Vis committed
160
161

  if (suffix > 0)
jkvis's avatar
jkvis committed
162
  {
J.K. Vis's avatar
J.K. Vis committed
163
164
    variant.push_back(Variant(reference_length - suffix, reference_length, sample_length - suffix, sample_length));
  } // if
jkvis's avatar
jkvis committed
165

166

J.K. Vis's avatar
J.K. Vis committed
167
168
169
  // Frame shift annotation starts here.
  if (type == TYPE_PROTEIN)
  {
J.K. Vis's avatar
J.K. Vis committed
170
    std::vector<Variant> merged;
J.K. Vis's avatar
J.K. Vis committed
171
172
    for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
    {
J.K. Vis's avatar
J.K. Vis committed
173
      merged.push_back(*it);
J.K. Vis's avatar
J.K. Vis committed
174
175
      if (it->type == SUBSTITUTION)
      {
J.K. Vis's avatar
J.K. Vis committed
176
        std::vector<Variant> annotation;
177
        extractor_frame_shift(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
J.K. Vis's avatar
J.K. Vis committed
178
        merged.insert(merged.end(), annotation.begin(), annotation.end());
J.K. Vis's avatar
J.K. Vis committed
179
180
      } // if
    } // for
J.K. Vis's avatar
J.K. Vis committed
181
    variant = merged;
J.K. Vis's avatar
J.K. Vis committed
182
183
  } // if

J.K. Vis's avatar
J.K. Vis committed
184

185
  // Do NOT forget to clean up the complement string.
J.K. Vis's avatar
J.K. Vis committed
186
  delete[] complement;
jkvis's avatar
jkvis committed
187

J.K. Vis's avatar
J.K. Vis committed
188
189
  return weight;
} // extract
jkvis's avatar
jkvis committed
190

191
192
193
194
195
196
197
198
199
// This is the recursive extractor function. It works as follows:
// First, determine the ``best fitting'' longest common substring
// (LCS) (possibly as a reverse complement) and discard it from the
// solution. Then apply the same function on the remaining prefixes
// and suffixes. If there is no LCS it is a variant (region of
// change), i.e., a deletion/insertion.
// With regard to the reverse complement: the complement string is, as
// its name suggests, just the complement (DNA/RNA) of the reference
// string but it is NOT reversed.
J.K. Vis's avatar
J.K. Vis committed
200
201
202
203
204
205
206
207
size_t extractor(std::vector<Variant> &variant,
                 char_t const* const   reference,
                 char_t const* const   complement,
                 size_t const          reference_start,
                 size_t const          reference_end,
                 char_t const* const   sample,
                 size_t const          sample_start,
                 size_t const          sample_end)
jkvis's avatar
jkvis committed
208
{
J.K. Vis's avatar
J.K. Vis committed
209
210
  size_t const reference_length = reference_end - reference_start;
  size_t const sample_length = sample_end - sample_start;
211
212

  // Assume this is a deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
213
214
  size_t const weight_trivial = weight_position + WEIGHT_DELETION_INSERTION + WEIGHT_BASE * sample_length + (reference_length != 1 ? weight_position + WEIGHT_SEPARATOR : 0);
  size_t weight = 0;
jkvis's avatar
jkvis committed
215

jkvis's avatar
jkvis committed
216

jkvis's avatar
jkvis committed
217
#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
218
219
220
221
222
223
224
225
226
227
228
  fputs("Extraction\n", stderr);
  fprintf(stderr, "  reference %ld--%ld:  ", reference_start, reference_end);
  Dprint_truncated(reference, reference_start, reference_end);
  fprintf(stderr, " (%ld)\n", reference_length);
  fprintf(stderr, "  complement %ld--%ld: ", reference_start, reference_end);
  Dprint_truncated(complement, reference_start, reference_end);
  fprintf(stderr, " (%ld)\n", reference_length);
  fprintf(stderr, "  sample %ld--%ld:     ", sample_start, sample_end);
  Dprint_truncated(sample, sample_start, sample_end);
  fprintf(stderr, " (%ld)\n", sample_length);
  fprintf(stderr, "  trivial weight: %ld\n", weight_trivial);
jkvis's avatar
jkvis committed
229
230
#endif

jkvis's avatar
jkvis committed
231

232
233
  // First some base cases to end the recursion.
  // No more reference string.
J.K. Vis's avatar
J.K. Vis committed
234
235
  if (reference_length <= 0)
  {
236
237
    // But some of the sample string is remaining: this is an
    // insertion or transposition.
J.K. Vis's avatar
J.K. Vis committed
238
239
240
241
    if (sample_length > 0)
    {
      weight = 2 * weight_position + WEIGHT_SEPARATOR + WEIGHT_INSERTION + WEIGHT_BASE * sample_length;

242
243
244
245
      // First, we check if we can match the inserted substring
      // somewhere in the complete reference string. This will
      // indicate a possible transposition. Otherwise it is a regular
      // insertion.
J.K. Vis's avatar
J.K. Vis committed
246
247
248
      std::vector<Variant> transposition;
      size_t const weight_transposition = extractor_transposition(transposition, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, weight) + 2 * weight_position + 3 * WEIGHT_SEPARATOR + WEIGHT_INSERTION;

249

jkvis's avatar
jkvis committed
250
#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
251
  fprintf(stderr, "Transpositions: %ld (trivial: %ld)\n", weight_transposition, weight);
252
  for (std::vector<Variant>::const_iterator it = transposition.begin(); it != transposition.end(); ++it)
J.K. Vis's avatar
J.K. Vis committed
253
254
255
  {
    fprintf(stderr, "  %ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
  } // for
jkvis's avatar
jkvis committed
256
#endif
jkvis's avatar
jkvis committed
257

J.K. Vis's avatar
J.K. Vis committed
258

259
      // Add transpositions if any.
J.K. Vis's avatar
J.K. Vis committed
260
261
262
263
264
265
266
267
      if (weight > weight_transposition && transposition.size() > 0 && !(transposition.size() == 1 && transposition.front().type == SUBSTITUTION))
      {
        transposition.front().type |= TRANSPOSITION_OPEN;
        transposition.back().type |= TRANSPOSITION_CLOSE;
        variant.insert(variant.end(), transposition.begin(), transposition.end());
        return weight_transposition;
      } // if

268
      // This is an actual insertion.
J.K. Vis's avatar
J.K. Vis committed
269
270
271
      variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    } // if
    return weight;
272
  } // if
jkvis's avatar
jkvis committed
273

274
275
  // Obviously there is a piece of reference string left, but no more
  // sample string: this is a deletion.
J.K. Vis's avatar
J.K. Vis committed
276
  if (sample_length <= 0)
277
  {
J.K. Vis's avatar
J.K. Vis committed
278
279
280
    weight = weight_position + WEIGHT_DELETION + (reference_length > 1 ? weight_position + WEIGHT_SEPARATOR : 0);
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
281
  } // if
jkvis's avatar
jkvis committed
282

283
  // Single-nucleotide polymorphism (SNP): a special case for HGVS.
J.K. Vis's avatar
J.K. Vis committed
284
  if (reference_length == 1 && sample_length == 1)
285
  {
J.K. Vis's avatar
J.K. Vis committed
286
287
288
    weight = weight_position + 2 * WEIGHT_BASE + WEIGHT_SUBSTITUTION;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
289
  } // if
jkvis's avatar
jkvis committed
290

J.K. Vis's avatar
J.K. Vis committed
291

292
293
  // Calculate the LCS (possibly in reverse complement) of the two
  // strings.
J.K. Vis's avatar
J.K. Vis committed
294
  size_t const cut_off = reference_length < THRESHOLD_CUT_OFF ? 1 : weight_position;
J.K. Vis's avatar
J.K. Vis committed
295
  std::vector<Substring> substring;
J.K. Vis's avatar
J.K. Vis committed
296
  size_t const length = LCS(substring, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, cut_off);
J.K. Vis's avatar
J.K. Vis committed
297

298
299

  // No LCS found: this is a transposition or a deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
300
  if (length <= 0 || substring.size() <= 0)
jkvis's avatar
jkvis committed
301
  {
J.K. Vis's avatar
J.K. Vis committed
302
    weight = weight_trivial;
jkvis's avatar
jkvis committed
303

304
305
306
    // First, we check if we can match the inserted substring
    // somewhere in the complete reference string. This will
    // indicate a possible transposition.
jkvis's avatar
jkvis committed
307
    std::vector<Variant> transposition;
308
    size_t const weight_transposition = extractor_transposition(transposition, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, weight)  + 2 * weight_position + 3 * WEIGHT_SEPARATOR + WEIGHT_DELETION_INSERTION;
jkvis's avatar
jkvis committed
309
310
311


#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
312
  fprintf(stderr, "Transpositions: %ld (trivial: %ld)\n", weight_transposition, weight);
313
  for (std::vector<Variant>::const_iterator it = transposition.begin(); it != transposition.end(); ++it)
J.K. Vis's avatar
J.K. Vis committed
314
315
316
  {
    fprintf(stderr, "  %ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
  } // for
jkvis's avatar
jkvis committed
317
318
#endif

J.K. Vis's avatar
J.K. Vis committed
319

320
    // Add transpositions if any.
J.K. Vis's avatar
J.K. Vis committed
321
    if (weight > weight_transposition && transposition.size() > 0 && !(transposition.size() == 1 && transposition.front().type == SUBSTITUTION))
jkvis's avatar
jkvis committed
322
    {
J.K. Vis's avatar
J.K. Vis committed
323
324
325
326
      transposition.front().type |= TRANSPOSITION_OPEN;
      transposition.back().type |= TRANSPOSITION_CLOSE;
      variant.insert(variant.end(), transposition.begin(), transposition.end());
      return weight_transposition;
jkvis's avatar
jkvis committed
327
    } // if
J.K. Vis's avatar
J.K. Vis committed
328

329
    // This is an actual deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
330
331
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
jkvis's avatar
jkvis committed
332
333
  } // if

jkvis's avatar
jkvis committed
334

335
336
  // Pick the ``best fitting'' LCS, i.e., the remaining prefixes and
  // suffixes are of the same length.
337
  size_t diff = (reference_end - reference_start) + (sample_end - sample_start);
338
339
  std::vector<Substring>::const_iterator lcs = substring.begin();
  for (std::vector<Substring>::const_iterator it = substring.begin(); it != substring.end(); ++it)
340
341
342
343
344
  {
    size_t const prefix_diff = abs((it->reference_index - reference_start) - (it->sample_index - sample_start));
    size_t const suffix_diff = abs((reference_end - (it->reference_index + it->length)) - (sample_end - (it->sample_index + it->length)));
    if (prefix_diff + suffix_diff < diff)
    {
345
      // A better fitting LCS.
346
347
348
349
350
      diff = prefix_diff + suffix_diff;
      lcs = it;
    } // if
  } // for

351
352

  // Add some weight for reverse complement LCS.
J.K. Vis's avatar
J.K. Vis committed
353
  if (lcs->reverse_complement)
jkvis's avatar
jkvis committed
354
  {
J.K. Vis's avatar
J.K. Vis committed
355
    weight = 2 * weight_position + WEIGHT_SEPARATOR + WEIGHT_INVERSION;
jkvis's avatar
jkvis committed
356
357
  } // if

J.K. Vis's avatar
J.K. Vis committed
358
359
360

#if defined(__debug__)
  fprintf(stderr, "  LCS (x%ld)\n", substring.size());
361
  for (std::vector<Substring>::const_iterator it = substring.begin() ; it != substring.end(); ++it)
jkvis's avatar
jkvis committed
362
  {
J.K. Vis's avatar
J.K. Vis committed
363
    if (!it->reverse_complement)
jkvis's avatar
jkvis committed
364
    {
J.K. Vis's avatar
J.K. Vis committed
365
366
367
368
369
      fprintf(stderr, "    %ld--%ld: ", it->reference_index, it->reference_index + length);
      Dprint_truncated(reference, it->reference_index, it->reference_index + length);
      fprintf(stderr, " (%ld)\n    %ld--%ld: ", length, it->sample_index, it->sample_index + length);
      Dprint_truncated(sample, it->sample_index, it->sample_index + length);
      fprintf(stderr, " (%ld)", length);
jkvis's avatar
jkvis committed
370
    } // if
J.K. Vis's avatar
J.K. Vis committed
371
372
373
374
375
376
377
378
379
380
381
382
383
    else
    {
      fprintf(stderr, "    %ld--%ld: ", it->reference_index, it->reference_index + length);
      Dprint_truncated(complement, it->reference_index, it->reference_index + length);
      fprintf(stderr, " (%ld), (reverse complement)\n    %ld--%ld: ", length, it->sample_index, it->sample_index + length);
      Dprint_truncated(sample, it->sample_index, it->sample_index + length);
      fprintf(stderr, " (%ld)", length);
    } // else
    fputs("\n", stderr);
  } // for
#endif


384
  // Recursively apply this function to the prefixes of the strings.
J.K. Vis's avatar
J.K. Vis committed
385
386
387
  std::vector<Variant> prefix;
  weight += extractor(prefix, reference, complement, reference_start, lcs->reference_index, sample, sample_start, lcs->sample_index);

388
  // Stop if the weight of the variant exeeds the trivial weight.
J.K. Vis's avatar
J.K. Vis committed
389
390
  if (weight > weight_trivial)
  {
391
392
393
394
395
396
397
398
399
400
401
    weight = weight_trivial;

    // First, we check if we can match the inserted substring
    // somewhere in the complete reference string. This will
    // indicate a possible transposition.
    std::vector<Variant> transposition;
    size_t const weight_transposition = extractor_transposition(transposition, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, weight)  + 2 * weight_position + 3 * WEIGHT_SEPARATOR + WEIGHT_DELETION_INSERTION;


#if defined(__debug__)
  fprintf(stderr, "Transpositions: %ld (trivial: %ld)\n", weight_transposition, weight);
402
  for (std::vector<Variant>::const_iterator it = transposition.begin(); it != transposition.end(); ++it)
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
  {
    fprintf(stderr, "  %ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
  } // for
#endif


    // Add transpositions if any.
    if (weight > weight_transposition && transposition.size() > 0 && !(transposition.size() == 1 && transposition.front().type == SUBSTITUTION))
    {
      transposition.front().type |= TRANSPOSITION_OPEN;
      transposition.back().type |= TRANSPOSITION_CLOSE;
      variant.insert(variant.end(), transposition.begin(), transposition.end());
      return weight_transposition;
    } // if

    // This is an actual deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
419
420
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
jkvis's avatar
jkvis committed
421
422
  } // if

423
  // Recursively apply this function to the suffixes of the strings.
J.K. Vis's avatar
J.K. Vis committed
424
425
426
  std::vector<Variant> suffix;
  weight += extractor(suffix, reference, complement, lcs->reference_index + length, reference_end, sample, lcs->sample_index + length, sample_end);

427
  // Stop if the weight of the variant exeeds the trivial weight.
J.K. Vis's avatar
J.K. Vis committed
428
  if (weight > weight_trivial)
jkvis's avatar
jkvis committed
429
  {
430
431
432
433
434
435
436
437
438
439
440
    weight = weight_trivial;

    // First, we check if we can match the inserted substring
    // somewhere in the complete reference string. This will
    // indicate a possible transposition.
    std::vector<Variant> transposition;
    size_t const weight_transposition = extractor_transposition(transposition, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, weight)  + 2 * weight_position + 3 * WEIGHT_SEPARATOR + WEIGHT_DELETION_INSERTION;


#if defined(__debug__)
  fprintf(stderr, "Transpositions: %ld (trivial: %ld)\n", weight_transposition, weight);
441
  for (std::vector<Variant>::const_iterator it = transposition.begin(); it != transposition.end(); ++it)
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
  {
    fprintf(stderr, "  %ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
  } // for
#endif


    // Add transpositions if any.
    if (weight > weight_transposition && transposition.size() > 0 && !(transposition.size() == 1 && transposition.front().type == SUBSTITUTION))
    {
      transposition.front().type |= TRANSPOSITION_OPEN;
      transposition.back().type |= TRANSPOSITION_CLOSE;
      variant.insert(variant.end(), transposition.begin(), transposition.end());
      return weight_transposition;
    } // if

    // This is an actual deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
458
459
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
jkvis's avatar
jkvis committed
460
461
462
  } // if


463
  // Add all variants (in order) to the variant vector.
J.K. Vis's avatar
J.K. Vis committed
464
465
466
  variant.insert(variant.end(), prefix.begin(), prefix.end());

  if (!lcs->reverse_complement)
jkvis's avatar
jkvis committed
467
  {
J.K. Vis's avatar
J.K. Vis committed
468
    variant.push_back(Variant(lcs->reference_index, lcs->reference_index + length, lcs->sample_index, lcs->sample_index + length));
jkvis's avatar
jkvis committed
469
  } // if
J.K. Vis's avatar
J.K. Vis committed
470
471
472
473
474
475
476
477
478
479
  else
  {
    variant.push_back(Variant(lcs->reference_index, lcs->reference_index + length, lcs->sample_index, lcs->sample_index + length, REVERSE_COMPLEMENT, 2 * weight_position + WEIGHT_SEPARATOR + WEIGHT_INVERSION));
  } // else

  variant.insert(variant.end(), suffix.begin(), suffix.end());

  return weight;
} // extractor

480
481
482
483
// This function tries to extract transpositions from inserted
// sequences (insertions or deletion/insertions). Again we use a
// recursive method: extract the LCS and apply to the remaining prefix
// and suffix.
J.K. Vis's avatar
J.K. Vis committed
484
485
486
487
488
489
490
491
492
493
494
495
496
497
size_t extractor_transposition(std::vector<Variant> &variant,
                               char_t const* const   reference,
                               char_t const* const   complement,
                               size_t const          reference_start,
                               size_t const          reference_end,
                               char_t const* const   sample,
                               size_t const          sample_start,
                               size_t const          sample_end,
                               size_t const          weight_trivial)
{
  size_t const sample_length = sample_end - sample_start;

  size_t weight = 0;

jkvis's avatar
jkvis committed
498

jkvis's avatar
jkvis committed
499
#if defined(__debug__)
J.K. Vis's avatar
J.K. Vis committed
500
501
502
503
504
505
506
507
508
509
510
  fputs("Transposition extraction\n", stderr);
  fprintf(stderr, "  reference %ld--%ld:  ", 0ul, global_reference_length);
  Dprint_truncated(reference, 0, global_reference_length);
  fprintf(stderr, " (%ld)\n", global_reference_length);
  fprintf(stderr, "  complement %ld--%ld: ", 0ul, global_reference_length);
  Dprint_truncated(complement, 0, global_reference_length);
  fprintf(stderr, " (%ld)\n", global_reference_length);
  fprintf(stderr, "  sample %ld--%ld:     ", sample_start, sample_end);
  Dprint_truncated(sample, sample_start, sample_end);
  fprintf(stderr, " (%ld)\n", sample_length);
  fprintf(stderr, "  trivial weight: %ld\n", weight_trivial);
jkvis's avatar
jkvis committed
511
512
#endif

J.K. Vis's avatar
J.K. Vis committed
513

514
515
516
517
518
519
520
521
  // End transposition extraction if no more of the sample string
  // remains.
  if (sample_length <= 0)
  {
    return weight;
  } // if


522
  // Only consider large enough inserted regions (>> 1), based on
523
524
  // (average) description length of a position, otherwise it is just
  // a deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
525
  if (sample_length <= 2 * weight_position)
jkvis's avatar
jkvis committed
526
  {
527
528
529
    weight = sample_length * WEIGHT_BASE;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
J.K. Vis's avatar
J.K. Vis committed
530
531
  } // if

532
533

  // Extract the LCS (from the whole reference string).
534
  size_t const cut_off = global_reference_length < THRESHOLD_CUT_OFF ? 1 : TRANSPOSITION_CUT_OFF * sample_length;
J.K. Vis's avatar
J.K. Vis committed
535
  std::vector<Substring> substring;
J.K. Vis's avatar
J.K. Vis committed
536
  size_t const length = LCS(substring, reference, complement, 0, global_reference_length, sample, sample_start, sample_end, cut_off);
J.K. Vis's avatar
J.K. Vis committed
537

538

539
  // No LCS found: this is a deletion/insertion.
J.K. Vis's avatar
J.K. Vis committed
540
541
542
543
544
545
546
  if (length <= 0 || substring.size() <= 0)
  {
    weight = sample_length * WEIGHT_BASE;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
  } // if

547
  // We do NOT have to bother about a ``best fitting'' LCS here.
548
  std::vector<Substring>::const_iterator const lcs = substring.begin();
J.K. Vis's avatar
J.K. Vis committed
549

550
  // Update the weight of the transposition.
J.K. Vis's avatar
J.K. Vis committed
551
552
553
554
555
556
557
558
559
  weight += 2 * weight_position + WEIGHT_SEPARATOR;
  if (lcs->reverse_complement)
  {
    weight += WEIGHT_INVERSION;
  } // if


#if defined(__debug__)
  fprintf(stderr, "  LCS (x%ld)\n", substring.size());
560
  for (std::vector<Substring>::const_iterator it = substring.begin() ; it != substring.end(); ++it)
J.K. Vis's avatar
J.K. Vis committed
561
562
  {
    if (!it->reverse_complement)
jkvis's avatar
jkvis committed
563
    {
J.K. Vis's avatar
J.K. Vis committed
564
565
566
567
568
      fprintf(stderr, "    %ld--%ld: ", it->reference_index, it->reference_index + length);
      Dprint_truncated(reference, it->reference_index, it->reference_index + length);
      fprintf(stderr, " (%ld)\n    %ld--%ld: ", length, it->sample_index, it->sample_index + length);
      Dprint_truncated(sample, it->sample_index, it->sample_index + length);
      fprintf(stderr, " (%ld)", length);
jkvis's avatar
jkvis committed
569
    } // if
J.K. Vis's avatar
J.K. Vis committed
570
571
572
573
574
575
576
577
578
    else
    {
      fprintf(stderr, "    %ld--%ld: ", it->reference_index, it->reference_index + length);
      Dprint_truncated(complement, it->reference_index, it->reference_index + length);
      fprintf(stderr, " (%ld), (reverse complement)\n    %ld--%ld: ", length, it->sample_index, it->sample_index + length);
      Dprint_truncated(sample, it->sample_index, it->sample_index + length);
      fprintf(stderr, " (%ld)", length);
    } // else
    fputs("\n", stderr);
jkvis's avatar
jkvis committed
579
  } // for
J.K. Vis's avatar
J.K. Vis committed
580
#endif
jkvis's avatar
jkvis committed
581

jkvis's avatar
jkvis committed
582

583
  // Recursively apply this function to the prefixes of the strings
J.K. Vis's avatar
J.K. Vis committed
584
  std::vector<Variant> prefix;
585
  weight += extractor_transposition(prefix, reference, complement, reference_start, reference_end, sample, sample_start, lcs->sample_index, lcs->sample_index - sample_start) + WEIGHT_SEPARATOR;
J.K. Vis's avatar
J.K. Vis committed
586

587
  // Stop if the weight of the variant exeeds the trivial weight.
J.K. Vis's avatar
J.K. Vis committed
588
  if (weight > weight_trivial)
jkvis's avatar
jkvis committed
589
  {
J.K. Vis's avatar
J.K. Vis committed
590
591
592
593
594
    weight = sample_length * WEIGHT_BASE;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
  } // if

595
  // Recursively apply this function to the suffixes of the strings.
J.K. Vis's avatar
J.K. Vis committed
596
  std::vector<Variant> suffix;
597
  weight += extractor_transposition(suffix, reference, complement, reference_start, reference_end, sample, lcs->sample_index + length, sample_end, sample_end - (lcs->sample_index + length)) + WEIGHT_SEPARATOR;
J.K. Vis's avatar
J.K. Vis committed
598

599
  // Stop if the weight of the variant exeeds the trivial weight.
J.K. Vis's avatar
J.K. Vis committed
600
601
602
603
604
605
606
  if (weight > weight_trivial)
  {
    weight = sample_length * WEIGHT_BASE;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
  } // if

607
608

  // Add all variants (in order) to the variant vector.
J.K. Vis's avatar
J.K. Vis committed
609
610
611
612
613
  variant.insert(variant.end(), prefix.begin(), prefix.end());

  if (!lcs->reverse_complement)
  {
    variant.push_back(Variant(reference_start, reference_end, lcs->sample_index, lcs->sample_index + length, IDENTITY, 2 * weight_position + WEIGHT_SEPARATOR, lcs->reference_index, lcs->reference_index + length));
jkvis's avatar
jkvis committed
614
  } // if
615
616
  else
  {
J.K. Vis's avatar
J.K. Vis committed
617
    variant.push_back(Variant(reference_start, reference_end, lcs->sample_index, lcs->sample_index + length, REVERSE_COMPLEMENT, 2 * weight_position + WEIGHT_SEPARATOR + WEIGHT_INVERSION, lcs->reference_index, lcs->reference_index + length));
618
  } // else
jkvis's avatar
jkvis committed
619

J.K. Vis's avatar
J.K. Vis committed
620
621
  variant.insert(variant.end(), suffix.begin(), suffix.end());

622
  return weight;
J.K. Vis's avatar
J.K. Vis committed
623
624
} // extractor_transposition

J.K. Vis's avatar
J.K. Vis committed
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
// This is the recursive protein extractor function. It works as the
// regular extractor function, but no reverse complements nor
// transposion matching is used.
size_t extractor_protein(std::vector<Variant> &variant,
                         char_t const* const   reference,
                         size_t const          reference_start,
                         size_t const          reference_end,
                         char_t const* const   sample,
                         size_t const          sample_start,
                         size_t const          sample_end)
{
  size_t const reference_length = reference_end - reference_start;
  size_t const sample_length = sample_end - sample_start;

  // Assume this is a deletion/insertion.
  size_t const weight_trivial = weight_position + WEIGHT_DELETION_INSERTION + WEIGHT_BASE * sample_length + (reference_length != 1 ? weight_position + WEIGHT_SEPARATOR : 0);
  size_t weight = 0;


#if defined(__debug__)
  fputs("Extraction (protein)\n", stderr);
  fprintf(stderr, "  reference %ld--%ld:  ", reference_start, reference_end);
  Dprint_truncated(reference, reference_start, reference_end);
  fprintf(stderr, " (%ld)\n", reference_length);
  fprintf(stderr, "  sample %ld--%ld:     ", sample_start, sample_end);
  Dprint_truncated(sample, sample_start, sample_end);
  fprintf(stderr, " (%ld)\n", sample_length);
  fprintf(stderr, "  trivial weight: %ld\n", weight_trivial);
#endif


  // First some base cases to end the recursion.
  // No more reference string.
  if (reference_length <= 0)
  {
    // But some of the sample string is remaining: this is an
    // insertion.
    if (sample_length > 0)
    {
      weight = 2 * weight_position + WEIGHT_SEPARATOR + WEIGHT_INSERTION + WEIGHT_BASE * sample_length;
      variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    } // if
    return weight;
  } // if

  // Obviously there is a piece of reference string left, but no more
  // sample string: this is a deletion.
  if (sample_length <= 0)
  {
    weight = weight_position + WEIGHT_DELETION + (reference_length > 1 ? weight_position + WEIGHT_SEPARATOR : 0);
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
  } // if

  // Single substitution: a special case for HGVS.
  if (reference_length == 1 && sample_length == 1)
  {
    weight = weight_position + 2 * WEIGHT_BASE + WEIGHT_SUBSTITUTION;
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight));
    return weight;
  } // if


  // Calculate the LCS of the two strings.
  std::vector<Substring> substring;
  size_t const length = LCS_1(substring, reference, 0, reference_start, reference_end, sample, sample_start, sample_end);


  // No LCS found: this is a deletion/insertion.
  if (length <= 0 || substring.size() <= 0)
  {
    weight = weight_trivial;

    // This is an actual deletion/insertion.
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
  } // if


  // Pick the ``best fitting'' LCS, i.e., the remaining prefixes and
  // suffixes are of the same length.
  size_t diff = (reference_end - reference_start) + (sample_end - sample_start);
707
708
  std::vector<Substring>::const_iterator lcs = substring.begin();
  for (std::vector<Substring>::const_iterator it = substring.begin(); it != substring.end(); ++it)
J.K. Vis's avatar
J.K. Vis committed
709
710
711
712
713
714
715
716
717
718
719
720
721
722
  {
    size_t const prefix_diff = abs((it->reference_index - reference_start) - (it->sample_index - sample_start));
    size_t const suffix_diff = abs((reference_end - (it->reference_index + it->length)) - (sample_end - (it->sample_index + it->length)));
    if (prefix_diff + suffix_diff < diff)
    {
      // A better fitting LCS.
      diff = prefix_diff + suffix_diff;
      lcs = it;
    } // if
  } // for


#if defined(__debug__)
  fprintf(stderr, "  LCS (x%ld)\n", substring.size());
723
  for (std::vector<Substring>::const_iterator it = substring.begin() ; it != substring.end(); ++it)
J.K. Vis's avatar
J.K. Vis committed
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
  {
    fprintf(stderr, "    %ld--%ld: ", it->reference_index, it->reference_index + length);
    Dprint_truncated(reference, it->reference_index, it->reference_index + length);
    fprintf(stderr, " (%ld)\n    %ld--%ld: ", length, it->sample_index, it->sample_index + length);
    Dprint_truncated(sample, it->sample_index, it->sample_index + length);
    fprintf(stderr, " (%ld)", length);
    fputs("\n", stderr);
  } // for
#endif


  // Recursively apply this function to the prefixes of the strings.
  std::vector<Variant> prefix;
  weight += extractor_protein(prefix, reference, reference_start, lcs->reference_index, sample, sample_start, lcs->sample_index);

  // Stop if the weight of the variant exeeds the trivial weight.
  if (weight > weight_trivial)
  {
    weight = weight_trivial;

    // This is an actual deletion/insertion.
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
  } // if

  // Recursively apply this function to the suffixes of the strings.
  std::vector<Variant> suffix;
  weight += extractor_protein(suffix, reference, lcs->reference_index + length, reference_end, sample, lcs->sample_index + length, sample_end);

  // Stop if the weight of the variant exeeds the trivial weight.
  if (weight > weight_trivial)
  {
    weight = weight_trivial;

    // This is an actual deletion/insertion.
    variant.push_back(Variant(reference_start, reference_end, sample_start, sample_end, SUBSTITUTION, weight_trivial));
    return weight_trivial;
  } // if


  // Add all variants (in order) to the variant vector.
  variant.insert(variant.end(), prefix.begin(), prefix.end());
  variant.push_back(Variant(lcs->reference_index, lcs->reference_index + length, lcs->sample_index, lcs->sample_index + length));
  variant.insert(variant.end(), suffix.begin(), suffix.end());

  return weight;
} // extractor_protein

772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
void extractor_frame_shift(std::vector<Variant> &annotation,
                           char_t const* const   reference,
                           size_t const          reference_start,
                           size_t const          reference_end,
                           char_t const* const   sample,
                           size_t const          sample_start,
                           size_t const          sample_end)
{
  size_t const reference_length = reference_end - reference_start;
  size_t const sample_length = sample_end - sample_start;


#if defined(__debug__)
  fputs("Extraction (frame shift)\n", stderr);
  fprintf(stderr, "  reference %ld--%ld:  ", reference_start, reference_end);
  Dprint_truncated(reference, reference_start, reference_end);
  fprintf(stderr, " (%ld)\n", reference_length);
  fprintf(stderr, "  sample %ld--%ld:     ", sample_start, sample_end);
  Dprint_truncated(sample, sample_start, sample_end);
  fprintf(stderr, " (%ld)\n", sample_length);
#endif


  // First the base cases to end the recursion.
796
  if (reference_length <= 0 || sample_length <= 0)
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
  {
    return;
  } // if


  // Calculate the frame shift LCS of the two strings.
  std::vector<Substring> substring;
  LCS_frame_shift(substring, reference, reference_start, reference_end, sample, sample_start, sample_end);


  // Pick the ``best fitting'' frame shift LCS, i.e., pushed as far to
  // the start of the reference string as possible. Also update in
  // case of compound frame shifts.
  Substring lcs(0, 0, 0, FRAME_SHIFT_NONE);
  for (size_t i = 0; i < 5; ++i)
  {
    if (substring[i].length > lcs.length ||
        (substring[i].length == lcs.length && substring[i].reference_index < lcs.reference_index))
    {
      lcs = substring[i];
    } // if
    // Update compound frame shifts, e.g.,
    // FRAME_SHIFT_1 | FRAME_SHIFT_2
    else if (substring[i].length == lcs.length &&
             substring[i].reference_index == lcs.reference_index &&
             substring[i].sample_index == lcs.sample_index)
    {
      lcs.type |= substring[i].type;
    } // if
  } // for


  // No LCS found: no frame shift annotation.
  if (lcs.length <= 0)
  {
    return;
  } // if


836
837
838
839
840
841
842
843
844
845
846
#if defined(__debug__)
  fprintf(stderr, "  LCS type = %d\n", lcs.type);
  fprintf(stderr, "    %ld--%ld: ", lcs.reference_index, lcs.reference_index + lcs.length);
  Dprint_truncated(reference, lcs.reference_index, lcs.reference_index + lcs.length);
  fprintf(stderr, " (%ld)\n    %ld--%ld: ", lcs.length, lcs.sample_index, lcs.sample_index + lcs.length);
  Dprint_truncated(sample, lcs.sample_index, lcs.sample_index + lcs.length);
  fprintf(stderr, " (%ld)", lcs.length);
  fputs("\n", stderr);
#endif


847
  // Calculate the frame shift probability.
J.K. Vis's avatar
J.K. Vis committed
848
  double probability = 1.f;
849
850
  for (size_t i = 0; i < lcs.length; ++i)
  {
J.K. Vis's avatar
J.K. Vis committed
851
    double probability_compound = .0f;
852
853
    if ((lcs.type & FRAME_SHIFT_1) == FRAME_SHIFT_1)
    {
J.K. Vis's avatar
J.K. Vis committed
854
      probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][0];
855
856
857
    } // if
    if ((lcs.type & FRAME_SHIFT_2) == FRAME_SHIFT_2)
    {
J.K. Vis's avatar
J.K. Vis committed
858
      probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][1];
859
860
861
    } // if
    if ((lcs.type & FRAME_SHIFT_REVERSE) == FRAME_SHIFT_REVERSE)
    {
J.K. Vis's avatar
J.K. Vis committed
862
      probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i] & 0x7f][2];
863
864
865
    } // if
    if ((lcs.type & FRAME_SHIFT_REVERSE_1) == FRAME_SHIFT_REVERSE_1)
    {
J.K. Vis's avatar
J.K. Vis committed
866
      probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][3];
867
868
869
    } // if
    if ((lcs.type & FRAME_SHIFT_REVERSE_2) == FRAME_SHIFT_REVERSE_2)
    {
J.K. Vis's avatar
J.K. Vis committed
870
      probability_compound += frame_shift_frequency[reference[lcs.reference_index + i] & 0x7f][reference[lcs.reference_index + i + 1] & 0x7f][4];
871
    } // if
J.K. Vis's avatar
J.K. Vis committed
872
    probability *= probability_compound;
873
874
875
  } // for


876
877
878
879
880
881
882
883
884
885
886
887
  // Recursively apply this function to the prefixes of the strings.
  std::vector<Variant> prefix;
  extractor_frame_shift(prefix, reference, reference_start, lcs.reference_index, sample, sample_start, lcs.sample_index);


  // Recursively apply this function to the suffixes of the strings.
  std::vector<Variant> suffix;
  extractor_frame_shift(suffix, reference, lcs.reference_index + lcs.length, reference_end, sample, lcs.sample_index + lcs.length, sample_end);


  // Add all variants (in order) to the annotation vector.
  annotation.insert(annotation.end(), prefix.begin(), prefix.end());
J.K. Vis's avatar
J.K. Vis committed
888
889
890
  Variant variant(lcs.reference_index, lcs.reference_index + lcs.length, lcs.sample_index, lcs.sample_index + lcs.length, FRAME_SHIFT | lcs.type);
  variant.probability = probability;
  annotation.push_back(variant);
891
892
893
894
  annotation.insert(annotation.end(), suffix.begin(), suffix.end());

  return;
} // extractor_frame_shift
J.K. Vis's avatar
J.K. Vis committed
895

896
897
898
// This function calculates the LCS using the LCS_k function by
// choosing an initial k and reducing it if necessary until the
// strings represent random strings modeled by a threshold value.
J.K. Vis's avatar
J.K. Vis committed
899
900
901
902
903
904
905
size_t LCS(std::vector<Substring> &substring,
           char_t const* const     reference,
           char_t const* const     complement,
           size_t const            reference_start,
           size_t const            reference_end,
           char_t const* const     sample,
           size_t const            sample_start,
J.K. Vis's avatar
J.K. Vis committed
906
907
           size_t const            sample_end,
           size_t const            cut_off)
J.K. Vis's avatar
J.K. Vis committed
908
909
910
911
912
{
  size_t const reference_length = reference_end - reference_start;
  size_t const sample_length = sample_end - sample_start;


913
  // The initial k.
J.K. Vis's avatar
J.K. Vis committed
914
915
  size_t k = reference_length > sample_length ? sample_length / 4 : reference_length / 4;

916
917

  // Reduce k until the cut-off is reached.
J.K. Vis's avatar
J.K. Vis committed
918
919
920
921
922
923
924
925
926
  while (k > 4 && k > cut_off)
  {


#if defined(__debug__)
  fprintf(stderr, "  k = %ld (cut-off: %ld)\n", k, cut_off);
#endif


927
    // Try to find a LCS with k.
J.K. Vis's avatar
J.K. Vis committed
928
929
930
931
932
933
934
935
936
937
938
    substring.clear();
    size_t const length = LCS_k(substring, reference, complement, reference_start, reference_end, sample, sample_start, sample_end, k);

    // A LCS of sufficient length has been found.
    if (length >= 2 * k && substring.size() > 0)
    {
      return length;
    } // if
    k /= 3;
  } // while

939
  // Cut-off: no LCS found.
J.K. Vis's avatar
J.K. Vis committed
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
  if (cut_off > 1)
  {


#if defined(__debug__)
  fprintf(stderr, "  cut-off\n");
#endif


    substring.clear();
    return 0;
  } // if


#if defined(__debug__)
  fprintf(stderr, "  k = 1\n");
#endif


959
  // As a last resort try running the classical algorithm.
J.K. Vis's avatar
J.K. Vis committed
960
961
962
  return LCS_1(substring, reference, complement, reference_start, reference_end, sample, sample_start, sample_end);
} // LCS

963
964
// Calculate the LCS in the well-known way using dynamic programming.
// NOT suitable for large strings.
J.K. Vis's avatar
J.K. Vis committed
965
966
967
968
969
970
971
972
size_t LCS_1(std::vector<Substring> &substring,
             char_t const* const     reference,
             char_t const* const     complement,
             size_t const            reference_start,
             size_t const            reference_end,
             char_t const* const     sample,
             size_t const            sample_start,
             size_t const            sample_end)
jkvis's avatar
jkvis committed
973
974
975
{
  size_t const reference_length = reference_end - reference_start;
  size_t const sample_length = sample_end - sample_start;
J.K. Vis's avatar
J.K. Vis committed
976
  bool reverse_complement = false;
jkvis's avatar
jkvis committed
977

jkvis's avatar
jkvis committed
978
979
  // Just a fancy way of allocation a continuous 2D array in heap
  // space.
jkvis's avatar
jkvis committed
980
981
982
  typedef size_t array[2][reference_length];
  array &LCS_line = *(reinterpret_cast<array*>(new size_t[2 * reference_length]));
  array &LCS_line_rc = *(reinterpret_cast<array*>(new size_t[2 * reference_length]));
jkvis's avatar
jkvis committed
983
984

  size_t length = 0;
jkvis's avatar
jkvis committed
985
986
987

  // Filling the LCS matrix (actually only the current and the
  // previous row).
jkvis's avatar
jkvis committed
988
989
990
991
  for (size_t i = 0; i < sample_length; ++i)
  {
    for (size_t j = 0; j < reference_length; ++j)
    {
jkvis's avatar
jkvis committed
992
      // A match
jkvis's avatar
jkvis committed
993
      if (reference[reference_start + j] == sample[sample_start + i])
jkvis's avatar
jkvis committed
994
      {
jkvis's avatar
jkvis committed
995
        if (i == 0 || j == 0)
jkvis's avatar
jkvis committed
996
        {
jkvis's avatar
jkvis committed
997
          LCS_line[i % 2][j] = 1;
jkvis's avatar
jkvis committed
998
999
1000
        } // if
        else
        {