extractor.h 30.2 KB
Newer Older
jkvis's avatar
jkvis committed
1
2
3
4
5
6
7
8
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
//   File:     extractor.h (implemented in extractor.cc)
//   Author:   Jonathan K. Vis
// *******************************************************************
// DESCRIPTION:
jkvis's avatar
jkvis committed
9
//   This library can be used to generate HGVS variant descriptions as
jkvis's avatar
jkvis committed
10
11
12
13
14
15
//   accepted by the Mutalyzer Name Checker.
// *******************************************************************

#if !defined(__extractor_h__)
#define __extractor_h__

J.K. Vis's avatar
J.K. Vis committed
16
#include <cmath>
jkvis's avatar
jkvis committed
17
#include <cstddef>
J.K. Vis's avatar
J.K. Vis committed
18
#include <cstdlib>
jkvis's avatar
jkvis committed
19
20
#include <vector>

J.K. Vis's avatar
J.K. Vis committed
21
22
23
24
25
26

#if defined(__debug__)
#include <cstdio>
#endif


jkvis's avatar
jkvis committed
27
28
29
namespace mutalyzer
{

30
// Version string of the library, intended for run-time identification.
// The constant has internal linkage (static), so each translation unit
// that includes this header gets its own copy of the pointer.
31
static char const* const VERSION = "2.3.1";
J.K. Vis's avatar
J.K. Vis committed
32
33


jkvis's avatar
jkvis committed
34
// The character type used for all strings in this interface; every
// string argument declared below is expressed in terms of char_t.
35
// For now it is simply an alias for char.
J.K. Vis's avatar
J.K. Vis committed
36
37
38
typedef char char_t;


39
40
41
42
43
// Integer types of fixed bit width used for frame shift calculation.
// They are declared inside this namespace in terms of built-in types.
// The language only guarantees unsigned char to be at least 8 bits and
// unsigned long long to be at least 64 bits wide, so the exact widths
// are an assumption about the target platform.
typedef unsigned char       uint8_t;
typedef unsigned long long uint64_t;


44
45
46
47
48
49
50
51
52
53
// *******************************************************************
// Variant Extraction
//   These functions are used to extract variants (regions of change)
//   between two strings.
// *******************************************************************


// These constants specify the type of the strings to be extracted and
// are the values expected by the type argument of the extract
// functions. The extractor is primarily focused on DNA/RNA. When
// TYPE_PROTEIN (or another value) is used no complement string is
J.K. Vis's avatar
J.K. Vis committed
54
55
56
// constructed and no reverse complement is calculated. For
// TYPE_PROTEIN frame shift detection is applied on
// deletions/insertions.
57
static int const TYPE_DNA     = 0; // DNA/RNA (default)
J.K. Vis's avatar
J.K. Vis committed
58
59
static int const TYPE_PROTEIN = 1; // Protein
static int const TYPE_OTHER   = 2; // Other strings
60

J.K. Vis's avatar
J.K. Vis committed
61

62
63
64
// These constants can be used to determine the type of a variant; they
// are the values stored in the type member of the Variant structure.
// Substitution covers most: deletions, insertions, substitutions, and
// insertion/deletions. Identity is used to describe the unchanged
65
66
67
68
// (matched) regions. The constants are coded as bitfields and should
// be appropriately combined, e.g., IDENTITY | TRANSPOSITION_OPEN for
// describing a real transposition. Note that some combinations do NOT
// make sense, e.g., SUBSTITUTION | REVERSE_COMPLEMENT.
69
70
71
72
73
static unsigned int const IDENTITY            = 0x01;
static unsigned int const REVERSE_COMPLEMENT  = 0x02;
static unsigned int const SUBSTITUTION        = 0x04;
static unsigned int const TRANSPOSITION_OPEN  = 0x08;
static unsigned int const TRANSPOSITION_CLOSE = 0x10;
74
static unsigned int const FRAME_SHIFT         = 0x20;
75
76
77
78
79
80
81
82
83
84
85
86
87


// These constants describe the actual frame shift type. The constants
// are coded as bitfields and can be appropriately combined in case of
// a compound frame shift, e.g., FRAME_SHIFT_1 | FRAME_SHIFT_2.
// When used within a Variant structure these constants should be
// combined with the FRAME_SHIFT constant.
// NOTE(review): the numeric values overlap with the variant type
// constants (e.g., FRAME_SHIFT_1 and IDENTITY are both 0x01); how the
// two sets are told apart inside a Variant is not visible in this
// header and should be confirmed against the implementation.
static uint8_t const FRAME_SHIFT_NONE      = 0x00;
static uint8_t const FRAME_SHIFT_1         = 0x01;
static uint8_t const FRAME_SHIFT_2         = 0x02;
static uint8_t const FRAME_SHIFT_REVERSE   = 0x04;
static uint8_t const FRAME_SHIFT_REVERSE_1 = 0x08;
static uint8_t const FRAME_SHIFT_REVERSE_2 = 0x10;
jkvis's avatar
jkvis committed
88
89


90
91
92
93
94
95
96
97
// These constants are used in calculating the weight of the generated
// description and consequently used to end the description process
// when a certain "trivial" weight is exceeded. The weight constants
// are based on their HGVS description lengths, i.e., the number of
// characters used. The weight_position variable is used to have a
// constant weight for a position description regardless of the actual
// position. It is usually set to ceil(log10(|reference| / 4)), and
// its intention is to be constant during an extraction run. The
// variable is only declared here (extern); its definition is not part
// of this header.
J.K. Vis's avatar
J.K. Vis committed
98
extern size_t       weight_position;
jkvis's avatar
jkvis committed
99

J.K. Vis's avatar
J.K. Vis committed
100
static size_t const WEIGHT_BASE               = 1; // i.e., A, C, G, T
101
102
103
104
105
106
static size_t const WEIGHT_DELETION           = 3; // i.e., del
static size_t const WEIGHT_DELETION_INSERTION = 6; // i.e., delins
static size_t const WEIGHT_INSERTION          = 3; // i.e., ins
static size_t const WEIGHT_INVERSION          = 3; // i.e., inv
static size_t const WEIGHT_SEPARATOR          = 1; // i.e., _, [, ], ;
static size_t const WEIGHT_SUBSTITUTION       = 1; // i.e., >
jkvis's avatar
jkvis committed
107

108

109
// Cut-off constants. The threshold is used to specify the maximum
J.K. Vis's avatar
J.K. Vis committed
110
111
112
// reference length without any cut-off. Otherwise the weight_position
// is used as a cut-off. For transpositions the cut-off is specified
// as a fraction of the sample_length.
J.K. Vis's avatar
J.K. Vis committed
113
static size_t const THRESHOLD_CUT_OFF     = 16000; // maximum reference length without cut-off
114
static double const TRANSPOSITION_CUT_OFF =   0.1; // fraction of the sample length
J.K. Vis's avatar
J.K. Vis committed
115
116


117
118
119
// This global variable, judging by its name and type, holds the length
// of the whole reference string so that it is available at any point
// in the extraction process. Commonly used in transposition
// extraction. It is only declared here (extern); its definition is not
// part of this header.
J.K. Vis's avatar
J.K. Vis committed
120
extern size_t global_reference_length;
jkvis's avatar
jkvis committed
121

122

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// *******************************************************************
// Variant structure
//   This structure describes a variant (region of change).
//
//   @member reference_start: starting position of the variant within
//                            the reference string
//   @member reference_end: ending position of the variant within the
//                          reference string
//   @member sample_start: starting position of the variant within the
//                         sample string
//   @member sample_end: ending position of the variant within the
//                       sample string
//   @member type: type of the variant described using the
//                 constants above
//   @member weight: weight of the variant according to the weight
//                   constants above (used internally)
//   @member transposition_start: starting position of a transposition
J.K. Vis's avatar
J.K. Vis committed
140
//                                within the reference string
141
//   @member transposition_end: ending position of a transposition
J.K. Vis's avatar
J.K. Vis committed
142
//                              within the reference string
143
// *******************************************************************
jkvis's avatar
jkvis committed
144
145
struct Variant
{
  size_t       reference_start; // start of the variant in the reference
  size_t       reference_end;   // end of the variant in the reference
  size_t       sample_start;    // start of the variant in the sample
  size_t       sample_end;      // end of the variant in the sample
  unsigned int type;            // combination of the variant type constants

  // weight and probability share the same storage; only the member
  // written last may be read. The constructors below always initialize
  // weight.
  // NOTE(review): the intended use of probability is not visible in
  // this header.
  union
  {
    size_t     weight;
    double     probability;
  }; // union

  size_t       transposition_start; // start of a transposition in the reference
  size_t       transposition_end;   // end of a transposition in the reference

  // Constructs a variant from its positions. The type defaults to
  // IDENTITY, the weight and both transposition positions default to 0.
  inline Variant(size_t const       reference_start,
                 size_t const       reference_end,
                 size_t const       sample_start,
                 size_t const       sample_end,
                 unsigned int const type                = IDENTITY,
                 size_t const       weight              = 0,
                 size_t const       transposition_start = 0,
                 size_t const       transposition_end   = 0):
         reference_start(reference_start),
         reference_end(reference_end),
         sample_start(sample_start),
         sample_end(sample_end),
         type(type),
         weight(weight),
         transposition_start(transposition_start),
         transposition_end(transposition_end) { }

  // Default constructor. Every member is initialized so that reading a
  // default-constructed Variant is well defined; the resulting object
  // is identical to Variant(0, 0, 0, 0).
  inline Variant(void):
         reference_start(0),
         reference_end(0),
         sample_start(0),
         sample_end(0),
         type(IDENTITY),
         weight(0),
         transposition_start(0),
         transposition_end(0) { }
}; // Variant

179
180
181
182
183
184
185
186
187
188
189
190
191
192
// *******************************************************************
// Variant_List structure
//   This structure bundles a list of variants with associated
//   metadata. It is the return type of the extract overload that
//   serves as the interface for Python.
//
//   @member weight_position: weight used for position descriptors
//   @member variants: vector of variants
//
//   The structure declares no constructor, so weight_position has no
//   defined value after default-initialization until it is assigned.
// *******************************************************************
struct Variant_List
{
  size_t               weight_position; // weight used for position descriptors
  std::vector<Variant> variants;        // the variants themselves
}; // Variant_List

193
194
195
196
197
198
199
200
201
202
// *******************************************************************
// extract function
//   This function is the interface function for Python. It is just a
//   wrapper for the C++ extract function below; instead of filling a
//   vector passed by reference it returns a Variant_List by value.
//
//   @arg reference: reference string
//   @arg reference_length: length of the reference string
//   @arg sample: sample string
//   @arg sample_length: length of the sample string
//   @arg type: type of strings, one of TYPE_DNA, TYPE_PROTEIN and
//              TYPE_OTHER; defaults to TYPE_DNA
//                               0 --- DNA/RNA (default)
J.K. Vis's avatar
J.K. Vis committed
203
204
//                               1 --- Protein
//                               2 --- Other
J.K. Vis's avatar
J.K. Vis committed
205
206
207
//   @arg codon_string: serialized codon table: 64 characters
//                      corresponding to the codons AAA, ..., TTT.
//                      Only for protein extraction (frame shifts);
//                      defaults to 0 (null pointer).
208
//   @return: variant list with metadata
209
// *******************************************************************
210
211
212
213
Variant_List extract(char_t const* const reference,
                     size_t const        reference_length,
                     char_t const* const sample,
                     size_t const        sample_length,
J.K. Vis's avatar
J.K. Vis committed
214
                     int const           type         = TYPE_DNA,
J.K. Vis's avatar
J.K. Vis committed
215
                     char_t const* const codon_string = 0);
J.K. Vis's avatar
J.K. Vis committed
216

217
218
219
220
221
222
223
224
225
226
227
228
229
// *******************************************************************
// extract function
//   This function extracts the variants (regions of change) between
//   the reference and the sample string. It automatically constructs
//   the reverse complement string for the reference string if the
//   string type is DNA/RNA.
//
//   @arg variant: vector of variants (non-const reference; presumably
//                 receives the extracted variants)
//   @arg reference: reference string
//   @arg reference_length: length of the reference string
//   @arg sample: sample string
//   @arg sample_length: length of the sample string
//   @arg type: type of strings, one of TYPE_DNA, TYPE_PROTEIN and
//              TYPE_OTHER; defaults to TYPE_DNA
//                               0 --- DNA/RNA (default)
J.K. Vis's avatar
J.K. Vis committed
230
231
//                               1 --- Protein
//                               2 --- Other
J.K. Vis's avatar
J.K. Vis committed
232
233
234
//   @arg codon_string: serialized codon table: 64 characters
//                      corresponding to the codons AAA, ..., TTT.
//                      Only for protein extraction (frame shifts);
//                      defaults to 0 (null pointer).
235
236
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
237
238
239
240
241
size_t extract(std::vector<Variant> &variant,
               char_t const* const   reference,
               size_t const          reference_length,
               char_t const* const   sample,
               size_t const          sample_length,
J.K. Vis's avatar
J.K. Vis committed
242
               int const             type         = TYPE_DNA,
J.K. Vis's avatar
J.K. Vis committed
243
               char_t const* const   codon_string = 0);
J.K. Vis's avatar
J.K. Vis committed
244

245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
// *******************************************************************
// extractor function
//   This function extracts the variants (regions of change) between
//   the given ranges of the reference and the sample string by
//   recursively calling itself on prefixes and suffixes of a longest
//   common substring.
//
//   @arg variant: vector of variants (non-const reference; presumably
//                 receives the extracted variants)
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
262
263
264
265
266
267
268
269
270
size_t extractor(std::vector<Variant> &variant,
                 char_t const* const   reference,
                 char_t const* const   complement,
                 size_t const          reference_start,
                 size_t const          reference_end,
                 char_t const* const   sample,
                 size_t const          sample_start,
                 size_t const          sample_end);

271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
// *******************************************************************
// extractor_transposition function
//   This function extracts the variants (regions of change) between
//   a part of the sample string classified as an insertion and the
//   whole reference string.
//
//   @arg variant: vector of variants (non-const reference; presumably
//                 receives the extracted variants)
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//                         used for the deletion part
//   @arg reference_end: ending position in the reference string used
//                       for the deletion part
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @arg weight_trivial: trivial weight to describe the transposition
//                        as a normal insertion (used for ending the
//                        extraction process); defaults to 0
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
293
294
295
296
297
298
299
300
301
size_t extractor_transposition(std::vector<Variant> &variant,
                               char_t const* const   reference,
                               char_t const* const   complement,
                               size_t const          reference_start,
                               size_t const          reference_end,
                               char_t const* const   sample,
                               size_t const          sample_start,
                               size_t const          sample_end,
                               size_t const          weight_trivial = 0);
jkvis's avatar
jkvis committed
302

J.K. Vis's avatar
J.K. Vis committed
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// *******************************************************************
// extractor_protein function
//   This function extracts the variants (regions of change) between
//   the reference and the sample protein string by recursively
//   calling itself on prefixes and suffixes of a longest common
//   substring, calculated by the LCS_1 algorithm (these strings are
//   very short). In contrast to the extractor function it takes no
//   complement string.
//
//   @arg variant: vector of variants (non-const reference; presumably
//                 receives the extracted variants)
//   @arg reference: reference string
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @return: weight of the extracted variants
// *******************************************************************
size_t extractor_protein(std::vector<Variant> &variant,
                         char_t const* const   reference,
                         size_t const          reference_start,
                         size_t const          reference_end,
                         char_t const* const   sample,
                         size_t const          sample_start,
                         size_t const          sample_end);

328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
// *******************************************************************
// extractor_frame_shift function
//   This function extracts the frame shift annotation between the
//   reference and the sample protein string by recursively calling
//   itself on prefixes and suffixes of a longest common substring,
//   calculated by the LCS_frame_shift algorithm (these strings are
//   very short). Nothing is returned; the annotation vector is the
//   only non-const argument.
//
//   @arg annotation: vector of variants (contains annotation)
//   @arg reference: reference string
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
// *******************************************************************
void extractor_frame_shift(std::vector<Variant> &annotation,
                           char_t const* const   reference,
                           size_t const          reference_start,
                           size_t const          reference_end,
                           char_t const* const   sample,
                           size_t const          sample_start,
                           size_t const          sample_end);

jkvis's avatar
jkvis committed
352

353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// *******************************************************************
// Longest Common Substring (LCS) calculation
//   These functions are useful for LCS calculation of (large similar)
//   strings.
// *******************************************************************


// *******************************************************************
// Substring structure
//   This structure describes a common substring between two strings.
//
//   @member reference_index: starting position of the substring
//                            within the reference sequence
//   @member sample_index: ending position of the substring within the
//                         sample sequence
//   @member length: length of the substring
//   @member reverse_complement: indicates a reverse complement
//                               substring (only for DNA/RNA)
371
372
//   @member type: (in union with @member reverse_complement)
//                 indicates the type of frame shift
373
// *******************************************************************
jkvis's avatar
jkvis committed
374
375
struct Substring
{
  size_t  reference_index; // position of the substring in the reference
  size_t  sample_index;    // position of the substring in the sample
  size_t  length;          // length of the substring

  // Both members share the same storage; which one is meaningful
  // depends on the constructor that was used.
  union
  {
    bool    reverse_complement; // reverse complement substring (DNA/RNA)
    uint8_t type;               // type of frame shift
  }; // union

  // Constructs a (reverse complement) substring; reverse_complement
  // defaults to false.
  inline Substring(size_t const  reference_index,
                   size_t const  sample_index,
                   size_t const  length,
                   bool const    reverse_complement = false):
         reference_index(reference_index),
         sample_index(sample_index),
         length(length),
         reverse_complement(reverse_complement) { }

  // Constructs a frame shift substring carrying a frame shift type.
  inline Substring(size_t const  reference_index,
                   size_t const  sample_index,
                   size_t const  length,
                   uint8_t const type):
         reference_index(reference_index),
         sample_index(sample_index),
         length(length),
         type(type) { }

  // Default constructor: an empty substring. All members are
  // initialized (not only length) so that reading a default-constructed
  // Substring is well defined; the result equals Substring(0, 0, 0).
  inline Substring(void):
         reference_index(0),
         sample_index(0),
         length(0),
         reverse_complement(false) { }
}; // Substring

406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
// *******************************************************************
// LCS function
//   This function calculates the longest common substrings between
//   the sample string and the reference string (and, if given, the
//   complement string) by choosing an initial k and calling the
//   LCS_k function. The k is automatically reduced if necessary until
//   the LCS of the strings approaches some cutoff threshold; the
//   cut_off argument defaults to 1.
//
//   @arg substring: vector of substrings (non-const reference;
//                   presumably receives the substrings found)
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
J.K. Vis's avatar
J.K. Vis committed
422
//   @arg cut_off: optional cut-off value for the k in LCS_k
423
424
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
425
426
427
428
429
430
431
size_t LCS(std::vector<Substring> &substring,
           char_t const* const     reference,
           char_t const* const     complement,
           size_t const            reference_start,
           size_t const            reference_end,
           char_t const* const     sample,
           size_t const            sample_start,
J.K. Vis's avatar
J.K. Vis committed
432
433
           size_t const            sample_end,
           size_t const            cut_off = 1);
J.K. Vis's avatar
J.K. Vis committed
434

435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
// *******************************************************************
// LCS_1 function
//   This function calculates the longest common substrings between
//   the sample string and the reference string (and, if given, the
//   complement string). It assumes no similarity between the
//   strings. Not for use on large strings. This is the classical
//   dynamic programming algorithm.
//
//   @arg substring: vector of substrings (non-const reference;
//                   presumably receives the substrings found)
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
453
454
455
456
457
458
459
460
461
size_t LCS_1(std::vector<Substring> &substring,
             char_t const* const     reference,
             char_t const* const     complement,
             size_t const            reference_start,
             size_t const            reference_end,
             char_t const* const     sample,
             size_t const            sample_start,
             size_t const            sample_end);

462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
// *******************************************************************
// LCS_k function
//   This function calculates the longest common substrings between
//   the sample string and the reference string (and, if given, the
//   complement string) by encoding the reference and complement
//   strings into non-overlapping k-mers and the sample string into
//   overlapping k-mers. This function can be used for large similar
//   strings. If the returned vector is empty or the length of the
//   substrings is less than or equal to 2k, try again with a smaller
//   k.
//
//   @arg substring: vector of substrings (non-const reference;
//                   presumably receives the substrings found)
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @arg k: size of the k-mers, must be greater than 1
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
483
484
485
486
487
488
489
490
491
492
size_t LCS_k(std::vector<Substring> &substring,
             char_t const* const     reference,
             char_t const* const     complement,
             size_t const            reference_start,
             size_t const            reference_end,
             char_t const* const     sample,
             size_t const            sample_start,
             size_t const            sample_end,
             size_t const            k);

493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
// *******************************************************************
// LCS_frame_shift function
//   This function calculates the frame shift LCS. Unlike the other
//   LCS functions it takes no complement string and returns nothing;
//   the substring vector is the only non-const argument.
//
//   @arg substring: vector of substrings
//   @arg reference: reference string
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
// *******************************************************************
void LCS_frame_shift(std::vector<Substring> &substring,
                     char_t const* const     reference,
                     size_t const            reference_start,
                     size_t const            reference_end,
                     char_t const* const     sample,
                     size_t const            sample_start,
                     size_t const            sample_end);

J.K. Vis's avatar
J.K. Vis committed
513

514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
// *******************************************************************
// General string matching functions
//   These functions are useful for string matching.
// *******************************************************************

// *******************************************************************
// string_match function
//   This function is more or less equivalent to the C function
//   strncmp, except that the result is a boolean (true on a match)
//   rather than an ordering.
//
//   @arg string_1: first string to be compared
//   @arg string_2: second string to be compared
//   @arg length: maximum length to be compared
//
//   @return: true iff string_1 and string_2 match for the given
//            length
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
530
531
532
533
bool string_match(char_t const* const string_1,
                  char_t const* const string_2,
                  size_t const        length);

534
535
536
537
538
539
540
541
542
543
544
545
546
// *******************************************************************
// string_match_reverse function
//   This function is very similar to the C function strncmp, but it
//   traverses string_1 from end to start while traversing string_2
//   from start to end (useful for the reverse complement in DNA/RNA).
//   The result is a boolean (true on a match) rather than an
//   ordering.
//
//   @arg string_1: first string to be compared
//   @arg string_2: second string to be compared
//   @arg length: maximum length to be compared
//
//   @return: true iff string_1 and string_2 match in their respective
//            directions for the given length
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
547
548
549
550
bool string_match_reverse(char_t const* const string_1,
                          char_t const* const string_2,
                          size_t const        length);

551
552
553
554
555
556
557
558
559
560
561
562
563
// *******************************************************************
// prefix_match function
//   This function calculates the length (in characters) of the common
//   prefix between two strings. The result of this function is also
//   used as the prefix argument of the suffix_match function.
//
//   @arg reference: reference string
//   @arg reference_length: reference length
//   @arg sample: sample string
//   @arg sample_length: sample length
//
//   @return: the length of the common prefix
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
564
565
566
567
size_t prefix_match(char_t const* const reference,
                    size_t const        reference_length,
                    char_t const* const sample,
                    size_t const        sample_length);
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582

// *******************************************************************
// suffix_match function
//   This function calculates the length (in characters) of the common
//   suffix between two strings. It needs the calculated common
//   prefix (see the prefix_match function).
//
//   @arg reference: reference string
//   @arg reference_length: reference length
//   @arg sample: sample string
//   @arg sample_length: sample length
//   @arg prefix: length of the common prefix; defaults to 0
//
//   @return: the length of the common suffix
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
583
584
585
586
587
588
size_t suffix_match(char_t const* const reference,
                    size_t const        reference_length,
                    char_t const* const sample,
                    size_t const        sample_length,
                    size_t const        prefix = 0);

589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605

// *******************************************************************
// IUPAC Nucleic Acid Notation functions
//   These functions are useful for calculating the complement of DNA/
//   RNA strings.
// *******************************************************************


// *******************************************************************
// IUPAC_complement function
//   This function converts a single character in IUPAC Nucleic Acid
//   Notation into its complement. An overload that converts a whole
//   string is declared as well.
//
//   @arg base: character from the IUPAC Nucleic Acid Notation
//              alphabet
//   @return: its corresponding IUPAC complement for single bases only
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
606
char_t IUPAC_complement(char_t const base);
607
608
609
610
611
612
613
614
615
616
617
618
619

// *******************************************************************
// IUPAC_complement function
//   This function converts a string in IUPAC Nucleotide Acid Notation
//   into its complement. A new string is allocated, so deletion is
//   the responsibility of the caller.
//
//   @arg string: string in the IUPAC Nucleotide Acid Notation
//                alphabet
//   @arg length: number of characters in the string to convert (might
//                be less than the actual string length)
//   @return: string containing the complement of the input string
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
620
621
622
623
// String overload of IUPAC_complement. The returned pointer refers to
// const data, yet the caller owns the string (see the description
// above).
// NOTE(review): presumably released with delete[] -- confirm against
// the allocation in extractor.cc.
char_t const* IUPAC_complement(char_t const* const string,
                               size_t const        length);


J.K. Vis's avatar
J.K. Vis committed
624
625
626
627
628
// *******************************************************************
// Amino Acid functions
//   These functions are useful for calculating frame shifts.
// *******************************************************************

629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
// *******************************************************************
// initialize_frame_shift_map function
//   Precalculates the frame_shift_map based on a given codon string.
//
//   @arg codon_string: gives the amino acid symbols in codon order:
//                      0 AAA, ... 63 TTT.
// *******************************************************************
// Returns nothing.
// NOTE(review): the result is presumably stored in an internal
// frame_shift_map that frame_shift consults, and codon_string presumably
// needs at least 64 symbols (codons 0..63) -- confirm both in
// extractor.cc.
void initialize_frame_shift_map(char_t const* const codon_string);

// *******************************************************************
// calculate_frame_shift function
//   Used to precalculate the frame_shift_map. It computes for all
//   combinations of two reference amino acids the corresponding DNA
//   sequence and the (partial) overlap between all possible DNA
//   sequences of the sample amino acid.
//
//   @arg reference_1: first reference amino acid
//   @arg reference_2: second reference amino acid
//   @arg sample: sample amino acid
//   @return: frame shift
// *******************************************************************
650
651
652
// NOTE(review): the amino acids are passed as size_t here (but as char_t
// in frame_shift), so they are presumably indices rather than symbols --
// confirm in extractor.cc.
uint8_t calculate_frame_shift(size_t const reference_1,
                              size_t const reference_2,
                              size_t const sample);
653

J.K. Vis's avatar
J.K. Vis committed
654
655
656
657
658
// *******************************************************************
// frame_shift function
//   This function calculates the frame shift. A reference amino acid
//   is checked against two possible partial overlaps between every
//   combination of two sample (observed) amino acids. Possible
659
//   results are defined as FRAME_SHIFT constants.
J.K. Vis's avatar
J.K. Vis committed
660
//
661
662
663
//   @arg reference_1: first reference amino acid
//   @arg reference_2: second reference amino acid
//   @arg sample: sample amino acid
J.K. Vis's avatar
J.K. Vis committed
664
665
//   @return: frame shift
// *******************************************************************
666
667
668
// The amino acids are passed as char_t symbols; the result is a uint8_t.
// NOTE(review): the description above speaks of one reference and two
// sample amino acids, whereas the parameters are two reference amino
// acids and one sample amino acid -- confirm which is intended.
uint8_t frame_shift(char_t const reference_1,
                    char_t const reference_2,
                    char_t const sample);
J.K. Vis's avatar
J.K. Vis committed
669
670


J.K. Vis's avatar
J.K. Vis committed
671
#if defined(__debug__)
672
673
674
675
676
677
678
679
680
681
682
683
// *******************************************************************
// Dprint_truncated function
//   Debug function for printing large strings in truncated form: a
//   prefix of a certain length ... a suffix of the same length.
//
//   @arg string: string to be printed
//   @arg start: starting position in the string
//   @arg end: ending position in the string
//   @arg length: length of the prefix and suffix
//   @arg stream: file stream to print to
//   @return: the length of the printed string
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
684
685
686
687
688
// Only declared when __debug__ is defined. Both length (40) and stream
// (stderr) have defaults, so a call needs only string, start and end.
size_t Dprint_truncated(char_t const* const string,
                        size_t const        start,
                        size_t const        end,
                        size_t const        length = 40,
                        FILE*               stream = stderr);
J.K. Vis's avatar
J.K. Vis committed
689
690
691

// *******************************************************************
// Dprint_codon function
//   NOTE(review): this header is inferred from the declaration alone;
//   confirm every point against the definition in extractor.cc.
//   Presumably a debug function for printing a codon. Only declared
//   when __debug__ is defined.
//
//   @arg index: presumably the index of the codon in codon order
//               (0 AAA, ... 63 TTT) -- TODO confirm
//   @arg stream: presumably the file stream to print to; defaults to
//                stderr
//   @return: presumably the length of the printed string -- TODO
//            confirm
// *******************************************************************
size_t Dprint_codon(size_t const index,
                    FILE*        stream = stderr);
J.K. Vis's avatar
J.K. Vis committed
692
#endif
jkvis's avatar
jkvis committed
693

jkvis's avatar
jkvis committed
694

J.K. Vis's avatar
J.K. Vis committed
695
} // mutalyzer
jkvis's avatar
jkvis committed
696
697
698

#endif