extractor.h 22.8 KB
Newer Older
jkvis's avatar
jkvis committed
1
// *******************************************************************
jkvis's avatar
jkvis committed
2
//   (C) Copyright 2014 Leiden Institute of Advanced Computer Science
jkvis's avatar
jkvis committed
3
4
5
6
7
8
9
10
//   Universiteit Leiden
//   All Rights Reserved
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
//   File:     extractor.h (implemented in extractor.cc)
//   Author:   Jonathan K. Vis
J.K. Vis's avatar
J.K. Vis committed
11
12
//   Revision: 2.1.4
//   Date:     2014/08/21
jkvis's avatar
jkvis committed
13
14
// *******************************************************************
// DESCRIPTION:
jkvis's avatar
jkvis committed
15
//   This library can be used to generate HGVS variant descriptions as
jkvis's avatar
jkvis committed
16
17
18
19
20
21
//   accepted by the Mutalyzer Name Checker.
// *******************************************************************

#if !defined(__extractor_h__)
#define __extractor_h__

J.K. Vis's avatar
J.K. Vis committed
22
#include <cmath>
jkvis's avatar
jkvis committed
23
#include <cstddef>
J.K. Vis's avatar
J.K. Vis committed
24
#include <cstdlib>
jkvis's avatar
jkvis committed
25
26
#include <vector>

J.K. Vis's avatar
J.K. Vis committed
27
28
29
30
31
32

#if defined(__debug__)
#include <cstdio>
#endif


jkvis's avatar
jkvis committed
33
34
35
namespace mutalyzer
{

36
// Version string for run-time identification.
J.K. Vis's avatar
J.K. Vis committed
37
static char const* const VERSION = "2.1.4";
J.K. Vis's avatar
J.K. Vis committed
38
39


jkvis's avatar
jkvis committed
40
// The character type used for all strings. For now it should just be
41
// a char.
J.K. Vis's avatar
J.K. Vis committed
42
43
44
typedef char char_t;


45
46
47
48
49
50
51
52
53
54
55
56
57
// *******************************************************************
// Variant Extraction
//   These functions are used to extract variants (regions of change)
//   between two strings.
// *******************************************************************


// These constants can be used to specify the type of string to be
// extracted. The extractor is primarily focussed on DNA/RNA. When
// TYPE_PROTEIN (or another value) is used no complement string is
// constructed and no reverse complement is calculated.
static int const TYPE_DNA     = 0; // DNA/RNA (default)
static int const TYPE_PROTEIN = 1; // Protein or other strings
58

J.K. Vis's avatar
J.K. Vis committed
59

60
61
62
// These constants can be used to deterimine the type of variant.
// Substitution covers most: deletions, insertions, substitutions, and
// insertion/deletions. Indentity is used to describe the unchanged
63
64
65
66
// (matched) regions. The constants are coded as bitfields and should
// be appropriately combined, e.g., IDENTITY | TRANSPOSITION_OPEN for
// describing a real transposition. Note that some combinations do NOT
// make sense, e.g., SUBSTITUION | REVERSE_COMPLEMENT.
67
68
69
70
71
static unsigned int const IDENTITY            = 0x01;
static unsigned int const REVERSE_COMPLEMENT  = 0x02;
static unsigned int const SUBSTITUTION        = 0x04;
static unsigned int const TRANSPOSITION_OPEN  = 0x08;
static unsigned int const TRANSPOSITION_CLOSE = 0x10;
jkvis's avatar
jkvis committed
72
73


74
75
76
77
78
79
80
81
// These constants are used in calculating the weight of the generated
// description and consequently used to end the description process
// when a certain ``trivial'' weight is exeeded. The weight constants
// are based on their HGVS description lengths, i.e., the amount of
// characters used. The weight_position variable is used to have a
// constant weight for a position description regardless the actual
// position. It is usually set to ceil(log10(|reference| / 4)), and
// its intention is to be constant during an extraction run.
J.K. Vis's avatar
J.K. Vis committed
82
extern size_t       weight_position;
jkvis's avatar
jkvis committed
83

84
85
86
87
88
89
90
static size_t const WEIGHT_BASE               = 1; // i.e., A, G, T, C
static size_t const WEIGHT_DELETION           = 3; // i.e., del
static size_t const WEIGHT_DELETION_INSERTION = 6; // i.e., delins
static size_t const WEIGHT_INSERTION          = 3; // i.e., ins
static size_t const WEIGHT_INVERSION          = 3; // i.e., inv
static size_t const WEIGHT_SEPARATOR          = 1; // i.e., _, [, ], ;
static size_t const WEIGHT_SUBSTITUTION       = 1; // i.e., >
jkvis's avatar
jkvis committed
91

92

J.K. Vis's avatar
J.K. Vis committed
93
94
95
96
97
98
99
100
101
// Cut-off constants. For normal extraction use the threshold to
// specify the maximum reference length without any cut-off. Otherwise
// the extraction cut-off is used.
// The for transpositions is set at a fraction of the sample length.
static size_t const THRESHOLD_CUT_OFF     = 16000;
static size_t const EXTRACTION_CUT_OFF    =   500;
static double const TRANSPOSITION_CUT_OFF =   0.3;


102
103
104
// This global variable is used to have access to the whole reference
// string at any point in the extraction process. Commonly used in
// transposition extraction.
J.K. Vis's avatar
J.K. Vis committed
105
extern size_t global_reference_length;
jkvis's avatar
jkvis committed
106

107

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
// *******************************************************************
// Variant structure
//   This structure describes a variant (region of change).
//
//   @member reference_start: starting position of the variant within
//                            the reference string
//   @member reference_end: ending position of the variant within the
//                          reference string
//   @member sample_start: starting position of the variant within the
//                         sample string
//   @member sample_end: ending position of the variant within the
//                       sample string
//   @member type: type of the variant described using the
//                 constants above
//   @member weight: weight of the variant according to the weight
//                   constants above (used internally)
//   @member transposition_start: starting position of a transposition
//                                withing the reference string
//   @member transposition_end: ending position of a transposition
//                              withing the reference string
// *******************************************************************
jkvis's avatar
jkvis committed
129
130
struct Variant
{
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
  size_t       reference_start;
  size_t       reference_end;
  size_t       sample_start;
  size_t       sample_end;
  unsigned int type;
  size_t       weight;
  size_t       transposition_start;
  size_t       transposition_end;

  inline Variant(size_t const       reference_start,
                 size_t const       reference_end,
                 size_t const       sample_start,
                 size_t const       sample_end,
                 unsigned int const type                = IDENTITY,
                 size_t const       weight              = 0,
                 size_t const       transposition_start = 0,
                 size_t const       transposition_end   = 0):
148
149
150
151
         reference_start(reference_start),
         reference_end(reference_end),
         sample_start(sample_start),
         sample_end(sample_end),
J.K. Vis's avatar
J.K. Vis committed
152
153
         type(type),
         weight(weight),
154
         transposition_start(transposition_start),
J.K. Vis's avatar
J.K. Vis committed
155
         transposition_end(transposition_end) { }
jkvis's avatar
jkvis committed
156
157
158
159

  inline Variant(void) { }
}; // Variant

160
161
162
163
164
165
166
167
168
169
170
171
172
173
// *******************************************************************
// Variant_List structure
//   This structure describes a list of variants with associated
//   metadata.
//
//   @member weight_position: weight used for position descriptors
//   @member variants: vector of variants
// *******************************************************************
struct Variant_List
{
  size_t               weight_position;
  std::vector<Variant> variants;
}; // Variant_List

174
175
176
177
178
179
180
181
182
183
184
// *******************************************************************
// extract function
//   This function is the interface function for Python. It is just a
//   wrapper for the C++ extract function below.
//
//   @arg reference: reference string
//   @arg reference_length: length of the reference string
//   @arg sample: sample string
//   @arg sample_length: length of the sample string
//   @arg type: type of strings  0 --- DNA/RNA (default)
//                               1 --- Protein/other
185
//   @return: variant list with metadata
186
// *******************************************************************
187
188
189
190
191
Variant_List extract(char_t const* const reference,
                     size_t const        reference_length,
                     char_t const* const sample,
                     size_t const        sample_length,
                     int const           type = TYPE_DNA);
J.K. Vis's avatar
J.K. Vis committed
192

193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
// *******************************************************************
// extract function
//   This function extracts the variants (regions of change) between
//   the reference and the sample string. It automatically constructs
//   the reverse complement string for the reference string if the
//   string type is DNA/RNA.
//
//   @arg variant: vector of variants
//   @arg reference: reference string
//   @arg reference_length: length of the reference string
//   @arg sample: sample string
//   @arg sample_length: length of the sample string
//   @arg type: type of strings  0 --- DNA/RNA (default)
//                               1 --- Protein/other
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
209
210
211
212
213
214
215
size_t extract(std::vector<Variant> &variant,
               char_t const* const   reference,
               size_t const          reference_length,
               char_t const* const   sample,
               size_t const          sample_length,
               int const             type = TYPE_DNA);

216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// *******************************************************************
// extractor function
//   This function extracts the variants (regions of change) between
//   the reference and the sample string by recursively calling itself
//   on prefixes and suffixes of a longest common substring.
//
//   @arg variant: vector of variants
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
233
234
235
236
237
238
239
240
241
size_t extractor(std::vector<Variant> &variant,
                 char_t const* const   reference,
                 char_t const* const   complement,
                 size_t const          reference_start,
                 size_t const          reference_end,
                 char_t const* const   sample,
                 size_t const          sample_start,
                 size_t const          sample_end);

242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// *******************************************************************
// extractor_transposition function
//   This function extracts the variants (regions of change) between
//   a part of the sample string classified as an insertion and the
//   whole reference string.
//
//   @arg variant: vector of variants
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//                         used for the deletion part
//   @arg reference_end: ending position in the reference string used
//                       for the deletion part
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @arg weight_trivial: trivial weight to describe the transposition
//                        as a normal insertion (used for ending the
//                        extraction process)
//   @return: weight of the extracted variants
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
264
265
266
267
268
269
270
271
272
size_t extractor_transposition(std::vector<Variant> &variant,
                               char_t const* const   reference,
                               char_t const* const   complement,
                               size_t const          reference_start,
                               size_t const          reference_end,
                               char_t const* const   sample,
                               size_t const          sample_start,
                               size_t const          sample_end,
                               size_t const          weight_trivial = 0);
jkvis's avatar
jkvis committed
273
274


275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
// *******************************************************************
// Longest Common Substring (LCS) calculation
//   These functions are useful for LCS calculation of (large similar)
//   strings.
// *******************************************************************


// *******************************************************************
// Substring structure
//   This structure describes a common substring between two strings.
//
//   @member reference_index: starting position of the substring
//                            within the reference sequence
//   @member sample_index: ending position of the substring within the
//                         sample sequence
//   @member length: length of the substring
//   @member reverse_complement: indicates a reverse complement
//                               substring (only for DNA/RNA)
// *******************************************************************
jkvis's avatar
jkvis committed
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
struct Substring
{
  size_t reference_index;
  size_t sample_index;
  size_t length;
  bool   reverse_complement;

  inline Substring(size_t const reference_index,
                   size_t const sample_index,
                   size_t const length,
                   bool const   reverse_complement = false):
         reference_index(reference_index),
         sample_index(sample_index),
         length(length),
         reverse_complement(reverse_complement) { }

  inline Substring(void) { }
}; // Substring

313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
// *******************************************************************
// LCS function
//   This function calculates the longest common substrings between
//   two (three?) strings by choosing an initial k and calling the
//   lcs_k function. The k is automatically reduced if necessary until
//   the LCS of the two strings approaches some cutoff threshold.
//
//   @arg substring: vector of substrings
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
J.K. Vis's avatar
J.K. Vis committed
329
//   @arg cut_off: optional cut-off value for the k in LCS_k
330
331
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
332
333
334
335
336
337
338
size_t LCS(std::vector<Substring> &substring,
           char_t const* const     reference,
           char_t const* const     complement,
           size_t const            reference_start,
           size_t const            reference_end,
           char_t const* const     sample,
           size_t const            sample_start,
J.K. Vis's avatar
J.K. Vis committed
339
340
           size_t const            sample_end,
           size_t const            cut_off = 1);
J.K. Vis's avatar
J.K. Vis committed
341

342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
// *******************************************************************
// LCS_1 function
//   This function calculates the longest common substrings between
//   two (three?) strings. It asumes no similarity between both
//   strings. Not for use for large strings. This is the classical
//   dynamic programming algorithm.
//
//   @arg substring: vector of substrings
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
360
361
362
363
364
365
366
367
368
size_t LCS_1(std::vector<Substring> &substring,
             char_t const* const     reference,
             char_t const* const     complement,
             size_t const            reference_start,
             size_t const            reference_end,
             char_t const* const     sample,
             size_t const            sample_start,
             size_t const            sample_end);

369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
// *******************************************************************
// LCS_k function
//   This function calculates the longest common substrings between
//   two (three?) strings by encoding the reference and complement
//   strings into non-overlapping k-mers and the sample string into
//   overlapping k-mers. This function can be used for large similar
//   strings. If the returned vector is empty or the length of the
//   substrings is less or equal 2k, try again with a smaller k.
//
//   @arg substring: vector of substrings
//   @arg reference: reference string
//   @arg complement: complement string (can be null for strings other
//                    than DNA/RNA)
//   @arg reference_start: starting position in the reference string
//   @arg reference_end: ending position in the reference string
//   @arg sample: sample string
//   @arg sample_start: starting position in the sample string
//   @arg sample_end: ending position in the sample string
//   @arg k: size of the k-mers, must be greater than 1
//   @return: length of the LCS
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
390
391
392
393
394
395
396
397
398
399
400
size_t LCS_k(std::vector<Substring> &substring,
             char_t const* const     reference,
             char_t const* const     complement,
             size_t const            reference_start,
             size_t const            reference_end,
             char_t const* const     sample,
             size_t const            sample_start,
             size_t const            sample_end,
             size_t const            k);


401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
// *******************************************************************
// General string matching functions
//   These functions are useful for string matching.
// *******************************************************************

// *******************************************************************
// string_match function
//   This function is more or less equivalent to C's strncmp.
//
//   @arg string_1: first string to be compared
//   @arg string_2: second string to be compared
//   @arg length: maximum length to be compared

//   @return: true iff string_1 and string_2 match for the given
//            length
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
417
418
419
420
bool string_match(char_t const* const string_1,
                  char_t const* const string_2,
                  size_t const        length);

421
422
423
424
425
426
427
428
429
430
431
432
433
// *******************************************************************
// string_match_reverse function
//   This function is very similar to C's strncmp, but it traverses
//   string_1 from end to start while traversing string_2 from start
//   to end (useful for the reverse complement in DNA/RNA).
//
//   @arg string_1: first string to be compared
//   @arg string_2: second string to be compared
//   @arg length: maximum length to be compared

//   @return: true iff string_1 and string_2 match in their respective
//            directions for the given length
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
434
435
436
437
bool string_match_reverse(char_t const* const string_1,
                          char_t const* const string_2,
                          size_t const        length);

438
439
440
441
442
443
444
445
446
447
448
449
450
// *******************************************************************
// prefix_match function
//   This function calculates the length (in characters) of the common
//   prefix between two strings. The result of this function is also
//   used in the suffix_match function.
//
//   @arg reference: reference string
//   @arg reference_length: reference length
//   @arg sample: sample string
//   @arg sample_length: sample length

//   @return: the length of the common prefix
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
451
452
453
454
size_t prefix_match(char_t const* const reference,
                    size_t const        reference_length,
                    char_t const* const sample,
                    size_t const        sample_length);
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469

// *******************************************************************
// suffix_match function
//   This function calculates the length (in characters) of the common
//   suffix between two strings. It needs the calculated common
//   prefix.
//
//   @arg reference: reference string
//   @arg reference_length: reference length
//   @arg sample: sample string
//   @arg sample_length: sample length
//   @arg prefix: length of the common prefix

//   @return: the length of the common suffix
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
470
471
472
473
474
475
size_t suffix_match(char_t const* const reference,
                    size_t const        reference_length,
                    char_t const* const sample,
                    size_t const        sample_length,
                    size_t const        prefix = 0);

476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492

// *******************************************************************
// IUPAC Nucleotide Acid Notation functions
//   These functions are useful for calculating the complement of DNA/
//   RNA strings.
// *******************************************************************


// *******************************************************************
// IUPAC_complement function
//   This function converts a IUPAC Nucleotide Acid Notation into its
//   complement.
//
//   @arg base: character from the IUPAC Nucleotide Acid Notation
//              alphabet
//   @return: its corresponding IUPAC complement for single bases only
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
493
char_t IUPAC_complement(char_t const base);
494
495
496
497
498
499
500
501
502
503
504
505
506

// *******************************************************************
// IUPAC_complement function
//   This function converts a string in IUPAC Nucleotide Acid Notation
//   into its complement. A new string is allocated, so deletion is
//   the responsibility of the caller.
//
//   @arg string: string in the IUPAC Nucleotide Acid Notation
//                alphabet
//   @arg length: number of characters in the string to convert (might
//                be less than the actual string length)
//   @return: string containing the complement of the input string
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
507
508
509
510
511
char_t const* IUPAC_complement(char_t const* const string,
                               size_t const        length);


#if defined(__debug__)
512
513
514
515
516
517
518
519
520
521
522
523
// *******************************************************************
// Dprint_truncated function
//   Debug function for printing large strings in truncated form: a
//   prefix of a certain length ... a suffix of the same length.
//
//   @arg string: string to be printed
//   @arg start: starting position in the string
//   @arg end: ending position in the string
//   @arg length: length of the prefix and suffix
//   @arg stream: file stream to print to
//   @return: the length of the printed string
// *******************************************************************
J.K. Vis's avatar
J.K. Vis committed
524
525
526
527
528
529
size_t Dprint_truncated(char_t const* const string,
                        size_t const        start,
                        size_t const        end,
                        size_t const        length = 40,
                        FILE*               stream = stderr);
#endif
jkvis's avatar
jkvis committed
530

jkvis's avatar
jkvis committed
531

J.K. Vis's avatar
J.K. Vis committed
532
} // mutalyzer
jkvis's avatar
jkvis committed
533
534
535

#endif