Commit f5fd3eb3 authored by J.K. Vis's avatar J.K. Vis
Browse files

Refactoring done

parent 932b3029
......@@ -8,8 +8,8 @@
## FILE INFORMATION:
## File: Makefile
## Author: Jonathan K. Vis
## Revision: 1.05a
## Date: 2014/02/05
## Revision: 2.01a
## Date: 2014/07/22
## *******************************************************************
## DESCRIPTION:
## Build the Extractor library for python.
......@@ -54,7 +54,7 @@ debug: $(SOURCES:.cc=.o) $(DEBUG)
$(CXX) $(CFLAGS:-Wextra=) $(INCLUDES) -o $@ $<
clean:
rm -f $(filter-out $(SOURCES:.cc=.py),$(OBJECTS)) $(WRAPPER) $(TARGET) debug
rm -f $(DEBUG:.cc=.o) $(filter-out $(SOURCES:.cc=.py),$(OBJECTS)) $(WRAPPER) $(TARGET) debug
rebuild: clean all
......
// *******************************************************************
// (C) Copyright 2014 Leiden Institute of Advanced Computer Science
// Universiteit Leiden
// All Rights Reserved
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
// File: debug.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 1.05a
// Date: 2014/02/05
// *******************************************************************
// DESCRIPTION:
// This source can be used to debug the Extractor library within
// C/C++.
// *******************************************************************
#include "extractor.h"
using namespace mutalyzer;
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;
int main(int argc, char* argv[])
{
static_cast<void>(argc);
char* reference;
char* sample;
ifstream file(argv[1]);
if (argc < 3)
{
fprintf(stderr, "usage: %s reference sample\n", argv[0]);
return 1;
} // if
fprintf(stderr, "debug.cc --- loading files\n reference: %s\n sample: %s\n", argv[1], argv[2]);
fprintf(stderr, "HGVS description extractor\n");
FILE* file = fopen(argv[1], "r");
if (file == 0)
{
fprintf(stderr, "ERROR: could not open file `%s'\n", argv[1]);
return 1;
} // if
fseek(file, 0, SEEK_END);
size_t const reference_length = ftell(file) - 1;
rewind(file);
char_t* reference = new char_t[reference_length];
fread(reference, sizeof(char_t), reference_length, file);
fclose(file);
file = fopen(argv[2], "r");
if (file == 0)
{
fprintf(stderr, "ERROR: could not open file `%s'\n", argv[2]);
delete[] reference;
return 1;
} // if
fseek(file, 0, SEEK_END);
size_t const sample_length = ftell(file) - 1;
rewind(file);
char_t* sample = new char_t[sample_length];
fread(sample, sizeof(char_t), sample_length, file);
fclose(file);
file.seekg(0, file.end);
size_t const reference_length = static_cast<size_t>(file.tellg()) - 1;
file.seekg(0, file.beg);
reference = new char[reference_length];
file.read(reference, reference_length);
file.close();
file.open(argv[2]);
file.seekg(0, file.end);
size_t const sample_length = static_cast<size_t>(file.tellg()) - 1;
file.seekg(0, file.beg);
sample = new char[sample_length];
file.read(sample, sample_length);
file.close();
fprintf(stderr, "debug.cc --- files loaded\n reference: %ld\n sample: %ld\n", reference_length, sample_length);
std::vector<Variant> variant;
size_t const weight = extract(variant, reference, reference_length, sample, sample_length);
vector<Variant> result = extract(reference, reference_length, sample, sample_length);
for (size_t i = 0; i < result.size(); ++i)
fprintf(stderr, "\nVariants (%ld / %ld):\n", variant.size(), weight);
for (std::vector<Variant>::iterator it = variant.begin() ; it != variant.end(); ++it)
{
cout << result[i].reference_start << "--" << result[i].reference_end << ", "
<< result[i].sample_start << "--" << result[i].sample_end << ", " << result[i].type << endl;
//if (it->type != IDENTITY)
{
fprintf(stderr, "%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
} // if
} // for
delete[] reference;
delete[] sample;
......
This diff is collapsed.
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 1.05b
// Date: 2014/07/07
// Revision: 2.01a
// Date: 2014/07/22
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
......@@ -19,180 +19,112 @@
#if !defined(__extractor_h__)
#define __extractor_h__
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <vector>
#if defined(__debug__)
#include <cstdio>
#endif
namespace mutalyzer
{
typedef char char_t;
static int const TYPE_DNA = 0;
static int const TYPE_PROTEIN = 1;
// Can be used as cutoff for k-mer reduction. The expected length of
// a longest common substring between to strings of length n is
// log_z(n), were z is the size of the alphabet.
static int const ALPHABET_SIZE[2] =
{
4, // DNA/RNA
20 // Protein/other
}; // ALPHABET
static int const IDENTITY = 0;
static int const REVERSE_COMPLEMENT = 1;
static int const SUBSTITUTION = 2;
static int const TRANSPOSITION_OPEN = 4;
static int const TRANSPOSITION_CLOSE = 8;
// *******************************************************************
// Variant Extraction
// These functions are used to extract variants (regions of change)
// between two strings.
// *******************************************************************
extern size_t weight_position;
static size_t const WEIGHT_BASE = 1; // A
static size_t const WEIGHT_DELETION = 3; // del
static size_t const WEIGHT_DELETION_INSERTION = 6; // delins
static size_t const WEIGHT_INSERTION = 3; // ins
static size_t const WEIGHT_INVERSION = 3; // inv
static size_t const WEIGHT_SEPARATOR = 1; // _[]
static size_t const WEIGHT_SUBSTITUTION = 1; // >
// These constants can be used to deterimine the type of variant.
// Substitution covers most: deletions, insertions, substitutions, and
// insertion/deletions. Indentity is used to describe the unchanged
// (matched) regions. These constants are coded as bitfields and
// should be appropriately combined, e.g.,
// IDENTITY | TRANSPOSITION_OPEN for describing a real transposition.
// Note that some combinations do NOT make sense, e.g.,
// IDENTITY | REVERSE_COMPLEMENT.
static int const SUBSTITUTION = 0;
static int const IDENTITY = 1;
static int const REVERSE_COMPLEMENT = 2;
static int const TRANSPOSITION_OPEN = 4;
static int const TRANSPOSITION_CLOSE = 8;
extern size_t global_reference_length;
// *******************************************************************
// Variant structure
// This structure describes a variant (region of change).
//
// @member reference_start: starting position of the variant within
// the reference string
// @member reference_end: ending position of the variant within the
// reference string
// @member sample_start: starting position of the variant within the
// sample string
// @member sample_end: ending position of the variant within the
// sample string
// @member transposition_start: starting position of a transposition
// withing the reference string
// @member transposition_ned: ending position of a transposition
// withing the reference string
// @member reverse_complement: indicates a reverse complement
// variant
// @member transposition: indicates an inserted substring from the
// reference string
// *******************************************************************
struct Variant
{
size_t reference_start;
size_t reference_end;
size_t sample_start;
size_t sample_end;
int type;
size_t weight;
size_t transposition_start;
size_t transposition_end;
int type;
inline Variant(size_t const reference_start,
size_t const reference_end,
size_t const sample_start,
size_t const sample_end,
int const type = SUBSTITUTION):
reference_start(reference_start),
reference_end(reference_end),
sample_start(sample_start),
sample_end(sample_end),
transposition_start(0),
transposition_end(0),
type(type) { }
inline Variant(size_t const reference_start,
size_t const reference_end,
size_t const sample_start,
size_t const sample_end,
size_t const transposition_start,
size_t const transposition_end,
int const type = SUBSTITUTION):
int const type = IDENTITY,
size_t const weight = 0,
size_t const transposition_start = 0,
size_t const transposition_end = 0):
reference_start(reference_start),
reference_end(reference_end),
sample_start(sample_start),
sample_end(sample_end),
type(type),
weight(weight),
transposition_start(transposition_start),
transposition_end(transposition_end),
type(type) { }
transposition_end(transposition_end) { }
inline Variant(void) { }
}; // Variant
// *******************************************************************
// extract function
// This function extracts the variants (regions of change) between
// the reference and the sample string. It automatically constructs
// the reverse complement string for the reference string if the
// string type is DNA/RNA.
//
// @arg reference: reference string
// @arg reference_length: length of the reference string
// @arg sample: sample string
// @arg sample_length: length of the sample string
// @arg type: type of strings 0 --- DNA/RNA (default)
// 1 --- Protein/other
// @return: list of variants
// *******************************************************************
std::vector<Variant> extract(char const* const reference,
size_t const reference_length,
char const* const sample,
size_t const sample_length,
int const type = TYPE_DNA);
// *******************************************************************
// extractor function
// This function extracts the variants (regions of change) between
// the reference and the sample string by recursively calling itself
// on prefixes and suffixes of a longest common substring. It is
// strongly suggested NOT to use this function directly when dealing
// with transpositions, but use the extract function instead.
//
// @arg reference: reference string
// @arg complement: complement string (can be null for
// strings other than DNA/RNA)
// @arg reference_start: starting position in the reference string
// @arg reference_end: ending position in the reference string
// @arg sample: sample string
// @arg sample_start: starting position in the sample string
// @arg sample_end: ending position in the sample string
// @arg result: list of variants
// @arg transposition: flag indicating transposition extraction
// *******************************************************************
void extractor(char const* const reference,
char const* const complement,
size_t const reference_start,
size_t const reference_end,
char const* const sample,
size_t const sample_start,
size_t const sample_end,
std::vector<Variant> &result,
bool const transposition = false);
// *******************************************************************
// Longest Common Substring (LCS) calculation
// These functions are useful for LCS calculation of (large similar)
// strings.
// *******************************************************************
std::vector<Variant> extract(char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
int const type = TYPE_DNA);
size_t extract(std::vector<Variant> &variant,
char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
int const type = TYPE_DNA);
size_t extractor(std::vector<Variant> &variant,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
size_t extractor_transposition(std::vector<Variant> &variant,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end,
size_t const weight_trivial = 0);
// *******************************************************************
// Substring structure
// This structure describes a common substring between two strings.
//
// @member reference_index: starting position of the substring
// within the reference sequence
// @member sample_index: ending position of the substring within the
// sample sequence
// @member reverse_complement: indicates a reverse complement
// substring (only for DNA/RNA)
// *******************************************************************
struct Substring
{
size_t reference_index;
......@@ -213,162 +145,69 @@ struct Substring
}; // Substring
// *******************************************************************
// LCS_1 function
// This function calculates the longest common substrings between
// two (three?) strings. It asumes no similarity between both
// strings. Not for use for large strings.
//
// @arg reference: reference string
// @arg complement: complement string (can be null for strings other
// than DNA/RNA)
// @arg reference_start: starting position in the reference string
// @arg reference_end: ending position in the reference string
// @arg sample: sample string
// @arg sample_start: starting position in the sample string
// @arg sample_end: ending position in the sample string
// @return: list of longest common substrings
// *******************************************************************
std::vector<Substring> LCS_1(char const* const reference,
char const* const complement,
size_t const reference_start,
size_t const reference_end,
char const* const sample,
size_t const sample_start,
size_t const sample_end);
// *******************************************************************
// LCS_k function
// This function calculates the longest common substrings between
// two (three?) strings by encoding the reference and complement
// strings into non-overlapping k-mers and the sample string into
// overlapping k-mers. This function can be used for large similar
// strings. If the returned vector is empty or the length of the
// substrings is less or equal 2k, try again with a smaller k.
//
// @arg reference: reference string
// @arg complement: complement string (can be null for strings other
// than DNA/RNA)
// @arg reference_start: starting position in the reference string
// @arg reference_end: ending position in the reference string
// @arg sample: sample string
// @arg sample_start: starting position in the sample string
// @arg sample_end: ending position in the sample string
// @arg k: size of the k-mers, must be greater than 1
// @return list of longest common substrings
// *******************************************************************
std::vector<Substring> LCS_k(char const* const reference,
char const* const complement,
size_t const reference_start,
size_t const reference_end,
char const* const sample,
size_t const sample_start,
size_t const sample_end,
size_t const k);
// *******************************************************************
// LCS function
// This function calculates the longest common substrings between
// two (three?) strings by choosing an initial k and calling the
// lcs_k function. The k is automatically reduced if necessary until
// the LCS of the two strings approaches some cutoff threshold.
//
// @arg reference: reference string
// @arg complement: complement string (can be null for strings other
// than DNA/RNA)
// @arg reference_start: starting position in the reference string
// @arg reference_end: ending position in the reference string
// @arg sample: sample string
// @arg sample_start: starting position in the sample string
// @arg sample_end: ending position in the sample string
// @return: list of longest common substrings
// *******************************************************************
std::vector<Substring> LCS(char const* const reference,
char const* const complement,
size_t const reference_start,
size_t const reference_end,
char const* const sample,
size_t const sample_start,
size_t const sample_end);
// *******************************************************************
// IUPAC Nucleotide Acid Notation functions
// These functions are useful for calculating the complement of DNA/
// RNA strings.
// *******************************************************************
// *******************************************************************
// IUPAC_complement function
// This function converts a IUPAC Nucleotide Acid Notation into its
// complement.
//
// @arg base: character from the IUPAC Nucleotide Acid Notation
// alphabet
// @return: bit array containing all the possible bases
// *******************************************************************
static inline char IUPAC_complement(char const base)
{
switch (base)
{
case 'A':
return 'T';
case 'B':
return 'V';
case 'C':
return 'G';
case 'D':
return 'H';
case 'G':
return 'C';
case 'H':
return 'D';
case 'K':
return 'M';
case 'M':
return 'K';
case 'R':
return 'Y';
case 'T':
case 'U':
return 'A';
case 'V':
return 'B';
case 'Y':
return 'R';
} // switch
return base;
} // IUPAC_complement
size_t LCS(std::vector<Substring> &substring,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
size_t LCS_1(std::vector<Substring> &substring,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end);
size_t LCS_k(std::vector<Substring> &substring,
char_t const* const reference,
char_t const* const complement,
size_t const reference_start,
size_t const reference_end,
char_t const* const sample,
size_t const sample_start,
size_t const sample_end,
size_t const k);
bool string_match(char_t const* const string_1,
char_t const* const string_2,
size_t const length);
bool string_match_reverse(char_t const* const string_1,
char_t const* const string_2,
size_t const length);
size_t prefix_match(char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length);
size_t suffix_match(char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
size_t const prefix = 0);
char_t IUPAC_complement(char_t const base);
char_t const* IUPAC_complement(char_t const* const string,
size_t const length);
#if defined(__debug__)
size_t Dprint_truncated(char_t const* const string,
size_t const start,
size_t const end,
size_t const length = 40,
FILE* stream = stderr);
#endif
// *******************************************************************
// IUPAC_complement function
// This function converts a string in IUPAC Nucleotide Acid Notation
// into its complement. A new string is allocated, so deletion is
// the responsibility of the caller.
//
// @arg string: string in the IUPAC Nucleotide Acid Notation
// alphabet
// @arg n: number of characters in the string to convert (might be
// less than the actual string length)
// @return: string containing the complement of the input string
// *******************************************************************
static inline char const* IUPAC_complement(char const* const string,
size_t const n)
{
// The caller is responsible for deallocation.
char* result = new char[n];
for (size_t i = 0; i < n; ++i)
{
result[i] = IUPAC_complement(string[i]);
} // for
return result;
} // IUPAC_complement
} // namespace
} // mutalyzer
#endif
......@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.i (SWIG interface file)
// Author: Jonathan K. Vis
// Revision: 1.05b
// Date: 2014/07/07
// Revision: 2.01a
// Date: 2014/07/22
// *******************************************************************
// DESCRIPTION:
// Defines the SWIG interface for the Extractor library for use in
......@@ -26,18 +26,19 @@
namespace std
{
%template(VariantVector) vector<mutalyzer::Variant>;
%template(SubstringVector) vector<mutalyzer::Substring>;
} // namespace