Commit 14a34376 authored by jkvis's avatar jkvis
Browse files

Initial commit for mutalyzer3

parent 9bea5f16
The MIT License (MIT)
Copyright (c) 2015 Leiden University Medical Center
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
include extractor/extractor.h
# HGVS variant description extractor
Unambiguous sequence variant descriptions are important in reporting
the outcome of clinical diagnostic DNA tests. The standard
nomenclature of the Human Genome Variation Society (HGVS) describes
the observed variant sequence relative to a given reference sequence.
We propose an efficient algorithm for the extraction of
HGVS descriptions from two sequences with three main requirements in
mind: minimizing the length of the resulting descriptions, minimizing
the computation time, and keeping the unambiguous descriptions
biologically meaningful.
This algorithm is able to compute the HGVS descriptions of complete
chromosomes or other large DNA strings in a reasonable amount of
computation time and its resulting descriptions are relatively small.
Additional applications include updating of gene variant database
contents and reference sequence liftovers.
>>> from extractor import describe_dna
>>> print describe_dna('TAACAATGGAAC', 'TAAACAATTGAA')
[3dup;8G>T;12del]
## Implementation
The core algorithm is implemented in C++ with a Python wrapper providing a
developer friendly interface.
## Installation
### Python package
You need [SWIG](http://www.swig.org/) installed. Then:
pip install description-extractor
### C++ library only
Run `make`.
Optionally set the `__debug__` flag to trace the algorithm.
For direct use within a C/C++ environment just
`#include "extractor.h"` and add `extractor.cc` to your project's
source files.
## Testing
There are some unit tests for the Python interface. After installing the
Python package, run them using [pytest](http://pytest.org/):
pip install pytest
python setup.py develop
py.test
Alternatively, use [tox](https://tox.readthedocs.org/) to automatically run
the tests on all supported versions of Python:
pip install tox
tox
## *******************************************************************
## Extractor (library)
## *******************************************************************
## FILE INFORMATION:
## File: Makefile
## Author: Jonathan K. Vis
## *******************************************************************
## DESCRIPTION:
## Build the Extractor library for python.
## *******************************************************************
SOURCES=extractor.cc
TARGET=_extractor.so
DEBUG=debug.cc
CXX=g++
CFLAGS=-c -fpic -Wall -Wextra -O3 #-D__debug__
LDFLAGS=-Wall -O3 -shared
SWIG=swig
SWIGFLAGS=-c++ -python
INCLUDES=-I/usr/include/python2.7
WRAPPER=$(SOURCES:.cc=.py) $(SOURCES:.cc=)_wrap.cxx
OBJECTS=$(SOURCES:.cc=.o) $(WRAPPER:.cxx=.o)
.PHONY: all clean debug rebuild
all: $(TARGET)
$(TARGET): $(OBJECTS) $(WRAPPER)
$(CXX) $(LDFLAGS) $(filter-out $(SOURCES:.cc=.py),$(OBJECTS)) -o $@
chmod -x $(TARGET)
debug: $(SOURCES:.cc=.o) $(DEBUG)
$(CXX) $(LDFLAGS:-shared=) $(SOURCES:.cc=.o) $(DEBUG) -o $@
%_wrap.cxx: %.i
$(SWIG) $(SWIGFLAGS) $(SOURCES:.cc=.i)
%.py: %_wrap.cxx
@
%.o: %.cc
$(CXX) $(CFLAGS) -o $@ $<
%.o: %.cxx
$(CXX) $(CFLAGS:-Wextra=) $(INCLUDES) -o $@ $<
clean:
rm -f $(DEBUG:.cc=.o) $(filter-out $(SOURCES:.cc=.py),$(OBJECTS)) $(WRAPPER) $(TARGET) debug
rebuild: clean all
"""
extractor: Extract a list of differences between two sequences.
Copyright (c) 2013 Leiden University Medical Center <humgen@lumc.nl>
Copyright (c) 2016 Jonathan K. Vis <j.k.vis@lumc.nl>
Licensed under the MIT license, see the LICENSE file.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from . import describe, extractor
__version_info__ = tuple(extractor.VERSION.split('.'))
__version__ = extractor.VERSION
__author__ = 'LUMC, Jonathan K. Vis'
__contact__ = 'j.k.vis@lumc.nl'
__homepage__ = 'https://github.com/mutalyzer/description-extractor'
describe_dna = describe.describe_dna
describe_protein = describe.describe_protein
describe_repeats = describe.describe_repeats
extract = extractor.extract
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
// File: debug.cc
// Author: Jonathan K. Vis
// *******************************************************************
// DESCRIPTION:
// This source can be used to debug the Extractor library within
// C/C++. It opens two files given as arguments and perform the
// description extraction. Supply the -D__debug__ flag in the
// Makefile for tracing.
// *******************************************************************
#include "extractor.h"
using namespace mutalyzer;
#include <cstdio>
#include <iostream>
#include <string>
using namespace std;
// Entry point.
int main(int argc, char* argv[])
{
if (argc < 3)
{
fprintf(stderr, "usage: %s reference sample\n", argv[0]);
return 1;
} // if
fprintf(stderr, "HGVS description extractor\n");
// Opening files.
FILE* file = fopen(argv[1], "r");
if (file == 0)
{
fprintf(stderr, "ERROR: could not open file `%s'\n", argv[1]);
return 1;
} // if
fseek(file, 0, SEEK_END);
size_t const reference_length = ftell(file);
rewind(file);
char_t* reference = new char_t[reference_length];
size_t const ref_length = fread(reference, sizeof(char_t), reference_length, file);
static_cast<void>(ref_length);
fclose(file);
file = fopen(argv[2], "r");
if (file == 0)
{
fprintf(stderr, "ERROR: could not open file `%s'\n", argv[2]);
delete[] reference;
return 1;
} // if
fseek(file, 0, SEEK_END);
size_t const sample_length = ftell(file);
rewind(file);
char_t* sample = new char_t[sample_length];
size_t const alt_length = fread(sample, sizeof(char_t), sample_length, file);
static_cast<void>(alt_length);
fclose(file);
/*
int const N = 359;
string header[N];
string protein[N];
fprintf(stdout, "\t");
for (int i = 0; i < N; ++i)
{
cin >> header[i] >> protein[i];
fprintf(stdout, "%s\t", header[i].c_str());
} // for
fprintf(stdout, "\n");
for (int i = 0; i < N; ++i)
{
cerr << i << endl;
fprintf(stdout, "%s\t", header[i].c_str());
for (int j = 0; j < N; ++j)
{
if (i == j)
{
fprintf(stdout, "%.10e\t", 0.);
continue;
} // if
vector<Variant> variants;
extract(variants, protein[i].c_str(), protein[i].length(), protein[j].c_str(), protein[j].length(), TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
size_t best = 0;
for (std::vector<Variant>::iterator it = variants.begin(); it != variants.end(); ++it)
{
if (it->type >= FRAME_SHIFT && it->reference_end - it->reference_start > best)
{
best = it->reference_end - it->reference_start;
} // if
} // for
size_t const length = min(protein[i].length(), protein[j].length());
fprintf(stdout, "%.10e\t", 1. - static_cast<double>(best) / static_cast<double>(length));
} // for
fprintf(stdout, "\n");
} // for
*/
// The actual extraction.
std::vector<Variant> variant;
size_t const weight = extract(variant, reference, reference_length - 1, sample, sample_length - 1, TYPE_DNA);
// Printing the variants.
fprintf(stdout, "Variants (%ld / %ld):\n", variant.size(), weight);
for (std::vector<Variant>::iterator it = variant.begin(); it != variant.end(); ++it)
{
if (it->type >= FRAME_SHIFT)
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %lf, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, 1.f - it->probability, it->transposition_start, it->transposition_end);
char_t ref_DNA[(it->reference_end - it->reference_start) * 3];
char_t alt_DNA[(it->reference_end - it->reference_start) * 3];
backtranslation(ref_DNA, alt_DNA, reference, it->reference_start, sample, it->sample_start, it->reference_end - it->reference_start, it->type);
fprintf(stdout, "ref_DNA: ");
fwrite(ref_DNA, sizeof(char_t), (it->reference_end - it->reference_start) * 3, stdout);
fprintf(stdout, "\nref_pro: ");
fwrite(reference + it->reference_start, sizeof(char_t), (it->reference_end - it->reference_start), stdout);
fprintf(stdout , "\nalt_DNA: ");
fwrite(alt_DNA, sizeof(char_t), (it->reference_end - it->reference_start) * 3, stdout);
fprintf(stdout, "\nalt_pro: ");
fwrite(sample + it->sample_start, sizeof(char_t), (it->reference_end - it->reference_start), stdout);
fprintf(stdout , "\n");
} // if
else
{
fprintf(stdout, "%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld\n", it->reference_start, it->reference_end, it->sample_start, it->sample_end, it->type, it->weight, it->transposition_start, it->transposition_end);
} // else
} // for
// Cleaning up.
delete[] reference;
delete[] sample;
return 0;
} // main
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// *******************************************************************
// Extractor (library)
// *******************************************************************
// FILE INFORMATION:
// File: extractor.i (SWIG interface file)
// Author: Jonathan K. Vis
// *******************************************************************
// DESCRIPTION:
// Defines the SWIG interface for the Extractor library for use in
// other languages than C/C++.
// *******************************************************************
%include "std_vector.i"
%module extractor
%{
#include "extractor.h"
%}
namespace std
{
%template(VariantVector) vector<mutalyzer::Variant>;
}
namespace mutalyzer
{
static char const* const VERSION;
typedef char char_t;
static int const TYPE_DNA;
static int const TYPE_PROTEIN;
static int const TYPE_OTHER;
static unsigned int const IDENTITY;
static unsigned int const REVERSE_COMPLEMENT;
static unsigned int const SUBSTITUTION;
static unsigned int const TRANSPOSITION_OPEN;
static unsigned int const TRANSPOSITION_CLOSE;
static unsigned int const FRAME_SHIFT;
static unsigned int const FRAME_SHIFT_NONE;
static unsigned int const FRAME_SHIFT_1;
static unsigned int const FRAME_SHIFT_2;
static unsigned int const FRAME_SHIFT_REVERSE;
static unsigned int const FRAME_SHIFT_REVERSE_1;
static unsigned int const FRAME_SHIFT_REVERSE_2;
static size_t const WEIGHT_BASE;
static size_t const WEIGHT_DELETION;
static size_t const WEIGHT_DELETION_INSERTION;
static size_t const WEIGHT_INSERTION;
static size_t const WEIGHT_INVERSION;
static size_t const WEIGHT_SEPARATOR;
static size_t const WEIGHT_SUBSTITUTION;
struct Variant
{
size_t reference_start;
size_t reference_end;
size_t sample_start;
size_t sample_end;
unsigned int type;
size_t transposition_start;
size_t transposition_end;
};
struct Variant_List
{
size_t weight_position;
std::vector<Variant> variants;
};
Variant_List extract(char_t const* const reference,
size_t const reference_length,
char_t const* const sample,
size_t const sample_length,
int const type = TYPE_DNA,
char_t const* const codon_string = 0);
}
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment