Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
extractor
Commits
a68401ab
Commit
a68401ab
authored
Mar 02, 2015
by
J.K. Vis
Browse files
Frame shift refactoring
parent
99b520a2
Changes
3
Hide whitespace changes
Inline
Side-by-side
extractor/extractor.cc
View file @
a68401ab
...
...
@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.1.1
0
// Date: 2015/0
1
/1
4
// Revision: 2.1.1
1
// Date: 2015/0
2
/1
6
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
...
...
@@ -152,7 +152,9 @@ size_t extract(std::vector<Variant> &variant,
{
if
(
it
->
type
==
SUBSTITUTION
)
{
annotate_frame_shift
(
annotation
,
reference
,
it
->
reference_start
,
it
->
reference_end
,
sample
,
it
->
sample_start
,
it
->
sample_end
);
annotate_frame_inversion
(
annotation
,
reference
,
it
->
reference_start
,
it
->
reference_end
,
sample
,
it
->
sample_start
,
it
->
sample_end
);
//annotate_frame_shift(annotation, reference, it->reference_start, it->reference_end, sample, it->sample_start, it->sample_end);
}
// if
}
// for
...
...
@@ -1304,7 +1306,7 @@ char_t const* IUPAC_complement(char_t const* const string,
return
complement
;
}
// IUPAC_complement
// This function calculates the frame shift A reference amino acid is
// This function calculates the frame shift
.
A reference amino acid is
// checked against two possible partial overlaps between every
// combination of two sample (observed) amino acids.
unsigned
int
frame_shift
(
char_t
const
reference
,
char_t
const
sample_1
,
char_t
const
sample_2
)
...
...
@@ -1393,6 +1395,27 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
return
variant
.
size
();
}
// annotate_frame_shift
size_t
annotate_frame_inversion
(
std
::
vector
<
Variant
>
&
variant
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
)
{
size_t
const
reference_length
=
reference_end
-
reference_start
;
size_t
const
sample_length
=
sample_end
-
sample_start
;
size_t
const
length
=
(
reference_length
<=
sample_length
?
reference_length
:
sample_length
)
-
1
;
#if defined(__debug__)
fprintf
(
stderr
,
"Frame inversion detection (%ld)
\n
"
,
length
);
#endif
return
0
;
}
// annotate_frame_inversion
#if defined(__debug__)
// Debug function for printing large strings in truncated form: a
...
...
extractor/extractor.h
View file @
a68401ab
...
...
@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.1.
8
// Date: 2015/0
1
/1
2
// Revision: 2.1.
9
// Date: 2015/0
2
/1
6
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
...
...
@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static
char
const
*
const
VERSION
=
"2.1.
8
"
;
static
char
const
*
const
VERSION
=
"2.1.
9
"
;
// The character type used for all strings. For now it should just be
...
...
@@ -597,6 +597,14 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
size_t
const
sample_start
,
size_t
const
sample_end
);
size_t
annotate_frame_inversion
(
std
::
vector
<
Variant
>
&
variant
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
);
#if defined(__debug__)
// *******************************************************************
...
...
extractor/sandbox.cc
View file @
a68401ab
...
...
@@ -8,186 +8,106 @@
// FILE INFORMATION:
// File: sandbox.cc
// Author: Jonathan K. Vis
// Revision: 1.
1.5
// Date: 2015/0
2
/02
// Revision: 1.
0.0
// Date: 2015/0
3
/02
// *******************************************************************
// DESCRIPTION:
// General testing playground: automatic tandem repeat detection
// based on run length encoding with variable run lengths.
// General testing playground: protein frame shift construction code.
// *******************************************************************
#include
<cstdlib>
#include
<vector>
#include
<cstdio>
#include
<cstdlib>
typedef
char
char_t
;
static
size_t
const
THRESHOLD
=
10000
;
struct
Repeat
{
inline
Repeat
(
size_t
const
start
,
size_t
const
end
,
size_t
const
count
=
0
)
:
start
(
start
),
end
(
end
),
count
(
count
)
{
}
#include
<map>
inline
Repeat
(
void
)
{
}
static
unsigned
int
const
FRAME_SHIFT_0
=
0x00
;
static
unsigned
int
const
FRAME_SHIFT_1
=
0x01
;
static
unsigned
int
const
FRAME_SHIFT_2
=
0x02
;
static
unsigned
int
const
FRAME_SHIFT_REVERSE_IDEM
=
0x04
;
static
unsigned
int
const
FRAME_SHIFT_REVERSE_1
=
0x08
;
static
unsigned
int
const
FRAME_SHIFT_REVERSE_2
=
0x10
;
size_t
start
;
size_t
end
;
size_t
count
;
};
// Repeat
static
char_t
const
*
const
codon_string
=
"KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"
;
bool
string_match
(
char_t
const
*
const
string_1
,
char_t
const
*
const
string_2
,
size_t
const
length
)
static
char_t
const
IUPAC_BASE
[
4
]
=
{
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
{
if
(
string_1
[
i
]
!=
string_2
[
i
])
{
return
false
;
}
// if
}
// for
return
true
;
}
// string_match
'A'
,
'C'
,
'G'
,
'T'
};
// IUPAC_BASE
char_t
codon_table
[
64
]
=
{
'\0'
};
std
::
multimap
<
char_t
const
,
unsigned
int
const
>
codon_map
;
void
tandem_repeat_annotation
(
std
::
vector
<
Repeat
>
&
repeat
,
char_t
const
*
const
string
,
size_t
const
start
,
size_t
const
end
)
unsigned
int
frame_shift
(
char_t
const
reference
,
char_t
const
sample_1
,
char_t
const
sample_2
)
{
s
ize_t
const
length
=
end
-
start
;
s
ize_t
const
k_max
=
length
>
THRESHOLD
?
THRESHOLD
/
2
:
length
/
2
+
1
;
s
td
::
pair
<
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
,
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
>
const
range_1
=
codon_map
.
equal_range
(
sample_1
)
;
s
td
::
pair
<
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
,
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
>
const
range_2
=
codon_map
.
equal_range
(
sample_2
)
;
size_t
i
=
0
;
size_t
last_repeat
=
i
;
while
(
i
<
length
)
unsigned
int
shift
=
FRAME_SHIFT_0
;
for
(
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
it_1
=
range_1
.
first
;
it_1
!=
range_1
.
second
;
++
it_1
)
{
size_t
max_count
=
0
;
size_t
max_k
=
1
;
for
(
size_t
k
=
1
;
k
<
k_max
;
++
k
)
for
(
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
it_2
=
range_2
.
first
;
it_2
!=
range_2
.
second
;
++
it_2
)
{
size_t
count
=
0
;
for
(
size_t
j
=
i
+
k
;
j
<
length
-
k
+
1
;
j
+=
k
)
const
unsigned
int
codon_1
=
((
it_1
->
second
<<
0x2
)
|
(
it_2
->
second
>>
0x4
))
&
0x3f
;
const
unsigned
int
codon_2
=
((
it_1
->
second
<<
0x4
)
|
(
it_2
->
second
>>
0x2
))
&
0x3f
;
if
(
reference
==
codon_table
[
codon_1
])
{
if
(
!
string_match
(
string
+
start
+
i
,
string
+
start
+
j
,
k
))
{
break
;
}
// if
++
count
;
}
// for
if
(
count
>
0
&&
count
>=
max_count
)
shift
|=
FRAME_SHIFT_1
;
}
// if
if
(
reference
==
codon_table
[
codon_2
])
{
max_count
=
count
;
max_k
=
k
;
shift
|=
FRAME_SHIFT_2
;
}
// if
}
// for
if
(
max_count
>
0
)
{
if
(
last_repeat
<
i
)
if
(
shift
==
(
FRAME_SHIFT_1
|
FRAME_SHIFT_2
))
{
re
peat
.
push_back
(
Repeat
(
start
+
last_repeat
,
start
+
i
))
;
re
turn
shift
;
}
// if
repeat
.
push_back
(
Repeat
(
start
+
i
,
start
+
i
+
max_k
,
max_count
));
last_repeat
=
i
+
max_k
*
(
max_count
+
1
);
}
// if
i
+=
max_k
*
(
max_count
+
1
);
}
// while
if
(
last_repeat
<
i
)
{
repeat
.
push_back
(
Repeat
(
start
+
last_repeat
,
start
+
i
));
}
// if
return
;
}
// tandem_repeat_annotation
printf
(
"%d %d -> %d %d
\n
"
,
it_1
->
second
,
it_2
->
second
,
codon_1
,
codon_2
);
}
// for
}
// for
return
shift
;
}
// frame_shift
int
main
(
int
argc
,
char
*
argv
[]
)
void
print_codon
(
unsigned
int
const
index
)
{
if
(
argc
<
2
)
{
fprintf
(
stderr
,
"usage: %s string [start] [end]
\n
"
,
argv
[
0
]);
return
1
;
}
// if
fprintf
(
stderr
,
"Tandem Repeat Annotator
\n
"
);
printf
(
"%c%c%c"
,
IUPAC_BASE
[
index
>>
0x4
],
IUPAC_BASE
[(
index
>>
0x2
)
&
0x3
],
IUPAC_BASE
[
index
&
0x3
]);
}
// print_codon
FILE
*
file
=
fopen
(
argv
[
1
],
"r"
);
if
(
file
==
0
)
{
fprintf
(
stderr
,
"ERROR: could not open file `%s'
\n
"
,
argv
[
1
]);
return
1
;
}
// if
fseek
(
file
,
0
,
SEEK_END
);
size_t
const
length
=
ftell
(
file
);
rewind
(
file
);
char_t
*
string
=
new
char_t
[
length
];
size_t
const
ref_length
=
fread
(
string
,
sizeof
(
char_t
),
length
,
file
);
static_cast
<
void
>
(
ref_length
);
fclose
(
file
);
size_t
start
=
0
;
if
(
argc
>
2
)
{
start
=
atoi
(
argv
[
2
]);
if
(
start
>
length
)
{
start
=
0
;
}
// if
}
// if
size_t
end
=
length
;
if
(
argc
>
3
)
{
end
=
atoi
(
argv
[
3
]);
if
(
end
>
length
)
{
end
=
length
;
}
// if
}
// if
unsigned
int
reverse_complement
(
unsigned
int
const
index
)
{
return
(
~
((
index
>>
0x4
)
|
(((
index
>>
0x2
)
&
0x3
)
<<
2
)
|
((
index
&
0x3
)
<<
0x4
))
&
0x3f
);
}
// reverse_complement
std
::
vector
<
Repeat
>
repeat
;
tandem_repeat_annotation
(
repeat
,
string
,
start
,
end
);
unsigned
int
frame_shift_reverse
(
char_t
const
reference
,
char_t
const
sample_1
)
{
std
::
pair
<
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
,
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
>
const
range_1
=
codon_map
.
equal_range
(
sample_1
);
for
(
std
::
vector
<
Repeat
>::
const_iterator
it
=
repeat
.
begin
();
it
!=
repeat
.
end
();
++
it
)
unsigned
int
shift
=
FRAME_SHIFT_0
;
for
(
std
::
multimap
<
char_t
const
,
unsigned
int
const
>::
const_iterator
it_1
=
range_1
.
first
;
it_1
!=
range_1
.
second
;
++
it_1
)
{
for
(
size_t
i
=
it
->
start
;
i
<
it
->
end
;
++
i
)
{
printf
(
"%c"
,
string
[
i
]);
}
// for
if
(
it
->
count
>
0
)
if
(
reference
==
codon_table
[
reverse_complement
(
it_1
->
second
)])
{
printf
(
"%ld"
,
it
->
count
+
1
)
;
shift
|=
FRAME_SHIFT_REVERSE_IDEM
;
}
// if
printf
(
";"
);
}
// for
printf
(
"
\n
"
);
return
shift
;
}
// frame_shift_reverse
char_t
*
reverse
=
new
char_t
[
length
];
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
{
reverse
[
i
]
=
string
[
length
-
i
-
1
];
}
// for
std
::
vector
<
Repeat
>
repeat_reverse
;
tandem_repeat_annotation
(
repeat_reverse
,
reverse
,
length
-
end
,
length
-
start
);
for
(
std
::
vector
<
Repeat
>::
const_reverse_iterator
it
=
repeat_reverse
.
rbegin
();
it
!=
repeat_reverse
.
rend
();
++
it
)
int
main
(
int
,
char
*
[])
{
for
(
unsigned
int
i
=
0
;
i
<
64
;
++
i
)
{
size_t
const
repeat_length
=
it
->
end
-
it
->
start
;
for
(
size_t
i
=
0
;
i
<
repeat_length
;
++
i
)
{
printf
(
"%c"
,
reverse
[
it
->
end
-
1
-
i
]);
}
// for
if
(
it
->
count
>
0
)
{
printf
(
"%ld"
,
it
->
count
+
1
);
}
// if
printf
(
";"
);
codon_table
[
i
]
=
codon_string
[
i
];
codon_map
.
insert
(
std
::
pair
<
char_t
const
,
unsigned
int
const
>
(
codon_table
[
i
],
i
));
}
// for
printf
(
"
\n
"
);
delete
[]
reverse
;
delete
[]
string
;
unsigned
int
const
shift
=
frame_shift_reverse
(
'D'
,
'V'
);
printf
(
"%d
\n
"
,
shift
);
return
0
;
}
// main
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment