Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
extractor
Commits
28668cf5
Commit
28668cf5
authored
Mar 10, 2015
by
J.K. Vis
Browse files
Recursive frame shift extraction
parent
a5071bc7
Changes
3
Hide whitespace changes
Inline
Side-by-side
extractor/extractor.cc
View file @
28668cf5
...
...
@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.cc (depends on extractor.h)
// Author: Jonathan K. Vis
// Revision: 2.
1.11
// Date: 2015/0
2
/1
6
// Revision: 2.
2.0
// Date: 2015/0
3
/1
0
// *******************************************************************
// DESCRIPTION:
// This library can be used to generete HGVS variant descriptions as
...
...
extractor/extractor.h
View file @
28668cf5
...
...
@@ -8,8 +8,8 @@
// FILE INFORMATION:
// File: extractor.h (implemented in extractor.cc)
// Author: Jonathan K. Vis
// Revision: 2.
1.9
// Date: 2015/0
2
/1
6
// Revision: 2.
2.0
// Date: 2015/0
3
/1
0
// *******************************************************************
// DESCRIPTION:
// This library can be used to generate HGVS variant descriptions as
...
...
@@ -34,7 +34,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static
char
const
*
const
VERSION
=
"2.
1.9
"
;
static
char
const
*
const
VERSION
=
"2.
2.0
"
;
// The character type used for all strings. For now it should just be
...
...
@@ -202,7 +202,7 @@ Variant_List extract(char_t const* const reference,
size_t
const
reference_length
,
char_t
const
*
const
sample
,
size_t
const
sample_length
,
int
const
type
=
TYPE_DNA
,
int
const
type
=
TYPE_DNA
,
char_t
const
*
const
codon_string
=
0
);
// *******************************************************************
...
...
@@ -230,7 +230,7 @@ size_t extract(std::vector<Variant> &variant,
size_t
const
reference_length
,
char_t
const
*
const
sample
,
size_t
const
sample_length
,
int
const
type
=
TYPE_DNA
,
int
const
type
=
TYPE_DNA
,
char_t
const
*
const
codon_string
=
0
);
// *******************************************************************
...
...
@@ -597,14 +597,6 @@ size_t annotate_frame_shift(std::vector<Variant> &variant,
size_t
const
sample_start
,
size_t
const
sample_end
);
size_t
annotate_frame_inversion
(
std
::
vector
<
Variant
>
&
variant
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
);
#if defined(__debug__)
// *******************************************************************
...
...
extractor/sandbox.cc
View file @
28668cf5
...
...
@@ -8,6 +8,9 @@ typedef char char_t;
typedef
unsigned
char
uint8_t
;
typedef
unsigned
long
long
uint64_t
;
static
unsigned
int
const
IDENTITY
=
0x01
;
static
unsigned
int
const
FRAME_SHIFT
=
0x20
;
static
uint8_t
const
FRAME_SHIFT_NONE
=
0x00
;
static
uint8_t
const
FRAME_SHIFT_1
=
0x01
;
static
uint8_t
const
FRAME_SHIFT_2
=
0x02
;
...
...
@@ -19,25 +22,68 @@ char_t const* const CODON_STRING = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGG
static
uint8_t
frame_shift_map
[
128
][
128
][
128
]
=
{{{
FRAME_SHIFT_NONE
}}};
struct
FS_Substring
struct
Variant
{
size_t
reference_start
;
size_t
reference_end
;
size_t
sample_start
;
size_t
sample_end
;
unsigned
int
type
;
size_t
weight
;
size_t
transposition_start
;
size_t
transposition_end
;
inline
Variant
(
size_t
const
reference_start
,
size_t
const
reference_end
,
size_t
const
sample_start
,
size_t
const
sample_end
,
unsigned
int
const
type
=
IDENTITY
,
size_t
const
weight
=
0
,
size_t
const
transposition_start
=
0
,
size_t
const
transposition_end
=
0
)
:
reference_start
(
reference_start
),
reference_end
(
reference_end
),
sample_start
(
sample_start
),
sample_end
(
sample_end
),
type
(
type
),
weight
(
weight
),
transposition_start
(
transposition_start
),
transposition_end
(
transposition_end
)
{
}
inline
Variant
(
void
)
{
}
};
// Variant
struct
Substring
{
size_t
reference_index
;
size_t
sample_index
;
size_t
length
;
uint8_t
type
;
union
{
bool
reverse_complement
;
uint8_t
type
;
};
// union
inline
FS_
Substring
(
size_t
const
reference_index
,
size_t
const
sample_index
,
size_t
const
length
,
uint8_t
const
typ
e
)
:
inline
Substring
(
size_t
const
reference_index
,
size_t
const
sample_index
,
size_t
const
length
,
bool
const
reverse_complement
=
fals
e
)
:
reference_index
(
reference_index
),
sample_index
(
sample_index
),
length
(
length
),
type
(
type
)
{
}
reverse_complement
(
reverse_complement
)
{
}
inline
FS_Substring
(
void
)
:
length
(
0
)
{
}
};
// FS_Substring
inline
Substring
(
size_t
const
reference_index
,
size_t
const
sample_index
,
size_t
const
length
,
uint8_t
const
type
)
:
reference_index
(
reference_index
),
sample_index
(
sample_index
),
length
(
length
),
type
(
type
)
{
}
inline
Substring
(
void
)
:
length
(
0
)
{
}
};
// Substring
uint8_t
calculate_frame_shift
(
uint64_t
const
acid_map
[],
size_t
const
reference_1
,
...
...
@@ -127,13 +173,13 @@ uint8_t frame_shift(char_t const reference_1,
return
frame_shift_map
[
reference_1
&
0x7f
][
reference_2
&
0x7f
][
sample
&
0x7f
];
}
// frame_shift
size_t
lcs_frame_shift
(
std
::
vector
<
FS_
Substring
>
&
substring
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
)
size_t
lcs_frame_shift
(
std
::
vector
<
Substring
>
&
substring
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
)
{
size_t
const
reference_length
=
reference_end
-
reference_start
;
size_t
const
sample_length
=
sample_end
-
sample_start
;
...
...
@@ -153,14 +199,14 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
lcs
[
1
][
i
][
4
]
=
0
;
}
// for
FS_
Substring
fs_substring
[
5
];
Substring
fs_substring
[
5
];
for
(
size_t
i
=
0
;
i
<
sample_length
;
++
i
)
{
uint8_t
const
shift_reverse
=
frame_shift
(
reference
[
reference_end
-
1
],
reference
[
reference_end
-
2
],
sample
[
sample_start
+
i
]);
if
((
shift_reverse
&
FRAME_SHIFT_REVERSE
)
==
FRAME_SHIFT_REVERSE
)
{
lcs
[
i
%
2
][
0
][
2
]
=
1
;
fs_substring
[
2
]
=
FS_
Substring
(
reference_start
,
sample_start
+
i
,
lcs
[
i
%
2
][
0
][
2
],
FRAME_SHIFT_REVERSE
);
fs_substring
[
2
]
=
Substring
(
reference_start
,
sample_start
+
i
,
lcs
[
i
%
2
][
0
][
2
],
FRAME_SHIFT_REVERSE
);
}
// if
for
(
size_t
j
=
1
;
j
<
reference_length
;
++
j
)
{
...
...
@@ -188,27 +234,27 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
}
// if
if
(
lcs
[
i
%
2
][
j
][
0
]
>
fs_substring
[
0
].
length
)
{
fs_substring
[
0
]
=
FS_
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
0
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
0
]
+
1
,
lcs
[
i
%
2
][
j
][
0
],
FRAME_SHIFT_1
);
fs_substring
[
0
]
=
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
0
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
0
]
+
1
,
lcs
[
i
%
2
][
j
][
0
],
FRAME_SHIFT_1
);
}
// if
if
(
lcs
[
i
%
2
][
j
][
1
]
>
fs_substring
[
1
].
length
)
{
fs_substring
[
1
]
=
FS_
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
1
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
1
]
+
1
,
lcs
[
i
%
2
][
j
][
1
],
FRAME_SHIFT_2
);
fs_substring
[
1
]
=
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
1
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
1
]
+
1
,
lcs
[
i
%
2
][
j
][
1
],
FRAME_SHIFT_2
);
}
// if
if
(
lcs
[
i
%
2
][
j
][
2
]
>
fs_substring
[
2
].
length
)
{
fs_substring
[
2
]
=
FS_
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
2
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
2
],
lcs
[
i
%
2
][
j
][
2
],
FRAME_SHIFT_REVERSE
);
fs_substring
[
2
]
=
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
2
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
2
],
lcs
[
i
%
2
][
j
][
2
],
FRAME_SHIFT_REVERSE
);
}
// if
if
(
lcs
[
i
%
2
][
j
][
3
]
>
fs_substring
[
3
].
length
)
{
fs_substring
[
3
]
=
FS_
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
3
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
3
]
+
1
,
lcs
[
i
%
2
][
j
][
3
],
FRAME_SHIFT_REVERSE_1
);
fs_substring
[
3
]
=
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
3
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
3
]
+
1
,
lcs
[
i
%
2
][
j
][
3
],
FRAME_SHIFT_REVERSE_1
);
}
// if
if
(
lcs
[
i
%
2
][
j
][
4
]
>
fs_substring
[
4
].
length
)
{
fs_substring
[
4
]
=
FS_
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
4
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
4
]
+
1
,
lcs
[
i
%
2
][
j
][
4
],
FRAME_SHIFT_REVERSE_2
);
fs_substring
[
4
]
=
Substring
(
reference_start
+
j
-
lcs
[
i
%
2
][
j
][
4
],
sample_start
+
i
-
lcs
[
i
%
2
][
j
][
4
]
+
1
,
lcs
[
i
%
2
][
j
][
4
],
FRAME_SHIFT_REVERSE_2
);
}
// if
}
// for
}
// for
substring
=
std
::
vector
<
FS_
Substring
>
(
1
,
fs_substring
[
0
]);
substring
=
std
::
vector
<
Substring
>
(
1
,
fs_substring
[
0
]);
substring
.
push_back
(
fs_substring
[
1
]);
substring
.
push_back
(
fs_substring
[
2
]);
substring
.
push_back
(
fs_substring
[
3
]);
...
...
@@ -216,69 +262,89 @@ size_t lcs_frame_shift(std::vector<FS_Substring> &substring,
return
0
;
}
// lcs_frame_shift
FS_Substring
maximize_frame_shift
(
std
::
vector
<
FS_Substring
>
const
&
substring
)
size_t
extractor_frame_shift
(
std
::
vector
<
Variant
>
&
annotation
,
char_t
const
*
const
reference
,
size_t
const
reference_start
,
size_t
const
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
)
{
FS_Substring
fs_substring
;
size_t
max_length
=
0
;
size_t
const
reference_length
=
reference_end
-
reference_start
;
size_t
const
sample_length
=
sample_end
-
sample_start
;
// First the base cases to end the recursion.
if
(
reference_length
<=
0
&&
sample_length
<=
0
)
{
return
0
;
}
// if
// Calculate the frame shift LCS of the two strings.
std
::
vector
<
Substring
>
substring
;
lcs_frame_shift
(
substring
,
reference
,
reference_start
,
reference_end
,
sample
,
sample_start
,
sample_end
);
// Pick the ``best fitting'' frame shift LCS, i.e., pushed as far to
// the start of the reference string as possible. Also update in
// case of compound frame shifts.
Substring
lcs
(
0
,
0
,
0
,
FRAME_SHIFT_NONE
);
for
(
size_t
i
=
0
;
i
<
5
;
++
i
)
{
if
(
substring
[
i
].
length
>
max_length
)
if
(
substring
[
i
].
length
>
lcs
.
length
||
(
substring
[
i
].
length
==
lcs
.
length
&&
substring
[
i
].
reference_index
<
lcs
.
reference_index
))
{
max_length
=
substring
[
i
].
length
;
fs_substring
=
substring
[
i
];
lcs
=
substring
[
i
];
}
// if
else
if
(
substring
[
i
].
length
==
max_length
)
// Update compound frame shifts, e.g.,
// FRAME_SHIFT_1 | FRAME_SHIFT_2
else
if
(
substring
[
i
].
length
==
lcs
.
length
&&
substring
[
i
].
reference_index
==
lcs
.
reference_index
&&
substring
[
i
].
sample_index
==
lcs
.
sample_index
)
{
if
((
fs_substring
.
type
==
FRAME_SHIFT_1
&&
substring
[
i
].
type
==
FRAME_SHIFT_2
)
||
(
fs_substring
.
type
==
FRAME_SHIFT_2
&&
substring
[
i
].
type
==
FRAME_SHIFT_1
)
||
(
fs_substring
.
type
==
FRAME_SHIFT_REVERSE_1
&&
substring
[
i
].
type
==
FRAME_SHIFT_REVERSE_2
)
||
(
fs_substring
.
type
==
FRAME_SHIFT_REVERSE_2
&&
substring
[
i
].
type
==
FRAME_SHIFT_REVERSE_1
))
{
fs_substring
.
type
|=
substring
[
i
].
type
;
}
// if
lcs
.
type
|=
substring
[
i
].
type
;
}
// if
}
// for
return
fs_substring
;
}
// maximize_frame_shift
// No LCS found: no frame shift annotation.
if
(
lcs
.
length
<=
0
)
{
return
0
;
}
// if
// Recursively apply this function to the prefixes of the strings.
std
::
vector
<
Variant
>
prefix
;
extractor_frame_shift
(
prefix
,
reference
,
reference_start
,
lcs
.
reference_index
,
sample
,
sample_start
,
lcs
.
sample_index
);
// Recursively apply this function to the suffixes of the strings.
std
::
vector
<
Variant
>
suffix
;
extractor_frame_shift
(
suffix
,
reference
,
lcs
.
reference_index
+
lcs
.
length
,
reference_end
,
sample
,
lcs
.
sample_index
+
lcs
.
length
,
sample_end
);
// Add all variants (in order) to the annotation vector.
annotation
.
insert
(
annotation
.
end
(),
prefix
.
begin
(),
prefix
.
end
());
annotation
.
push_back
(
Variant
(
lcs
.
reference_index
,
lcs
.
reference_index
+
lcs
.
length
,
lcs
.
sample_index
,
lcs
.
sample_index
+
lcs
.
length
,
FRAME_SHIFT
|
lcs
.
type
));
annotation
.
insert
(
annotation
.
end
(),
suffix
.
begin
(),
suffix
.
end
());
return
0
;
}
// extractor_frame_shift
int
main
(
int
,
char
*
[])
{
initialize_frame_shift_map
(
CODON_STRING
);
char_t
const
*
const
reference
=
"MDYSL"
;
size_t
const
start
=
1
;
size_t
const
end
=
5
;
std
::
vector
<
FS_Substring
>
substring
;
FS_Substring
fs_lcs
;
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MALFP"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MATIP"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MTIPW"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MLFPG"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MQGIV"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MAGNS"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MDRE*"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MRE*S"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
lcs_frame_shift
(
substring
,
reference
,
start
,
end
,
"MGNSP"
,
start
,
end
);
fs_lcs
=
maximize_frame_shift
(
substring
);
printf
(
"fs_lcs: %d @ %ld, %ld length = %ld
\n
"
,
fs_lcs
.
type
,
fs_lcs
.
reference_index
,
fs_lcs
.
sample_index
,
fs_lcs
.
length
);
std
::
vector
<
Variant
>
annotation
;
extractor_frame_shift
(
annotation
,
"MDYSLAAALTLHGH"
,
1
,
11
,
"MTIPWRSPHFHGH"
,
1
,
10
);
// Printing the variants.
fprintf
(
stdout
,
"Annotation (%ld):
\n
"
,
annotation
.
size
());
for
(
std
::
vector
<
Variant
>::
iterator
it
=
annotation
.
begin
();
it
!=
annotation
.
end
();
++
it
)
{
fprintf
(
stdout
,
"%ld--%ld, %ld--%ld, %d, %ld, %ld--%ld
\n
"
,
it
->
reference_start
,
it
->
reference_end
,
it
->
sample_start
,
it
->
sample_end
,
it
->
type
,
it
->
weight
,
it
->
transposition_start
,
it
->
transposition_end
);
}
// for
return
0
;
}
// main
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment