Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
extractor
Commits
82e1ba61
Commit
82e1ba61
authored
Sep 12, 2016
by
jkvis
Browse files
Added repeat extractor
parent
f8c3a266
Changes
7
Hide whitespace changes
Inline
Side-by-side
extractor/__init__.py
View file @
82e1ba61
...
...
@@ -26,4 +26,5 @@ __homepage__ = 'https://github.com/mutalyzer/description-extractor'
describe_dna
=
describe
.
describe_dna
describe_protein
=
describe
.
describe_protein
describe_repeats
=
describe
.
describe_repeats
extract
=
extractor
.
extract
extractor/debug.cc
View file @
82e1ba61
...
...
@@ -26,7 +26,7 @@ using namespace std;
// Entry point.
int
main
(
int
argc
,
char
*
argv
[])
{
/*
if
(
argc
<
3
)
{
fprintf
(
stderr
,
"usage: %s reference sample
\n
"
,
argv
[
0
]);
...
...
@@ -64,42 +64,53 @@ int main(int argc, char* argv[])
size_t
const
alt_length
=
fread
(
sample
,
sizeof
(
char_t
),
sample_length
,
file
);
static_cast
<
void
>
(
alt_length
);
fclose
(
file
);
*/
string
header
[
4305
];
string
protein
[
4305
];
/*
int const N = 359;
string header[N];
string protein[N];
for
(
int
i
=
0
;
i
<
4305
;
++
i
)
fprintf(stdout, "\t");
for (int i = 0; i < N; ++i)
{
getline
(
cin
,
header
[
i
])
;
getline
(
cin
,
protein
[
i
]
);
cin >> header[i] >> protein[i]
;
fprintf(stdout, "%s\t", header[i].c_str()
);
} // for
fprintf(stdout, "\n");
for
(
int
i
=
0
;
i
<
4305
;
++
i
)
for (int i = 0; i <
N
; ++i)
{
cerr << i << endl;
double
best
=
1.
f
;
for
(
int
j
=
i
+
1
;
j
<
4305
;
++
j
)
fprintf(stdout, "%s\t", header[i].c_str())
;
for (int j =
0
; j <
N
; ++j)
{
vector
<
Variant
>
variant
;
extract
(
variant
,
protein
[
i
].
c_str
(),
protein
[
i
].
length
(),
protein
[
j
].
c_str
(),
protein
[
j
].
length
(),
TYPE_PROTEIN
,
"KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"
);
for
(
std
::
vector
<
Variant
>::
iterator
it
=
variant
.
begin
();
it
!=
variant
.
end
();
++
it
)
if (i == j)
{
if
(
it
->
type
>=
FRAME_SHIFT
&&
best
>
it
->
probability
)
fprintf(stdout, "%.10e\t", 0.);
continue;
} // if
vector<Variant> variants;
extract(variants, protein[i].c_str(), protein[i].length(), protein[j].c_str(), protein[j].length(), TYPE_PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF");
size_t best = 0;
for (std::vector<Variant>::iterator it = variants.begin(); it != variants.end(); ++it)
{
if (it->type >= FRAME_SHIFT && it->reference_end - it->reference_start > best)
{
best
=
it
->
probability
;
fprintf
(
stdout
,
"%.9s %.9s %ld--%ld, %ld--%ld, %d, %.10e
\n
"
,
header
[
i
].
c_str
(),
header
[
j
].
c_str
(),
it
->
reference_start
,
it
->
reference_end
,
it
->
sample_start
,
it
->
sample_end
,
it
->
type
,
1.
f
-
it
->
probability
);
best = it->reference_end - it->reference_start;
} // if
} // for
size_t const length = min(protein[i].length(), protein[j].length());
fprintf(stdout, "%.10e\t", 1. - static_cast<double>(best) / static_cast<double>(length));
} // for
fprintf(stdout, "\n");
} // for
*/
/*
// The actual extraction.
std
::
vector
<
Variant
>
variant
;
size_t const weight = extract(variant, reference, reference_length - 1, sample, sample_length - 1, TYPE_
PROTEIN, "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"
);
size_t
const
weight
=
extract
(
variant
,
reference
,
reference_length
-
1
,
sample
,
sample_length
-
1
,
TYPE_
DNA
);
// Printing the variants.
...
...
@@ -109,6 +120,18 @@ int main(int argc, char* argv[])
if
(
it
->
type
>=
FRAME_SHIFT
)
{
fprintf
(
stdout
,
"%ld--%ld, %ld--%ld, %d, %lf, %ld--%ld
\n
"
,
it
->
reference_start
,
it
->
reference_end
,
it
->
sample_start
,
it
->
sample_end
,
it
->
type
,
1.
f
-
it
->
probability
,
it
->
transposition_start
,
it
->
transposition_end
);
char_t
ref_DNA
[(
it
->
reference_end
-
it
->
reference_start
)
*
3
];
char_t
alt_DNA
[(
it
->
reference_end
-
it
->
reference_start
)
*
3
];
backtranslation
(
ref_DNA
,
alt_DNA
,
reference
,
it
->
reference_start
,
sample
,
it
->
sample_start
,
it
->
reference_end
-
it
->
reference_start
,
it
->
type
);
fprintf
(
stdout
,
"ref_DNA: "
);
fwrite
(
ref_DNA
,
sizeof
(
char_t
),
(
it
->
reference_end
-
it
->
reference_start
)
*
3
,
stdout
);
fprintf
(
stdout
,
"
\n
ref_pro: "
);
fwrite
(
reference
+
it
->
reference_start
,
sizeof
(
char_t
),
(
it
->
reference_end
-
it
->
reference_start
),
stdout
);
fprintf
(
stdout
,
"
\n
alt_DNA: "
);
fwrite
(
alt_DNA
,
sizeof
(
char_t
),
(
it
->
reference_end
-
it
->
reference_start
)
*
3
,
stdout
);
fprintf
(
stdout
,
"
\n
alt_pro: "
);
fwrite
(
sample
+
it
->
sample_start
,
sizeof
(
char_t
),
(
it
->
reference_end
-
it
->
reference_start
),
stdout
);
fprintf
(
stdout
,
"
\n
"
);
}
// if
else
{
...
...
@@ -120,7 +143,7 @@ int main(int argc, char* argv[])
// Cleaning up.
delete
[]
reference
;
delete
[]
sample
;
*/
return
0
;
}
// main
extractor/describe.py
View file @
82e1ba61
...
...
@@ -103,7 +103,7 @@ def var_to_dna_var(s1, s2, var, seq_list=[], weight_position=1):
:arg unicode s2: Sample sequence.
:arg str var: Variant from the extractor module.
:arg str seq_list: Container for an inserted sequence.
:arg
str
weight_position: Weight of a position.
:arg
int
weight_position: Weight of a position.
"""
# Unknown.
if
s1
==
'?'
or
s2
==
'?'
:
...
...
@@ -120,7 +120,10 @@ def var_to_dna_var(s1, s2, var, seq_list=[], weight_position=1):
var
.
sample_start
+=
shift3
var
.
sample_end
+=
shift3
if
(
var
.
sample_start
-
ins_length
>=
0
and
'$'
not
in
s1
[
var
.
reference_start
-
ins_length
:
var
.
reference_start
]
and
'$'
not
in
s2
[
var
.
sample_start
:
var
.
sample_end
]
and
s1
[
var
.
reference_start
-
ins_length
:
var
.
reference_start
]
==
s2
[
var
.
sample_start
:
var
.
sample_end
]):
# NOTE: We may want to omit the inserted / deleted sequence and
...
...
@@ -339,6 +342,90 @@ def describe_dna(s1, s2):
return
description
def
mask_string
(
string
,
units
):
MASK
=
'$'
for
unit
in
units
:
found
=
string
.
find
(
unit
)
if
found
!=
-
1
:
string
=
string
.
replace
(
unit
,
MASK
*
len
(
unit
))
return
string
def
get_repeats
(
string
,
unit
):
repeats
=
[]
found
=
string
.
find
(
unit
,
0
)
while
found
!=
-
1
:
last
=
len
(
repeats
)
-
1
if
last
>
-
1
and
repeats
[
last
][
'start'
]
+
repeats
[
last
][
'count'
]
*
len
(
unit
)
==
found
:
repeats
[
last
][
'count'
]
+=
1
else
:
repeats
.
append
({
'start'
:
found
,
'count'
:
1
,
'unit'
:
unit
})
found
=
string
.
find
(
unit
,
found
+
1
)
return
repeats
def
describe_repeats
(
reference
,
sample
,
units
):
masked_ref
=
mask_string
(
reference
,
units
)
masked_alt
=
mask_string
(
sample
,
units
)
repeats
=
[]
for
unit
in
units
:
repeats
+=
get_repeats
(
sample
,
unit
)
repeats
=
sorted
(
repeats
,
key
=
lambda
repeat
:
repeat
[
'start'
])
ref_swig
=
util
.
swig_str
(
masked_ref
)
alt_swig
=
util
.
swig_str
(
masked_alt
)
extracted
=
extractor
.
extract
(
ref_swig
[
0
],
ref_swig
[
1
],
alt_swig
[
0
],
alt_swig
[
1
],
extractor
.
TYPE_DNA
)
in_transposition
=
0
index
=
0
repeat
=
0
description
=
Allele
()
seq_list
=
ISeqList
()
for
variant
in
extracted
.
variants
:
while
variant
.
sample_start
>
index
:
seq_list
.
append
(
DNAVar
(
type
=
'repeat'
,
inserted
=
repeats
[
repeat
][
'unit'
],
count
=
repeats
[
repeat
][
'count'
]))
index
=
repeats
[
repeat
][
'start'
]
+
repeats
[
repeat
][
'count'
]
*
len
(
repeats
[
repeat
][
'unit'
])
repeat
+=
1
if
variant
.
type
&
extractor
.
TRANSPOSITION_OPEN
:
in_transposition
+=
1
if
in_transposition
:
if
variant
.
type
&
extractor
.
IDENTITY
:
seq_list
.
append
(
ISeq
(
start
=
variant
.
transposition_start
+
1
,
end
=
variant
.
transposition_end
,
reverse
=
False
,
weight_position
=
extracted
.
weight_position
))
elif
variant
.
type
&
extractor
.
REVERSE_COMPLEMENT
:
seq_list
.
append
(
ISeq
(
start
=
variant
.
transposition_start
+
1
,
end
=
variant
.
transposition_end
,
reverse
=
True
,
weight_position
=
extracted
.
weight_position
))
else
:
seq_list
.
append
(
ISeq
(
sequence
=
sample
[
variant
.
sample_start
:
variant
.
sample_end
],
weight_position
=
extracted
.
weight_position
))
elif
variant
.
type
&
extractor
.
IDENTITY
:
seq_list
.
append
(
ISeq
(
start
=
variant
.
reference_start
+
1
,
end
=
variant
.
reference_end
,
reverse
=
False
,
weight_position
=
extracted
.
weight_position
))
elif
variant
.
type
&
extractor
.
REVERSE_COMPLEMENT
:
seq_list
.
append
(
ISeq
(
start
=
variant
.
reference_start
+
1
,
end
=
variant
.
reference_end
,
reverse
=
True
,
weight_position
=
extracted
.
weight_position
))
else
:
seq_list
.
append
(
ISeq
(
sequence
=
sample
[
variant
.
sample_start
:
variant
.
sample_end
],
weight_position
=
extracted
.
weight_position
))
if
variant
.
type
&
extractor
.
TRANSPOSITION_CLOSE
:
in_transposition
-=
1
index
=
variant
.
sample_end
description
.
append
(
DNAVar
(
start
=
1
,
end
=
len
(
reference
),
sample_start
=
1
,
sample_end
=
len
(
sample
),
type
=
'delins'
,
inserted
=
seq_list
))
print
(
description
)
def
print_var
(
variant
):
print
(
'({:3}, {:3}), ({:3}, {:3}), {:08b}, {}, {}'
.
format
(
variant
.
reference_start
,
variant
.
reference_end
,
variant
.
sample_start
,
variant
.
sample_end
,
...
...
extractor/extractor.cc
View file @
82e1ba61
...
...
@@ -67,6 +67,10 @@ static uint64_t acid_map[128] = {0x0ull};
static
double
acid_frequency
[
128
]
=
{
.0
f
};
static
double
frame_shift_frequency
[
128
][
128
][
5
]
=
{{{
.0
f
}}};
// This character is always ignored when LCS matching and can be used for
// repeat masking
static
char_t
const
MASK
=
'$'
;
// Only used to interface to Python: calls the C++ extract function.
Variant_List
extract
(
char_t
const
*
const
reference
,
size_t
const
reference_length
,
...
...
@@ -200,12 +204,40 @@ size_t extract(std::vector<Variant> &variant,
size_t
extractor
(
std
::
vector
<
Variant
>
&
variant
,
char_t
const
*
const
reference
,
char_t
const
*
const
complement
,
size_t
const
reference_start
,
size_t
const
reference_end
,
size_t
reference_start
,
size_t
reference_end
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
sample_end
)
size_t
sample_start
,
size_t
sample_end
)
{
// First do prefix and suffix matching on the MASK character
size_t
i
=
0
;
while
(
reference_start
+
i
<
reference_end
&&
reference
[
reference_start
+
i
]
==
MASK
)
{
++
i
;
}
// while
reference_start
+=
i
;
i
=
0
;
while
(
reference_end
-
i
-
1
>
reference_start
&&
reference
[
reference_end
-
i
-
1
]
==
MASK
)
{
++
i
;
}
// while
reference_end
-=
i
;
i
=
0
;
while
(
sample_start
+
i
<
sample_end
&&
sample
[
sample_start
+
i
]
==
MASK
)
{
++
i
;
}
// while
sample_start
+=
i
;
i
=
0
;
while
(
sample_end
-
i
-
1
>
sample_start
&&
sample
[
sample_end
-
i
-
1
]
==
MASK
)
{
++
i
;
}
// while
sample_end
-=
i
;
size_t
const
reference_length
=
reference_end
-
reference_start
;
size_t
const
sample_length
=
sample_end
-
sample_start
;
...
...
@@ -911,11 +943,11 @@ size_t LCS(std::vector<Substring> &substring,
// The initial k.
size_t
k
=
reference_length
>
sample_length
?
sample_length
/
4
:
reference_length
/
4
;
size_t
k
=
reference_length
>
sample_length
?
sample_length
/
8
:
reference_length
/
8
;
// Reduce k until the cut-off is reached.
while
(
k
>
4
&&
k
>
cut_off
)
while
(
k
>
8
&&
k
>
cut_off
)
{
...
...
@@ -990,7 +1022,7 @@ size_t LCS_1(std::vector<Substring> &substring,
for
(
size_t
j
=
0
;
j
<
reference_length
;
++
j
)
{
// A match
if
(
reference
[
reference_start
+
j
]
==
sample
[
sample_start
+
i
])
if
(
reference
[
reference_start
+
j
]
==
sample
[
sample_start
+
i
]
&&
reference
[
reference_start
+
j
]
!=
MASK
)
{
if
(
i
==
0
||
j
==
0
)
{
...
...
@@ -1024,7 +1056,7 @@ size_t LCS_1(std::vector<Substring> &substring,
// If applicable check for a LCS in reverse complement space.
// The same code is used as before but the complement string is
// travesed backwards (towards the start).
if
(
complement
!=
0
&&
complement
[
reference_end
-
j
-
1
]
==
sample
[
sample_start
+
i
])
if
(
complement
!=
0
&&
complement
[
reference_end
-
j
-
1
]
==
sample
[
sample_start
+
i
]
&&
complement
[
reference_end
-
j
-
1
]
!=
MASK
)
{
if
(
i
==
0
||
j
==
0
)
{
...
...
@@ -1206,7 +1238,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the right.
{
size_t
i
=
0
;
while
(
i
<=
k
&&
it
->
reference_index
+
it
->
length
+
i
<
reference_end
&&
it
->
sample_index
+
it
->
length
+
i
<
sample_end
&&
reference
[
it
->
reference_index
+
it
->
length
+
i
]
==
sample
[
it
->
sample_index
+
it
->
length
+
i
])
while
(
i
<=
k
&&
it
->
reference_index
+
it
->
length
+
i
<
reference_end
&&
it
->
sample_index
+
it
->
length
+
i
<
sample_end
&&
reference
[
it
->
reference_index
+
it
->
length
+
i
]
==
sample
[
it
->
sample_index
+
it
->
length
+
i
]
&&
reference
[
it
->
reference_index
+
it
->
length
+
i
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1215,7 +1247,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the left.
{
size_t
i
=
0
;
while
(
i
<=
k
&&
it
->
reference_index
-
i
-
1
>=
reference_start
&&
it
->
sample_index
-
i
-
1
>=
sample_start
&&
reference
[
it
->
reference_index
-
i
-
1
]
==
sample
[
it
->
sample_index
-
i
-
1
])
while
(
i
<=
k
&&
it
->
reference_index
-
i
-
1
>=
reference_start
&&
it
->
sample_index
-
i
-
1
>=
sample_start
&&
reference
[
it
->
reference_index
-
i
-
1
]
==
sample
[
it
->
sample_index
-
i
-
1
]
&&
reference
[
it
->
reference_index
-
i
-
1
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1232,7 +1264,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the right (sample orientation).
{
size_t
i
=
0
;
while
(
i
<=
k
&&
it
->
reference_index
-
i
-
1
>=
reference_start
&&
it
->
sample_index
+
it
->
length
+
i
<
sample_end
&&
complement
[
it
->
reference_index
-
i
-
1
]
==
sample
[
it
->
sample_index
+
it
->
length
+
i
])
while
(
i
<=
k
&&
it
->
reference_index
-
i
-
1
>=
reference_start
&&
it
->
sample_index
+
it
->
length
+
i
<
sample_end
&&
complement
[
it
->
reference_index
-
i
-
1
]
==
sample
[
it
->
sample_index
+
it
->
length
+
i
]
&&
complement
[
it
->
reference_index
-
i
-
1
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1242,7 +1274,7 @@ size_t LCS_k(std::vector<Substring> &substring,
// Extending to the left (sample orientation).
{
size_t
i
=
0
;
while
(
i
<=
k
&&
it
->
reference_index
+
it
->
length
+
i
<
reference_end
&&
it
->
sample_index
-
i
-
1
>=
sample_start
&&
complement
[
it
->
reference_index
+
it
->
length
+
i
]
==
sample
[
it
->
sample_index
-
i
-
1
])
while
(
i
<=
k
&&
it
->
reference_index
+
it
->
length
+
i
<
reference_end
&&
it
->
sample_index
-
i
-
1
>=
sample_start
&&
complement
[
it
->
reference_index
+
it
->
length
+
i
]
==
sample
[
it
->
sample_index
-
i
-
1
]
&&
complement
[
it
->
reference_index
+
it
->
length
+
i
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1407,7 +1439,7 @@ bool string_match(char_t const* const string_1,
{
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
{
if
(
string_1
[
i
]
!=
string_2
[
i
])
if
(
string_1
[
i
]
!=
string_2
[
i
]
||
string_1
[
i
]
==
MASK
)
{
return
false
;
}
// if
...
...
@@ -1425,7 +1457,7 @@ bool string_match_reverse(char_t const* const string_1,
{
for
(
size_t
i
=
0
;
i
<
length
;
++
i
)
{
if
(
string_1
[
-
i
]
!=
string_2
[
i
])
if
(
string_1
[
-
i
]
!=
string_2
[
i
]
||
string_1
[
-
i
]
==
MASK
)
{
return
false
;
}
// if
...
...
@@ -1445,7 +1477,7 @@ size_t prefix_match(char_t const* const reference,
// Traverse both strings towards the end as long as their characters
// are equal. Do NOT exceed the length of the strings.
while
(
i
<
reference_length
&&
i
<
sample_length
&&
reference
[
i
]
==
sample
[
i
])
while
(
i
<
reference_length
&&
i
<
sample_length
&&
reference
[
i
]
==
sample
[
i
]
&&
reference
[
i
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1466,7 +1498,7 @@ size_t suffix_match(char_t const* const reference,
// Start at the end of both strings and traverse towards the start
// as long as their characters are equal. Do not exceed the length
// of the strings.
while
(
i
<
reference_length
-
prefix
&&
i
<
sample_length
-
prefix
&&
reference
[
reference_length
-
i
-
1
]
==
sample
[
sample_length
-
i
-
1
])
while
(
i
<
reference_length
-
prefix
&&
i
<
sample_length
-
prefix
&&
reference
[
reference_length
-
i
-
1
]
==
sample
[
sample_length
-
i
-
1
]
&&
reference
[
reference_length
-
i
-
1
]
!=
MASK
)
{
++
i
;
}
// while
...
...
@@ -1506,8 +1538,8 @@ char_t const* IUPAC_complement(char_t const* const string,
return
complement
;
}
// IUPAC_complement
void
backtranslation
(
size
_t
ref
erence
_DNA
[],
size
_t
sample
_DNA
[],
void
backtranslation
(
char
_t
ref_DNA
[],
char
_t
alt
_DNA
[],
char_t
const
*
const
reference
,
size_t
const
reference_start
,
char_t
const
*
const
sample
,
...
...
@@ -1515,11 +1547,14 @@ void backtranslation(size_t reference_DNA[],
size_t
const
length
,
uint8_t
const
type
)
{
for
(
size_t
i
=
0
;
i
<
3
*
length
;
++
i
)
size_t
reference_DNA
[
3
*
length
];
size_t
sample_DNA
[
3
*
length
];
for
(
size_t
i
=
0
;
i
<
3
*
length
;
i
++
)
{
reference_DNA
[
i
]
=
0
;
sample_DNA
[
i
]
=
0
;
}
// for
for
(
size_t
p
=
0
;
p
<
length
;
++
p
)
{
for
(
size_t
i
=
0
;
i
<
64
;
++
i
)
...
...
@@ -1603,6 +1638,11 @@ void backtranslation(size_t reference_DNA[],
}
// if
}
// for
}
// for
for
(
size_t
i
=
0
;
i
<
3
*
length
;
i
++
)
{
ref_DNA
[
i
]
=
IUPAC_ALPHA
[
reference_DNA
[
i
]];
alt_DNA
[
i
]
=
IUPAC_ALPHA
[
sample_DNA
[
i
]];
}
// for
return
;
}
// backtranslation
...
...
extractor/extractor.h
View file @
82e1ba61
...
...
@@ -28,7 +28,7 @@ namespace mutalyzer
{
// Version string for run-time identification.
static
char
const
*
const
VERSION
=
"2.3.
3
"
;
static
char
const
*
const
VERSION
=
"2.3.
4
"
;
// The character type used for all strings. For now it should just be
...
...
@@ -668,6 +668,16 @@ uint8_t frame_shift(char_t const reference_1,
char_t
const
sample
);
void
backtranslation
(
char_t
reference_DNA
[],
char_t
sample_DNA
[],
char_t
const
*
const
reference
,
size_t
const
reference_start
,
char_t
const
*
const
sample
,
size_t
const
sample_start
,
size_t
const
length
,
uint8_t
const
type
);
#if defined(__debug__)
// *******************************************************************
// Dprint_truncated function
...
...
extractor/variant.py
View file @
82e1ba61
...
...
@@ -121,6 +121,9 @@ class ISeq(object):
if
not
(
self
.
start
or
self
.
end
):
return
''
if
self
.
start
==
self
.
end
:
return
'{}'
.
format
(
self
.
start
)
inverted
=
'inv'
if
self
.
reverse
else
''
return
'{0}_{1}{2}'
.
format
(
self
.
start
,
self
.
end
,
inverted
)
...
...
@@ -180,7 +183,9 @@ class AISeq(object):
return
self
.
sequence
if
self
.
type
==
'trans'
:
return
'{}_{}'
.
format
(
self
.
start
,
self
.
end
)
if
self
.
start
!=
self
.
end
:
return
'{}_{}'
.
format
(
self
.
start
,
self
.
end
)
return
'{}'
.
format
(
self
.
start
)
return
'{}_{}{}|{}'
.
format
(
self
.
start
,
self
.
end
,
self
.
sequence
,
'|'
.
join
(
self
.
frames
))
...
...
@@ -193,7 +198,7 @@ class DNAVar(object):
"""
def
__init__
(
self
,
start
=
0
,
start_offset
=
0
,
end
=
0
,
end_offset
=
0
,
sample_start
=
0
,
sample_start_offset
=
0
,
sample_end
=
0
,
sample_end_offset
=
0
,
sample_start_offset
=
0
,
sample_end
=
0
,
sample_end_offset
=
0
,
count
=
0
,
type
=
'none'
,
deleted
=
ISeqList
([
ISeq
()]),
inserted
=
ISeqList
([
ISeq
()]),
shift
=
0
,
weight_position
=
1
):
"""
...
...
@@ -207,6 +212,7 @@ class DNAVar(object):
:arg int sample_start_offset:
:arg int sample_end: End position.
:arg int sample_end_offset:
:arg int count
:arg unicode type: Variant type.
:arg unicode deleted: Deleted part of the reference sequence.
:arg ISeqList inserted: Inserted part.
...
...
@@ -222,6 +228,7 @@ class DNAVar(object):
self
.
sample_start_offset
=
sample_start_offset
self
.
sample_end
=
sample_end
self
.
sample_end_offset
=
sample_end_offset
self
.
count
=
count
self
.
type
=
type
self
.
deleted
=
deleted
self
.
inserted
=
inserted
...
...
@@ -241,6 +248,9 @@ class DNAVar(object):
if
self
.
type
==
'none'
:
return
'='
if
self
.
count
>
0
:
return
'{0}({1})'
.
format
(
self
.
inserted
,
self
.
count
)
description
=
str
(
self
.
start
)
if
self
.
start
!=
self
.
end
:
...
...
repeat-extractor.py
0 → 100644
View file @
82e1ba61
#!/usr/bin/env python
from
__future__
import
unicode_literals
from
extractor
import
*
ref
=
'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
alt
=
'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
units
=
[
'CTTC'
]
print
'version:'
,
extractor
.
VERSION
describe_repeats
(
ref
,
alt
,
units
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment