Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
extractor
Commits
3929f40f
Commit
3929f40f
authored
Sep 14, 2016
by
jkvis
Browse files
Added the Crossmapper module
parent
0853cced
Changes
2
Hide whitespace changes
Inline
Side-by-side
extractor/describe.py
View file @
3929f40f
...
@@ -15,6 +15,8 @@ from .variant import (ISeq, AISeq, ISeqList, AISeqList, DNAVar, ProteinVar,
...
@@ -15,6 +15,8 @@ from .variant import (ISeq, AISeq, ISeqList, AISeqList, DNAVar, ProteinVar,
Allele
,
ProteinAllele
,
FS
)
Allele
,
ProteinAllele
,
FS
)
from
.
import
extractor
,
util
from
.
import
extractor
,
util
from
crossmapper
import
Crossmap
def
roll
(
s
,
first
,
last
):
def
roll
(
s
,
first
,
last
):
"""
"""
...
@@ -472,7 +474,15 @@ def describe_repeats(reference, sample, units):
...
@@ -472,7 +474,15 @@ def describe_repeats(reference, sample, units):
else
:
else
:
description
=
prefix
description
=
prefix
return
description
cm
=
Crossmap
([
reference_start
+
1
,
reference_end
],
[],
1
)
for
variant
in
description
:
for
inserted
in
variant
.
inserted
:
inserted
.
start
=
cm
.
tuple2string
(
cm
.
g2x
(
inserted
.
start
))
inserted
.
end
=
cm
.
tuple2string
(
cm
.
g2x
(
inserted
.
end
))
variant
.
start
=
cm
.
tuple2string
(
cm
.
g2x
(
variant
.
start
))
variant
.
end
=
cm
.
tuple2string
(
cm
.
g2x
(
variant
.
end
))
return
description
,
reference_start
,
reference_end
def
print_var
(
variant
):
def
print_var
(
variant
):
...
...
repeat-extractor.py
View file @
3929f40f
...
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
...
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
from
extractor
import
*
from
extractor
import
*
#ref = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
#ref = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTGGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCCTCCTGCAATCCTTTAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
#alt = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTCGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCCCTCCTGCAATCCTATAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
#alt = 'AGCTGTGGGAGGGAGCCAGTGGATTTGGAAACAGAAATGGCTTCGCCTTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCCCTCCTGCAATCCTATAACTTACTGAATAACTCATGATTATGGGCCACCTGCAGGTACCATGCTAG'
#units = ['TCCT', 'GCCT']
#units = ['TCCT', 'GCCT']
...
@@ -95,39 +94,47 @@ for line in lines:
...
@@ -95,39 +94,47 @@ for line in lines:
else
:
else
:
sequences
[
label
]
=
[
string
.
strip
()]
sequences
[
label
]
=
[
string
.
strip
()]
#select = 'Amel'
select
=
'D13S317'
#unit_list = ['TATC']
unit_list
=
[
'TATC'
]
#reference = sequences[select][0]
reference
=
sequences
[
select
][
0
]
#sample = sequences[select][0]
sample
=
sequences
[
select
][
0
]
#description = describe_repeats(reference, sample, unit_list)
description
,
_
,
_
=
describe_repeats
(
reference
,
sample
,
unit_list
)
#print 'l.{}'.format(description)
print
'l.{}'
.
format
(
description
)
for
sequence
in
sequences
:
#for sequence in sequences:
best
=
0
# best = 0
for
string
in
sequences
[
sequence
]:
# for string in sequences[sequence]:
repeats
=
short_sequence_repeat_extractor
(
string
,
min_length
)
# repeats = short_sequence_repeat_extractor(string, min_length)
score
=
0
# score = 0
for
repeat
in
repeats
:
# for repeat in repeats:
if
repeat
.
count
+
1
>=
min_count
:
# if repeat.count + 1 >= min_count:
score
+=
(
repeat
.
end
-
repeat
.
start
)
*
(
repeat
.
end
-
repeat
.
start
)
*
(
repeat
.
count
+
1
)
# score += (repeat.end - repeat.start) * (repeat.end - repeat.start) * (repeat.count + 1)
if
score
>
best
:
# if score > best:
reference
=
string
# reference = string
best
=
score
# best = score
repeats
=
short_sequence_repeat_extractor
(
reference
,
min_length
)
# repeats = short_sequence_repeat_extractor(reference, min_length)
units
=
{}
# units = {}
for
repeat
in
repeats
:
# for repeat in repeats:
if
repeat
.
count
+
1
>=
min_count
:
# if repeat.count + 1 >= min_count:
units
[
reference
[
repeat
.
start
:
repeat
.
end
]]
=
repeat
.
count
+
1
# units[reference[repeat.start:repeat.end]] = repeat.count + 1
unit_list
=
[]
# unit_list = []
for
unit
in
units
:
# for unit in units:
unit_list
.
append
(
unit
)
# unit_list.append(unit)
print
sequence
,
unit_list
# print sequence,
reference
=
sequences
[
sequence
][
0
]
# if best > 0:
for
string
in
sequences
[
sequence
]:
# print unit_list
description
=
describe_repeats
(
reference
,
string
,
unit_list
)
# else:
print
'l.{}'
.
format
(
description
)
# print 'no repeat unit identified'
print
# reference = sequences[sequence][0]
# for string in sequences[sequence]:
# if best > 0:
# description, _, _ = describe_repeats(reference, string, unit_list)
# else:
# description = describe_dna(reference, string)
# print 'l.{}'.format(description)
# print
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment