Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mirrors
extractor
Commits
0ad3392d
Commit
0ad3392d
authored
Mar 07, 2016
by
jkvis
Browse files
Merge pull request #11 from mutalyzer/describe_protein
Describe protein
parents
674bf74a
b2de445d
Changes
9
Hide whitespace changes
Inline
Side-by-side
.travis.yml
View file @
0ad3392d
...
...
@@ -6,6 +6,7 @@ python:
-
"
2.7"
-
"
3.3"
-
"
3.4"
-
"
3.5"
addons
:
apt_packages
:
swig
before_install
:
pip install pytest
...
...
extractor/describe.py
View file @
0ad3392d
...
...
@@ -5,74 +5,15 @@ other.
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
unicode_literals
)
import
math
from
.variant
import
(
ISeq
,
ISeqList
,
DNAVar
,
ProteinVar
,
Allele
,
ProteinAllele
,
FrameShiftAnnotationList
,
FrameShiftAnnotation
)
from
.
import
extractor
,
util
# Taken from BioPython.
AMBIGUOUS_DNA_COMPLEMENT
=
{
'A'
:
'T'
,
'C'
:
'G'
,
'G'
:
'C'
,
'T'
:
'A'
,
'M'
:
'K'
,
'R'
:
'Y'
,
'W'
:
'W'
,
'S'
:
'S'
,
'Y'
:
'R'
,
'K'
:
'M'
,
'V'
:
'B'
,
'H'
:
'D'
,
'D'
:
'H'
,
'B'
:
'V'
,
'X'
:
'X'
,
'N'
:
'N'
}
AMBIGUOUS_RNA_COMPLEMENT
=
{
'A'
:
'U'
,
'C'
:
'G'
,
'G'
:
'C'
,
'U'
:
'A'
,
'M'
:
'K'
,
'R'
:
'Y'
,
'W'
:
'W'
,
'S'
:
'S'
,
'Y'
:
'R'
,
'K'
:
'M'
,
'V'
:
'B'
,
'H'
:
'D'
,
'D'
:
'H'
,
'B'
:
'V'
,
'X'
:
'X'
,
'N'
:
'N'
}
def
_make_translation_table
(
complement_mapping
):
before
=
list
(
complement_mapping
.
keys
())
before
+=
[
b
.
lower
()
for
b
in
before
]
after
=
list
(
complement_mapping
.
values
())
after
+=
[
b
.
lower
()
for
b
in
after
]
return
dict
((
ord
(
k
),
v
)
for
k
,
v
in
zip
(
before
,
after
))
_dna_complement_table
=
_make_translation_table
(
AMBIGUOUS_DNA_COMPLEMENT
)
_rna_complement_table
=
_make_translation_table
(
AMBIGUOUS_RNA_COMPLEMENT
)
def
reverse_complement
(
sequence
):
"""
Reverse complement of a sequence represented as unicode string.
"""
if
'U'
in
sequence
or
'u'
in
sequence
:
table
=
_rna_complement_table
else
:
table
=
_dna_complement_table
from
Bio.Seq
import
reverse_complement
return
''
.
join
(
reversed
(
sequence
.
translate
(
table
)))
from
.variant
import
(
ISeq
,
AISeq
,
ISeqList
,
AISeqList
,
DNAVar
,
ProteinVar
,
Allele
,
ProteinAllele
,
FS
)
from
.
import
extractor
,
util
def
roll
(
s
,
first
,
last
):
...
...
@@ -143,7 +84,7 @@ def palinsnoop(s):
is a 'palindrome'.
@rtype: int
"""
s_revcomp
=
reverse_complement
(
s
)
s_revcomp
=
reverse_complement
(
s
tr
(
s
))
# FIXME str inserted.
for
i
in
range
(
int
(
math
.
ceil
(
len
(
s
)
/
2.0
))):
if
s
[
i
]
!=
s_revcomp
[
i
]:
...
...
@@ -293,7 +234,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
start
=
var
.
reference_start
-
ins_length
+
1
,
end
=
var
.
reference_end
,
type
=
'dup'
,
shift
=
shift
,
sample_start
=
var
.
sample_start
+
1
,
sample_end
=
var
.
sample_end
,
inserted
=
ISeqList
([
ISeq
(
sequence
=
s2
[
inserted
=
A
ISeqList
([
A
ISeq
(
sequence
=
s2
[
var
.
sample_start
:
var
.
sample_end
],
weight_position
=
weight_position
)]),
weight_position
=
weight_position
)
...
...
@@ -301,7 +242,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return
ProteinVar
(
s1
=
s1
,
s2
=
s2
,
start
=
var
.
reference_start
,
end
=
var
.
reference_start
+
1
,
inserted
=
seq_list
or
ISeqList
([
ISeq
(
sequence
=
s2
[
var
.
sample_start
:
var
.
sample_end
],
A
ISeqList
([
A
ISeq
(
sequence
=
s2
[
var
.
sample_start
:
var
.
sample_end
],
weight_position
=
weight_position
)]),
type
=
'ins'
,
shift
=
shift
,
sample_start
=
var
.
sample_start
+
1
,
sample_end
=
var
.
sample_end
,
weight_position
=
weight_position
)
...
...
@@ -317,7 +258,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return
ProteinVar
(
s1
=
s1
,
s2
=
s2
,
start
=
var
.
reference_start
+
1
,
end
=
var
.
reference_end
,
type
=
'del'
,
shift
=
shift
,
sample_start
=
var
.
sample_start
,
sample_end
=
var
.
sample_end
+
1
,
deleted
=
ISeqList
([
ISeq
(
sequence
=
s1
[
deleted
=
A
ISeqList
([
A
ISeq
(
sequence
=
s1
[
var
.
reference_start
:
var
.
reference_end
],
weight_position
=
weight_position
)]),
weight_position
=
weight_position
)
...
...
@@ -328,19 +269,19 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return
ProteinVar
(
s1
=
s1
,
s2
=
s2
,
start
=
var
.
reference_start
+
1
,
end
=
var
.
reference_end
,
sample_start
=
var
.
sample_start
+
1
,
sample_end
=
var
.
sample_end
,
type
=
'subst'
,
deleted
=
ISeqList
([
ISeq
(
sequence
=
s1
[
var
.
reference_start
],
deleted
=
A
ISeqList
([
A
ISeq
(
sequence
=
s1
[
var
.
reference_start
],
weight_position
=
weight_position
)]),
inserted
=
ISeqList
([
ISeq
(
sequence
=
s2
[
var
.
sample_start
],
inserted
=
A
ISeqList
([
A
ISeq
(
sequence
=
s2
[
var
.
sample_start
],
weight_position
=
weight_position
)]),
weight_position
=
weight_position
)
# InDel.
return
ProteinVar
(
s1
=
s1
,
s2
=
s2
,
start
=
var
.
reference_start
+
1
,
end
=
var
.
reference_end
,
deleted
=
ISeqList
([
ISeq
(
sequence
=
s1
[
end
=
var
.
reference_end
,
deleted
=
A
ISeqList
([
A
ISeq
(
sequence
=
s1
[
var
.
reference_start
:
var
.
reference_end
],
weight_position
=
weight_position
)]),
inserted
=
seq_list
or
ISeqList
([
ISeq
(
sequence
=
s2
[
var
.
sample_start
:
var
.
sample_end
],
A
ISeqList
([
A
ISeq
(
sequence
=
s2
[
var
.
sample_start
:
var
.
sample_end
],
weight_position
=
weight_position
)]),
type
=
'delins'
,
sample_start
=
var
.
sample_start
+
1
,
sample_end
=
var
.
sample_end
,
weight_position
=
weight_position
)
...
...
@@ -364,13 +305,6 @@ def describe_dna(s1, s2):
s2_swig
[
0
],
s2_swig
[
1
],
extractor
.
TYPE_DNA
)
for
variant
in
extracted
.
variants
:
#print(variant.type, variant.reference_start,
# variant.reference_end, variant.sample_start,
# variant.sample_end, variant.transposition_start,
# variant.transposition_end)
#print(variant.type & extractor.TRANSPOSITION_OPEN, variant.type &
# extractor.TRANSPOSITION_CLOSE)
if
variant
.
type
&
extractor
.
TRANSPOSITION_OPEN
:
if
not
in_transposition
:
seq_list
=
ISeqList
()
...
...
@@ -405,43 +339,80 @@ def describe_dna(s1, s2):
return
description
def
describe_protein
(
s1
,
s2
):
def
print_var
(
variant
):
print
(
'({:3}, {:3}), ({:3}, {:3}), {:08b}, {}, {}'
.
format
(
variant
.
reference_start
,
variant
.
reference_end
,
variant
.
sample_start
,
variant
.
sample_end
,
variant
.
type
,
variant
.
type
,
variant
.
sample_end
-
variant
.
sample_start
))
def
get_frames
(
flags
):
result
=
[]
for
fs
in
FS
:
if
flags
&
FS
[
fs
]:
result
.
append
(
fs
)
return
result
def
describe_protein
(
s1
,
s2
,
codon_table
=
1
):
"""
"""
codons
=
'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF'
codons
=
util
.
codon_table_string
(
codon_table
)
description
=
ProteinAllele
()
annotation
=
FrameShiftAnnotationList
()
s1_swig
=
util
.
swig_str
(
s1
)
s2_swig
=
util
.
swig_str
(
s2
)
codons_swig
=
util
.
swig_str
(
codons
)
extracted
=
extractor
.
extract
(
s1_swig
[
0
],
s1_swig
[
1
],
s2_swig
[
0
],
s2_swig
[
1
],
extractor
.
TYPE_PROTEIN
,
codons_swig
[
0
])
variants
=
extracted
.
variants
for
variant
in
extracted
.
variants
:
if
(
variant
.
type
&
extractor
.
FRAME_SHIFT
and
(
variant
.
type
&
extractor
.
FRAME_SHIFT_1
or
variant
.
type
&
extractor
.
FRAME_SHIFT_2
)):
annotation
.
append
(
FrameShiftAnnotation
(
start
=
variant
.
reference_start
+
1
,
end
=
variant
.
reference_end
+
1
,
sample_start
=
variant
.
sample_start
+
1
,
sample_end
=
variant
.
sample_end
+
1
,
type
=
variant
.
type
))
#for variant in variants:
# print_var(variant)
#print()
for
variant
in
extracted
.
variants
:
if
(
not
variant
.
type
&
extractor
.
FRAME_SHIFT
and
not
variant
.
type
&
extractor
.
IDENTITY
):
var
=
var_to_protein_var
(
s1
,
s2
,
variant
,
index
=
0
while
index
<
len
(
variants
):
if
variants
[
index
].
type
!=
extractor
.
IDENTITY
:
variant
=
variants
[
index
]
index
+=
1
seq_list
=
AISeqList
()
# NOTE: This is for filling.
last_end
=
variants
[
index
].
reference_start
while
(
index
<
len
(
variants
)
and
variants
[
index
].
type
&
extractor
.
FRAME_SHIFT
):
if
last_end
!=
variants
[
index
].
sample_start
:
seq_list
.
append
(
AISeq
(
s2
[
last_end
:
variants
[
index
].
sample_start
]))
last_end
=
variants
[
index
].
sample_end
seq_list
.
append
(
AISeq
(
s2
[
variants
[
index
].
sample_start
:
variants
[
index
].
sample_end
],
start
=
variants
[
index
].
reference_start
+
1
,
end
=
variants
[
index
].
reference_end
,
sample_start
=
variants
[
index
].
sample_start
+
1
,
sample_end
=
variants
[
index
].
sample_end
,
frames
=
get_frames
(
variants
[
index
].
type
)))
# NOTE: Perhaps use trans_open, trans_close to ...
index
+=
1
if
last_end
!=
variant
.
sample_end
:
seq_list
.
append
(
AISeq
(
s2
[
last_end
:
variant
.
sample_end
]))
var
=
var_to_protein_var
(
s1
,
s2
,
variant
,
seq_list
,
weight_position
=
extracted
.
weight_position
)
description
.
append
(
var
)
if
description
[
-
1
].
type
==
'delins'
:
for
frame_shift
in
annotation
:
if
frame_shift
.
start
>=
description
[
-
1
].
start
:
description
[
-
1
].
is_frame_shift
=
True
else
:
index
+=
1
if
not
description
:
return
(
ProteinAllele
([
ProteinVar
()]),
FrameShiftAnnotationList
([
FrameShiftAnnotation
]))
return
description
,
annotation
return
ProteinAllele
([
ProteinVar
()])
return
description
extractor/util.py
View file @
0ad3392d
...
...
@@ -4,75 +4,38 @@ General utility definitions.
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
unicode_literals
)
import
sys
from
Bio.Data
import
CodonTable
from
Bio.Data.IUPACData
import
(
protein_letters_1to3
,
protein_letters_1to3_extended
)
from
Bio.SeqUtils
import
seq3
PY2
=
sys
.
version_info
[
0
]
==
2
# From BioPython.
protein_letters_1to3
=
{
'A'
:
'Ala'
,
'C'
:
'Cys'
,
'D'
:
'Asp'
,
'E'
:
'Glu'
,
'F'
:
'Phe'
,
'G'
:
'Gly'
,
'H'
:
'His'
,
'I'
:
'Ile'
,
'K'
:
'Lys'
,
'L'
:
'Leu'
,
'M'
:
'Met'
,
'N'
:
'Asn'
,
'P'
:
'Pro'
,
'Q'
:
'Gln'
,
'R'
:
'Arg'
,
'S'
:
'Ser'
,
'T'
:
'Thr'
,
'V'
:
'Val'
,
'W'
:
'Trp'
,
'Y'
:
'Tyr'
,
}
protein_letters_1to3_extended
=
dict
(
list
(
protein_letters_1to3
.
items
())
+
list
({
'B'
:
'Asx'
,
'X'
:
'Xaa'
,
'Z'
:
'Glx'
,
'J'
:
'Xle'
,
'U'
:
'Sel'
,
'O'
:
'Pyl'
,
}.
items
()))
# From BioPython.
def
seq3
(
seq
,
custom_map
=
{
'*'
:
'Ter'
},
undef_code
=
'Xaa'
):
"""Turn a one letter code protein sequence into one with three letter codes.
The single input argument 'seq' should be a protein sequence using single
letter codes, either as a python string or as a Seq or MutableSeq object.
This function returns the amino acid sequence as a string using the three
letter amino acid codes. Output follows the IUPAC standard (including
ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
Any unknown character (including possible gap characters), is changed into
'Xaa'.
e.g.
>>> from Bio.SeqUtils import seq3
>>> seq3("MAIVMGRWKGAR*")
'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
You can set a custom translation of the codon termination code using the
"custom_map" argument, e.g.
>>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'
You can also set a custom translation for non-amino acid characters, such
as '-', using the "undef_code" argument, e.g.
def
codon_table_string
(
table_id
):
"""
Return the codon table referenced by {table_id} in compressed form. The
result consists of a string of amino acids sorted by the codon that
translates to them. For example, the codon 'AAG' has position 3 in the
sorted list of codons, so its translation 'K' occurs in the third position
of the output.
>>> seq3("MAIVMGRWKGA--R*", undef_code='---')
'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'
:arg table_id: ID of a codon table.
:type table_id: int
If not given, "undef_code" defaults to "Xaa", e.g.
:returns: String representation of code table referenced by {table_id}.
:rtype: str
"""
codons
=
CodonTable
.
unambiguous_dna_by_id
[
table_id
].
forward_table
.
items
()
>>> seq3("MAIVMGRWKGA--R*")
'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'
codons
+=
map
(
lambda
x
:
(
x
,
'*'
),
CodonTable
.
unambiguous_dna_by_id
[
table_id
].
stop_codons
)
This function was inspired by BioPerl's seq3.
"""
# not doing .update() on IUPACData dict with custom_map dict
# to preserve its initial state (may be imported in other modules)
threecode
=
dict
(
list
(
protein_letters_1to3_extended
.
items
())
+
list
(
custom_map
.
items
()))
#We use a default of 'Xaa' for undefined letters
#Note this will map '-' to 'Xaa' which may be undesirable!
return
''
.
join
(
threecode
.
get
(
aa
,
undef_code
)
for
aa
in
str
(
seq
))
return
''
.
join
(
map
(
lambda
x
:
x
[
1
],
sorted
(
codons
)))
def
swig_str
(
s
,
ascii_only
=
True
):
...
...
extractor/variant.py
View file @
0ad3392d
...
...
@@ -3,8 +3,8 @@ Models for the description extractor.
"""
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
from
.
import
extractor
from
extractor.util
import
python_2_unicode_compatible
,
seq3
,
str
...
...
@@ -19,11 +19,11 @@ WEIGHTS = {
'delins'
:
extractor
.
WEIGHT_DELETION_INSERTION
}
FS
=
{
'
+
1'
:
extractor
.
FRAME_SHIFT_1
,
'
+
2'
:
extractor
.
FRAME_SHIFT_2
,
'1'
:
extractor
.
FRAME_SHIFT_1
,
'2'
:
extractor
.
FRAME_SHIFT_2
,
'inv'
:
extractor
.
FRAME_SHIFT_REVERSE
,
'inv
+1
'
:
extractor
.
FRAME_SHIFT_REVERSE_1
,
'inv
+2
'
:
extractor
.
FRAME_SHIFT_REVERSE_1
,
'
1
inv'
:
extractor
.
FRAME_SHIFT_REVERSE_1
,
'
2
inv'
:
extractor
.
FRAME_SHIFT_REVERSE_1
}
...
...
@@ -82,15 +82,18 @@ class ISeqList(HGVSList):
pass
class
FrameShiftAnnotationList
(
HGVSList
):
pass
class
AISeqList
(
ISeqList
):
def
get_sequence
(
self
):
return
''
.
join
(
map
(
lambda
x
:
x
.
sequence
,
self
.
items
))
@
python_2_unicode_compatible
class
ISeq
(
object
):
"""
Container for an inserted sequence.
"""
def
__init__
(
self
,
sequence
=
''
,
start
=
0
,
end
=
0
,
reverse
=
False
,
def
__init__
(
self
,
sequence
=
''
,
start
=
0
,
end
=
0
,
reverse
=
False
,
weight_position
=
1
):
"""
Initialise the class with the appropriate values.
...
...
@@ -122,10 +125,12 @@ class ISeq(object):
return
'{0}_{1}{2}'
.
format
(
self
.
start
,
self
.
end
,
inverted
)
# TODO: Is this still used?
def
__bool__
(
self
):
return
bool
(
self
.
sequence
)
# TODO: Is this still used?
def
__nonzero__
(
self
):
# Python 2.x compatibility.
return
self
.
__bool__
()
...
...
@@ -135,18 +140,61 @@ class ISeq(object):
return
len
(
self
.
sequence
)
*
extractor
.
WEIGHT_BASE
inverse_weight
=
WEIGHTS
[
'inv'
]
if
self
.
reverse
else
0
return
(
self
.
weight_position
*
2
+
extractor
.
WEIGHT_SEPARATOR
+
return
(
self
.
weight_position
*
2
+
extractor
.
WEIGHT_SEPARATOR
+
inverse_weight
)
@
python_2_unicode_compatible
class
AISeq
(
object
):
"""
Container for an annotated inserted sequence.
"""
def
__init__
(
self
,
sequence
=
''
,
start
=
0
,
end
=
0
,
sample_start
=
0
,
sample_end
=
0
,
frames
=
[],
weight_position
=
1
):
"""
Initialise the class with the appropriate values.
:arg unicode sequence: Literal inserted sequence.
:arg int start: Start position for a transposed sequence.
:arg int end: End position for a transposed sequence.
"""
self
.
sequence
=
sequence
self
.
start
=
start
self
.
end
=
end
self
.
sample_start
=
sample_start
self
.
sample_end
=
sample_end
self
.
weight_position
=
weight_position
self
.
frames
=
frames
self
.
type
=
'trans'
if
self
.
sequence
:
self
.
type
=
'ins'
if
self
.
frames
:
self
.
type
=
'fs'
def
__str__
(
self
):
if
self
.
type
==
'ins'
:
return
self
.
sequence
if
self
.
type
==
'trans'
:
return
'{}_{}'
.
format
(
self
.
start
,
self
.
end
)
return
'{}_{}{}|{}'
.
format
(
self
.
start
,
self
.
end
,
self
.
sequence
,
'|'
.
join
(
self
.
frames
))
@
python_2_unicode_compatible
class
DNAVar
(
object
):
"""
Container for a DNA variant.
"""
def
__init__
(
self
,
start
=
0
,
start_offset
=
0
,
end
=
0
,
end_offset
=
0
,
sample_start
=
0
,
sample_start_offset
=
0
,
sample_end
=
0
,
sample_end_offset
=
0
,
type
=
'none'
,
deleted
=
ISeqList
([
ISeq
()]),
def
__init__
(
self
,
start
=
0
,
start_offset
=
0
,
end
=
0
,
end_offset
=
0
,
sample_start
=
0
,
sample_start_offset
=
0
,
sample_end
=
0
,
sample_end_offset
=
0
,
type
=
'none'
,
deleted
=
ISeqList
([
ISeq
()]),
inserted
=
ISeqList
([
ISeq
()]),
shift
=
0
,
weight_position
=
1
):
"""
Initialise the class with the appropriate values.
...
...
@@ -227,9 +275,10 @@ class ProteinVar(object):
Container for a protein variant.
"""
def
__init__
(
self
,
s1
=
''
,
s2
=
''
,
start
=
0
,
end
=
0
,
sample_start
=
0
,
sample_end
=
0
,
type
=
'none'
,
deleted
=
ISeqList
([
ISeq
()]),
inserted
=
ISeqList
([
ISeq
()]),
shift
=
0
,
term
=
0
,
weight_position
=
1
):
def
__init__
(
self
,
s1
=
''
,
s2
=
''
,
start
=
0
,
end
=
0
,
sample_start
=
0
,
sample_end
=
0
,
type
=
'none'
,
deleted
=
ISeqList
([
ISeq
()]),
inserted
=
AISeqList
([
AISeq
()]),
shift
=
0
,
term
=
0
,
weight_position
=
1
):
"""
Initialise the class with the appropriate values.
...
...
@@ -255,10 +304,10 @@ class ProteinVar(object):
self
.
sample_end_aa
=
s2
[
sample_end
-
1
]
self
.
type
=
type
self
.
deleted
=
deleted
self
.
inserted
=
inserted
self
.
inserted
=
ISeqList
([
ISeq
(
inserted
.
get_sequence
())])
self
.
annotated_inserted
=
inserted
self
.
shift
=
shift
self
.
term
=
term
self
.
is_frame_shift
=
False
def
__str__
(
self
):
...
...
@@ -268,15 +317,16 @@ class ProteinVar(object):
:returns unicode: The HGVS description of the raw variant stored in
this class.
"""
# TODO: ext*
if
self
.
type
==
'unknown'
:
return
'?'
if
self
.
type
==
'none'
:
return
'='
description
=
'{}{}'
.
format
(
seq3
(
self
.
start_aa
),
self
.
start
)
if
self
.
is_frame_shift
:
if
self
.
term
:
return
description
+
'{}fs*{}'
.
format
(
seq3
(
self
.
inserted
[
0
].
sequence
[
0
]),
self
.
end
-
self
.
start
+
2
)
seq3
(
self
.
inserted
[
0
].
sequence
[
0
]),
self
.
term
)
if
self
.
start
!=
self
.
end
:
description
+=
'_{}{}'
.
format
(
seq3
(
self
.
end_aa
),
self
.
end
)
...
...
@@ -284,7 +334,7 @@ class ProteinVar(object):
description
+=
self
.
type
if
self
.
type
in
(
'ins'
,
'delins'
):
return
description
+
seq3
(
self
.
inserted
)
return
description
+
seq3
(
str
(
self
.
inserted
)
)
# FIXME: str
return
description
return
description
+
seq3
(
self
.
inserted
)
...
...
@@ -305,9 +355,10 @@ class ProteinVar(object):
description
+=
self
.
type
if
self
.
type
in
(
'ins'
,
'delins'
):
return
description
+
str
(
self
.
inserted
)
return
description
+
str
(
self
.
annotated_
inserted
)
return
description
return
description
+
'{}>{}'
.
format
(
self
.
deleted
,
self
.
inserted
)
return
description
+
'{}>{}'
.
format
(
self
.
deleted
,
self
.
annotated_inserted
)
@
python_2_unicode_compatible
...
...
@@ -315,11 +366,12 @@ class FrameShiftAnnotation(object):
"""
Container for frame shift annotation.
"""
def
__init__
(
self
,
start
=
0
,
end
=
0
,
sample_start
=
0
,
sample_end
=
0
,
def
__init__
(
self
,
s2
=
''
,
start
=
0
,
end
=
0
,
sample_start
=
0
,
sample_end
=
0
,
type
=
'none'
):
"""
Initialise the class with the appropriate values.
:arg unicode s2: Sample sequence.
:arg int start: Start position.
:arg int end: End position.
:arg int sample_start: Start position.
...
...
@@ -330,6 +382,7 @@ class FrameShiftAnnotation(object):
self
.
end
=
end
self
.
sample_start
=
sample_start
self
.
sample_end
=
sample_end
self
.
seq
=
s2
[
sample_start
-
1
:
sample_end
]
for
fs_type
in
FS
:
if
FS
[
fs_type
]
&
type
:
self
.
type
=
fs_type
...
...
@@ -338,4 +391,4 @@ class FrameShiftAnnotation(object):
def
__str__
(
self
):
"""
"""
return
'{}_{}
fs
{}'
.
format
(
self
.
start
,
self
.
end
,
self
.
type
)
return
'{}_{}
{}|
{}'
.
format
(
self
.
start
,
self
.
end
,
self
.
seq
,
self
.
type
)
requirements.txt
View file @
0ad3392d
pytest
==2.7.0
biopython
==1.65
setup.py
View file @
0ad3392d
...
...
@@ -74,8 +74,10 @@ setup(
'Programming Language :: Python :: 3'
,
'Programming Language :: Python :: 3.3'
,
'Programming Language :: Python :: 3.4'
,
'Programming Language :: Python :: 3.5'
,
'Programming Language :: C++'
,
'Topic :: Scientific/Engineering'
,
],
keywords
=
'bioinformatics'
keywords
=
'bioinformatics'
,
install_requires
=
[
'biopython==1.65'
]
)
test.py
0 → 100644
View file @
0ad3392d
#!/usr/bin/env python
from
__future__
import
unicode_literals
import
monoseq
from
extractor
import
describe
#ref = 'MAVLWRLSAVCGALGGRALLLRTPVVRPAHISAFLQDRPIPEWCGVQHIHLSPSHHSGSKAASLHWTSERVVSVLLLGLLPAAYLNPCSAMDYSLAAALTLHGHWGLGQVVTDYVHGDALQKAAKAGLLALSALTFAGLCYFNYHDVGICKAVAMLWKL*'
#alt = 'MAVLWRLSAVCGAPTARDRRPSSVASNSSGQTCSYLSISSGPTYPRMVWSAAHTLVTEPPFWLQGCISPLD*'
#ref = 'MDYSLAAALTLHGH'
#alt = 'MTIPWRSPHFHGH'
alt
=
'TCCTGGCATCAGTTACTGTGTTGACTCACTCAGTGTTGGGATCACTCACTTTCCCCCTACAGGACTCAGATCTGGGAGGCAATTACCTTCGGAGAAAAACGAATAGGAAAAACTGAAGTGTTACTTTTTTTAAAGCTGCTGAAGTTTGTTGGTTTCTCATTGTTTTTAAGCCTACTGGAGCAATAAAGTTTGAAGAACTTTTACCAGGTTTTTTTTATCGCTGCCTTGATATACACTTTTCAAAATGCTTTGGTGGGAAGAAGTAGAGGACTGTTATGAAAGAGAAGATGTTCAAAAGAAAACATTCACAAAATGGGTAAATGCACAATTTTCTAAGTTTGGGAAGCAGCATATTGAGAACCTCTTCAGTGACCTACAGGATGGGAGGCGCCTCCTAGACCTCCTCGAAGGCCTGACAGGGCAAAAACTGCCAAAAGAAAAAGGATCCACAAGAGTTCATGCCCTGAACAATGTCAACAAGGCACTGCGGGTTTTGCAGAACAATAATGTTGATTTAGTGAATATTGGAAGTACTGACATCGTAGATGGAAATCATAAACTGACTCTTGGTTTGATTTGGAATATAATCCTCCACTGGCAGGTCAAAAATGTAATGAAAAATATCATGGCTGGATTGCAACAAACCAACAGTGAAAAGATTCTCCTGAGCTGGGTCCGACAATCAACTCGTAATTATCCACAGGTTAATGTAATCAACTTCACCACCAGCTGGTCTGATGGCCTGGCTTTGAATGCTCTCATCCATAGTCATAGGCCAGACCTATTTGACTGGAATAGTGTGGTTTGCCAGCAGTCAGCCACACAACGACTGGAACATGCATTCAACATCGCCAGATATCAATTAGGCATAGAGAAACTACTCGATCCTGAAGATGTTGATACCACCTATCCAGATAAGAAGTCCATCTTAATGTACATCACATCACTCTTCCAAGTTTTGCCTCAACAAGTGAGCATTGAAGCCATCCAGGAAGTGGAAATGTTGCCAAGGCCACCTAAAGTGACTAAAGAAGAACATTTTCAGTTACATCATCAAATGCACTATTCTCAACAGATCACGGTCAGTCTAGCACAGGGATATGAGAGAACTTCTTCCCCTAAGCCTCGATTCAAGAGCTATGCCTACACACAGGCTGCTTATGTCACCACCTCTGACCCTACACGGAGCCCATTTCCTTCACAGCATTTGGAAGCTCCTGAAGACAAGTCATTTGGCAGTTCATTGATGGAGAGTGAAGTAAACCTGGACCGTTATCAAACAGCTTTAGAAGAAGTATTATCGTGGCTTCTTTCTGCTGAGGACACATTGCAAGCACAAGGAGAGATTTCTAATGATGTGGAAGTGGTGAAAGACCAGTTTCATACTCATGAGGGGTACATGATGGATTTGACAGCCCATCAGGGCCGGGTTGGTAATATTCTACAATTGGGAAGTAAGCTGATTGGAACAGGAAAATTATCAGAAGATGAAGAAACTGAAGTACAAGAGCAGATGAATCTCCTAAATTCAAGATGGGAATGCCTCAGGGTAGCTAGCATGGAAAAACAAAGCAATTTACATAGAGTTTTAATGGATCTCCAGAATCAGAAACTGAAAGAGTTGAATGACTGGCTAACAAAAACAGAAGAAAGAACAAGGAAAATGGAGGAAGAGCCTCTTGGACCTGATCTTGAAGACCTAAAACGCCAAGTACAACAACATAAGGTGCTTCAAGAAGATCTAGAACAAGAACAAGTCAGGGTCAATTCTCTCACTCACATGGTGGTGGTAGTTGATGAATCTAGTGGAGATCACGCAACTGCTGCTTTGGAAGAACAACTTAAGGTATTGGGAGATCGATGGGCAAACATCTGTAGATGGACAGAAGACCGCTGGGTTCTTTTACAAGACATCCTTCTCAAATGGCAACGTCTTACTGAAGAACAGTGCCTTTTTAGTGCATGGCTTTCAGAAAAAGAAGATGCAGTGAACAAGATT
CACACAACTGGCTTTAAAGATCAAAATGAAATGTTATCAAGTCTTCAAAAACTGGCCGTTTTAAAAGCGGATCTAGAAAAGAAAAAGCAATCCATGGGCAAACTGTATTCACTCAAACAAGATCTTCTTTCAACACTGAAGAATAAGTCAGTGACCCAGAAGACGGAAGCATGGCTGGATAACTTTGCCCGGTGTTGGGATAATTTAGTCCAAAAACTTGAAAAGAGTACAGCACAGATTTCACAGGCTGTCACCACCACTCAGCCATCACTAACACAGACAACTGTAATGGAAACAGTAACTACGGTGACCACAAGGGAACAGATCCTGGTAAAGCATGCTCAAGAGGAACTTCCACCACCACCTCCCCAAAAGAAGAGGCAGATTACTGTGGATTCTGAAATTAGGAAAAGGTTGGATGTTGATATAACTGAACTTCACAGCTGGATTACTCGCTCAGAAGCTGTGTTGCAGAGTCCTGAATTTGCAATCTTTCGGAAGGAAGGCAACTTCTCAGACTTAAAAGAAAAAGTCAATGCCATAGAGCGAGAAAAAGCTGAGAAGTTCAGAAAACTGCAAGATGCCAGCAGATCAGCTCAGGCCCTGGTGGAACAGATGGTGAATGAGGGTGTTAATGCAGATAGCATCAAACAAGCCTCAGAACAACTGAACAGCCGGTGGATCGAATTCTGCCAGTTGCTAAGTGAGAGACTTAACTGGCTGGAGTATCAGAACAACATCATCGCTTTCTATAATCAGCTACAACAATTGGAGCAGATGACAACTACTGCTGAAAACTGGTTGAAAATCCAACCCACCACCCCATCAGAGCCAACAGCAATTAAAAGTCAGTTAAAAATTTGTAAGGATGAAGTCAACCGGCTATCAGGTCTTCAACCTCAAATTGAACGATTAAAAATTCAAAGCATAGCCCTGAAAGAGAAAGGACAAGGACCCATGTTCCTGGATGCAGACTTTGTGGCCTTTACAAATCATTTTAAGCAAGTCTTTTCTGATGTGCAGGCCAGAGAGAAAGAGCTACAGACAATTTTTGACACTTTGCCACCAATGCGCTATCAGGAGACCATGAGTGCCATCAGGACATGGGTCCAGCAGTCAGAAACCAAACTCTCCATACCTCAACTTAGTGTCACCGACTATGAAATCATGGAGCAGAGACTCGGGGAATTGCAGGCTTTACAAAGTTCTCTGCAAGAGCAACAAAGTGGCCTATACTATCTCAGCACCACTGTGAAAGAGATGTCGAAGAAAGCGCCCTCTGAAATTAGCCGGAAATATCAATCAGAATTTGAAGAAATTGAGGGACGCTGGAAGAAGCTCTCCTCCCAGCTGGTTGAGCATTGTCAAAAGCTAGAGGAGCAAATGAATAAACTCCGAAAAATTCAGAATCACATACAAACCCTGAAGAAATGGATGGCTGAAGTTGATGTTTTTCTGAAGGAGGAATGGCCTGCCCTTGGGGATTCAGAAATTCTAAAAAAGCAGCTGAAACAGTGCAGACTTTTAGTCAGTGATATTCAGACAATTCAGCCCAGTCTAAACAGTGTCAATGAAGGTGGGCAGAAGATAAAGAATGAAGCAGAGCCAGAGTTTGCTTCGAGACTTGAGACAGAACTCAAAGAACTTAACACTCAGTGGGATCACATGTGCCAACAGGTCTATGCCAGAAAGGAGGCCTTGAAGGGAGGTTTGGAGAAAACTGTAAGCCTCCAGAAAGATCTATCAGAGATGCACGAATGGATGACACAAGCTGAAGAAGAGTATCTTGAGAGAGATTTTGAATATAAAACTCCAGATGAATTACAGAAAGCAGTTGAAGAGATGAAGAGAGCTAAAGAAGAGGCCCAACAAAAAGAAGCGAAAGTGAAACTCCTTACTGAGTCTGTAAATAGTGTCATAGCTCAAGCTCCACCTGTAGCACAAGAGGCCTTAAAAAAGGAACTTGAAACTCTAACCACCAACTACCAGTGGCTCTGCAC
TAGGCTGAATGGGAAATGCAAGACTTTGGAAGAAGTTTGGGCATGTTGGCATGAGTTATTGTCATACTTGGAGAAAGCAAACAAGTGGCTAAATGAAGTAGAATTTAAACTTAAAACCACTGAAAACATTCCTGGCGGAGCTGAGGAAATCTCTGAGGTGCTAGATTCACTTGAAAATTTGATGCGACATTCAGAGGATAACCCAAATCAGATTCGCATATTGGCACAGACCCTAACAGATGGCGGAGTCATGGATGAGCTAATCAATGAGGAACTTGAGACATTTAATTCTCGTTGGAGGGAACTACATGAAGAGGCTGTAAGGAGGCAAAAGTTGCTTGAACAGAGCATCCAGTCTGCCCAGGAGACTGAAAAATCCTTACACTTAATCCAGGAGTCCCTCACATTCATTGACAAGCAGTTGGCAGCTTATATTGCAGACAAGGTGGACGCAGCTCAAATGCCTCAGGAAGCCCAGAAAATCCAATCTGATTTGACAAGTCATGAGATCAGTTTAGAAGAAATGAAGAAACATAATCAGGGGAAGGAGGCTGCCCAAAGAGTCCTGTCTCAGATTGATGTTGCACAGAAAAAATTACAAGATGTCTCCATGAAGTTTCGATTATTCCAGAAACCAGCCAATTTTGAGCAGCGTCTACAAGAAAGTAAGATGATTTTAGATGAAGTGAAGATGCACTTGCCTGCATTGGAAACAAAGAGTGTGGAACAGGAAGTAGTACAGTCACAGCTAAATCATTGTGTGAACTTGTATAAAAGTCTGAGTGAAGTGAAGTCTGAAGTGGAAATGGTGATAAAGACTGGACGTCAGATTGTACAGAAAAAGCAGACGGAAAATCCCAAAGAACTTGATGAAAGAGTAACAGCTTTGAAATTGCATTATAATGAGCTGGGAGCAAAGGTAACAGAAAGAAAGCAACAGTTGGAGAAATGCTTGAAATTGTCCCGTAAGATGCGAAAGGAAATGAATGTCTTGACAGAATGGCTGGCAGCTACAGATATGGAATTGACAAAGAGATCAGCAGTTGAAGGAATGCCTAGTAATTTGGATTCTGAAGTTGCCTGGGGAAAGGCTACTCAAAAAGAGATTGAGAAACAGAAGGTGCACCTGAAGAGTATCACAGAGGTAGGAGAGGCCTTGAAAACAGTTTTGGGCAAGAAGGAGACGTTGGTGGAAGATAAACTCAGTCTTCTGAATAGTAACTGGATAGCTGTCACCTCCCGAGCAGAAGAGTGGTTAAATCTTTTGTTGGAATACCAGAAACACATGGAAACTTTTGACCAGAATGTGGACCACATCACAAAGTGGATCATTCAGGCTGACACACTTTTGGATGAATCAGAGAAAAAGAAACCCCAGCAAAAAGAAGACGTGCTTAAGCGTTTAAAGGCAGAACTGAATGACATACGCCCAAAGGTGGACTCTACACGTGACCAAGCAGCAAACTTGATGGCAAACCGCGGTGACCACTGCAGGAAATTAGTAGAGCCCCAAATCTCAGAGCTCAACCATCGATTTGCAGCCATTTCACACAGAATTAAGACTGGAAAGGCCTCCATTCCTTTGAAGGAATTGGAGCAGTTTAACTCAGATATACAAAAATTGCTTGAACCACTGGAGGCTGAAATTCAGCAGGGGGTGAATCTGAAAGAGGAAGACTTCAATAAAGATATGAATGAAGACAATGAGGGTACTGTAAAAGAATTGTTGCAAAGAGGAGACAACTTACAACAAAGAATCACAGATGAGAGAAAGCGAGAGGAAATAAAGATAAAACAGCAGCTGTTACAGACAAAACATAATGCTCTCAAGGATTTGAGGTCTCAAAGAAGAAAAAAGGCTCTAGAAATTTCTCATCAGTGGTATCAGTACAAGAGGCAGGCTGATGATCTCCTGAAATGCTTGGATGACATTGAAAAAAAATTAGCCAGCCTACCTGAGCCCAGAGATGAAAGGAAAATAAAGGAAATTGATCGGGAAT
TGCAGAAGAAGAAAGAGGAGCTGAATGCAGTGCGTAGGCAAGCTGAGGGCTTGTCTGAGGATGGGGCCGCAATGGCAGTGGAGCCAACTCAGATCCAGCTCAGCAAGCGCTGGCGGGAAATTGAGAGCAAATTTGCTCAGTTTCGAAGACTCAACTTTGCACAAATTCACACTGTCCGTGAAGAAACGATGATGGTGATGACTGAAGACATGCCTTTGGAAATTTCTTATGTGCCTTCTACTTATTTGACTGAAATCACTCATGTCTCACAAGCCCTATTAGAAGTGGAACAACTTCTCAATGCTCCTGACCTCTGTGCTAAGGACTTTGAAGATCTCTTTAAGCAAGAGGAGTCTCTGAAGAATATAAAAGATAGTCTACAACAAAGCTCAGGTCGGATTGACATTATTCATAGCAAGAAGACAGCAGCATTGCAAAGTGCAACGCCTGTGGAAAGGGTGAAGCTACAGGAAGCTCTCTCCCAGCTTGATTTCCAATGGGAAAAAGTTAACAAAATGTACAAGGACCGACAAGGGCGATTTGACAGATCTGTTGAGAAATGGCGGCGTTTTCATTATGATATAAAGATATTTAATCAGTGGCTAACAGAAGCTGAACAGTTTCTCAGAAAGACACAAATTCCTGAGAATTGGGAACATGCTAAATACAAATGGTATCTTAAGGAACTCCAGGATGGCATTGGGCAGCGGCAAACTGTTGTCAGAACATTGAATGCAACTGGGGAAGAAATAATTCAGCAATCCTCAAAAACAGATGCCAGTATTCTACAGGAAAAATTGGGAAGCCTGAATCTGCGGTGGCAGGAGGTCTGCAAACAGCTGTCAGACAGAAAAAAGAGGCTAGAAGAACAAAAGAATATCTTGTCAGAATTTCAAAGAGATTTAAATGAATTTGTTTTATGGTTGGAGGAAGCAGATAACATTGCTAGTATCCCACTTGAACCTGGAAAAGAGCAGCAACTAAAAGAAAAGCTTGAGCAAGTCAAGTTACTGGTGGAAGAGTTGCCCCTGCGCCAGGGAATTCTCAAACAATTAAATGAAACTGGAGGACCCGTGCTTGTAAGTGCTCCCATAAGCCCAGAAGAGCAAGATAAACTTGAAAATAAGCTCAAGCAGACAAATCTCCAGTGGATAAAGGTTTCCAGAGCTTTACCTGAGAAACAAGGAGAAATTGAAGCTCAAATAAAAGACCTTGGGCAGCTTGAAAAAAAGCTTGAAGACCTTGAAGAGCAGTTAAATCATCTGCTGCTGTGGTTATCTCCTATTAGGAATCAGTTGGAAATTTATAACCAACCAAACCAAGAAGGACCATTTGACGTTCAGGAAACTGAAATAGCAGTTCAAGCTAAACAACCGGATGTGGAAGAGATTTTGTCTAAAGGGCAGCATTTGTACAAGGAAAAACCAGCCACTCAGCCAGTGAAGAGGAAGTTAGAAGATCTGAGCTCTGAGTGGAAGGCGGTAAACCGTTTACTTCAAGAGCTGAGGGCAAAGCAGCCTGACCTAGCTCCTGGACTGACCACTATTGGAGCCTCTCCTACTCAGACTGTTACTCTGGTGACACAACCTGTGGTTACTAAGGAAACTGCCATCTCCAAACTAGAAATGCCATCTTCCTTGATGTTGGAGGTACCTGCTCTGGCAGATTTCAACCGGGCTTGGACAGAACTTACCGACTGGCTTTCTCTGCTTGATCAAGTTATAAAATCACAGAGGGTGATGGTGGGTGACCTTGAGGATATCAACGAGATGATCATCAAGCAGAAGGCAACAATGCAGGATTTGGAACAGAGGCGTCCCCAGTTGGAAGAACTCATTACCGCTGCCCAAAATTTGAAAAACAAGACCAGCAATCAAGAGGCTAGAACAATCATTACGGATCGAATTGAAAGAATTCAGAATCAGTGGGATGAAGTACAAGAACACCTTCAGAACCGGAGGCAACAGTTGAATGAAATGTTAAAGGATTCAACACAATGG
CTGGAAGCTAAGGAAGAAGCTGAGCAGGTCTTAGGACAGGCCAGAGCCAAGCTTGAGTCATGGAAGGAGGGTCCCTATACAGTAGATGCAATCCAAAAGAAAATCACAGAAACCAAGCAGTTGGCCAAAGACCTCCGCCAGTGGCAGACAAATGTAGATGTGGCAAATGACTTGGCCCTGAAACTTCTCCGGGATTATTCTGCAGATGATACCAGAAAAGTCCACATGATAACAGAGAATATCAATGCCTCTTGGAGAAGCATTCATAA