Commit 0ad3392d authored by jkvis's avatar jkvis
Browse files

Merge pull request #11 from mutalyzer/describe_protein

Describe protein
parents 674bf74a b2de445d
......@@ -6,6 +6,7 @@ python:
- "2.7"
- "3.3"
- "3.4"
- "3.5"
addons:
apt_packages: swig
before_install: pip install pytest
......
......@@ -5,74 +5,15 @@ other.
from __future__ import (absolute_import, division, print_function,
unicode_literals)
unicode_literals)
import math
from .variant import (ISeq, ISeqList, DNAVar, ProteinVar, Allele,
ProteinAllele, FrameShiftAnnotationList, FrameShiftAnnotation)
from . import extractor, util
# Taken from BioPython.
AMBIGUOUS_DNA_COMPLEMENT = {
'A': 'T',
'C': 'G',
'G': 'C',
'T': 'A',
'M': 'K',
'R': 'Y',
'W': 'W',
'S': 'S',
'Y': 'R',
'K': 'M',
'V': 'B',
'H': 'D',
'D': 'H',
'B': 'V',
'X': 'X',
'N': 'N'}
AMBIGUOUS_RNA_COMPLEMENT = {
'A': 'U',
'C': 'G',
'G': 'C',
'U': 'A',
'M': 'K',
'R': 'Y',
'W': 'W',
'S': 'S',
'Y': 'R',
'K': 'M',
'V': 'B',
'H': 'D',
'D': 'H',
'B': 'V',
'X': 'X',
'N': 'N'}
def _make_translation_table(complement_mapping):
before = list(complement_mapping.keys())
before += [b.lower() for b in before]
after = list(complement_mapping.values())
after += [b.lower() for b in after]
return dict((ord(k), v) for k, v in zip(before, after))
_dna_complement_table = _make_translation_table(AMBIGUOUS_DNA_COMPLEMENT)
_rna_complement_table = _make_translation_table(AMBIGUOUS_RNA_COMPLEMENT)
def reverse_complement(sequence):
"""
Reverse complement of a sequence represented as unicode string.

The RNA complement table is used when the sequence contains 'U' or 'u';
otherwise the DNA complement table is used.
"""
if 'U' in sequence or 'u' in sequence:
table = _rna_complement_table
else:
table = _dna_complement_table
# NOTE(review): this import binds the name `reverse_complement`, but the
# return statement below does not use it. It looks like a stray line from
# the change that swaps this helper for Bio.Seq.reverse_complement; confirm
# and move it to module level or drop it.
from Bio.Seq import reverse_complement
# Complement every symbol first, then reverse the complemented string.
return ''.join(reversed(sequence.translate(table)))
from .variant import (ISeq, AISeq, ISeqList, AISeqList, DNAVar, ProteinVar,
Allele, ProteinAllele, FS)
from . import extractor, util
def roll(s, first, last):
......@@ -143,7 +84,7 @@ def palinsnoop(s):
is a 'palindrome'.
@rtype: int
"""
s_revcomp = reverse_complement(s)
s_revcomp = reverse_complement(str(s)) # FIXME str inserted.
for i in range(int(math.ceil(len(s) / 2.0))):
if s[i] != s_revcomp[i]:
......@@ -293,7 +234,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
start=var.reference_start - ins_length + 1,
end=var.reference_end, type='dup', shift=shift,
sample_start=var.sample_start + 1, sample_end=var.sample_end,
inserted=ISeqList([ISeq(sequence=s2[
inserted=AISeqList([AISeq(sequence=s2[
var.sample_start:var.sample_end],
weight_position=weight_position)]),
weight_position=weight_position)
......@@ -301,7 +242,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return ProteinVar(s1=s1, s2=s2, start=var.reference_start,
end=var.reference_start + 1,
inserted=seq_list or
ISeqList([ISeq(sequence=s2[var.sample_start:var.sample_end],
AISeqList([AISeq(sequence=s2[var.sample_start:var.sample_end],
weight_position=weight_position)]),
type='ins', shift=shift, sample_start=var.sample_start + 1,
sample_end=var.sample_end, weight_position=weight_position)
......@@ -317,7 +258,7 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return ProteinVar(s1=s1, s2=s2, start=var.reference_start + 1,
end=var.reference_end, type='del', shift=shift,
sample_start=var.sample_start, sample_end=var.sample_end + 1,
deleted=ISeqList([ISeq(sequence=s1[
deleted=AISeqList([AISeq(sequence=s1[
var.reference_start:var.reference_end],
weight_position=weight_position)]),
weight_position=weight_position)
......@@ -328,19 +269,19 @@ def var_to_protein_var(s1, s2, var, seq_list=[], weight_position=1):
return ProteinVar(s1=s1, s2=s2, start=var.reference_start + 1,
end=var.reference_end, sample_start=var.sample_start + 1,
sample_end=var.sample_end, type='subst',
deleted=ISeqList([ISeq(sequence=s1[var.reference_start],
deleted=AISeqList([AISeq(sequence=s1[var.reference_start],
weight_position=weight_position)]),
inserted=ISeqList([ISeq(sequence=s2[var.sample_start],
inserted=AISeqList([AISeq(sequence=s2[var.sample_start],
weight_position=weight_position)]),
weight_position=weight_position)
# InDel.
return ProteinVar(s1=s1, s2=s2, start=var.reference_start + 1,
end=var.reference_end, deleted=ISeqList([ISeq(sequence=s1[
end=var.reference_end, deleted=AISeqList([AISeq(sequence=s1[
var.reference_start:var.reference_end],
weight_position=weight_position)]),
inserted=seq_list or
ISeqList([ISeq(sequence=s2[var.sample_start:var.sample_end],
AISeqList([AISeq(sequence=s2[var.sample_start:var.sample_end],
weight_position=weight_position)]),
type='delins', sample_start=var.sample_start + 1,
sample_end=var.sample_end, weight_position=weight_position)
......@@ -364,13 +305,6 @@ def describe_dna(s1, s2):
s2_swig[0], s2_swig[1], extractor.TYPE_DNA)
for variant in extracted.variants:
#print(variant.type, variant.reference_start,
# variant.reference_end, variant.sample_start,
# variant.sample_end, variant.transposition_start,
# variant.transposition_end)
#print(variant.type & extractor.TRANSPOSITION_OPEN, variant.type &
# extractor.TRANSPOSITION_CLOSE)
if variant.type & extractor.TRANSPOSITION_OPEN:
if not in_transposition:
seq_list = ISeqList()
......@@ -405,43 +339,80 @@ def describe_dna(s1, s2):
return description
def describe_protein(s1, s2):
def print_var(variant):
    """
    Print a one-line debugging summary of an extracted variant.

    The line shows the reference range, the sample range, the type as an
    8-digit binary number, the type as a plain number and the length of the
    sample range.

    :arg variant: Object with `reference_start`, `reference_end`,
        `sample_start`, `sample_end` and `type` attributes.
    """
    template = '({:3}, {:3}), ({:3}, {:3}), {:08b}, {}, {}'
    sample_length = variant.sample_end - variant.sample_start
    fields = (
        variant.reference_start, variant.reference_end,
        variant.sample_start, variant.sample_end,
        # The type is shown twice: once in binary, once as-is.
        variant.type, variant.type,
        sample_length)
    print(template.format(*fields))
def get_frames(flags):
    """
    List the frame shift labels whose flag is set in {flags}.

    :arg int flags: Bit field that is tested against every value in `FS`.

    :returns list: Keys of `FS` whose value shares at least one bit with
        {flags}, in the iteration order of `FS`.
    """
    return [label for label, mask in FS.items() if flags & mask]
def describe_protein(s1, s2, codon_table=1):
"""
Describe the differences between two protein sequences.

Both sequences are handed to `extractor.extract` with
`extractor.TYPE_PROTEIN`, together with a codon string, and the extracted
variants are converted with `var_to_protein_var`.

NOTE(review): this span appears to interleave lines from two revisions of
the function (an annotation-list based one and an index-walking one), and
the original indentation is lost. The notes below flag what needs to be
confirmed; no code has been altered.

:arg s1: Reference sequence.
:arg s2: Sample sequence.
:arg codon_table: Passed to `util.codon_table_string` to obtain the codon
    string.
"""
# NOTE(review): `codons` is assigned twice in a row; only the second value
# (looked up via {codon_table}) reaches `extractor.extract`.
codons = 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF'
codons = util.codon_table_string(codon_table)
description = ProteinAllele()
annotation = FrameShiftAnnotationList()
s1_swig = util.swig_str(s1)
s2_swig = util.swig_str(s2)
codons_swig = util.swig_str(codons)
extracted = extractor.extract(s1_swig[0], s1_swig[1],
s2_swig[0], s2_swig[1], extractor.TYPE_PROTEIN, codons_swig[0])
variants = extracted.variants
# Collect an annotation for every variant that carries a frame shift flag.
for variant in extracted.variants:
if (variant.type & extractor.FRAME_SHIFT and
(variant.type & extractor.FRAME_SHIFT_1 or variant.type &
extractor.FRAME_SHIFT_2)):
annotation.append(FrameShiftAnnotation(
start=variant.reference_start + 1,
end=variant.reference_end + 1,
sample_start=variant.sample_start + 1,
sample_end=variant.sample_end + 1, type=variant.type))
#for variant in variants:
# print_var(variant)
#print()
for variant in extracted.variants:
if (not variant.type & extractor.FRAME_SHIFT and not
variant.type & extractor.IDENTITY):
var = var_to_protein_var(s1, s2, variant,
index = 0
while index < len(variants):
if variants[index].type != extractor.IDENTITY:
variant = variants[index]
index += 1
seq_list = AISeqList()
# NOTE: This is for filling.
# NOTE(review): `index` was incremented just above without a bounds check;
# if the current variant is the last one, this lookup raises IndexError.
# It also seeds `last_end` with a reference coordinate while every later
# comparison uses sample coordinates; confirm which is intended.
last_end = variants[index].reference_start
# Consume the run of frame shift variants that follows the current variant.
while (index < len(variants) and
variants[index].type & extractor.FRAME_SHIFT):
# Fill the gap between the previous piece and this frame shift with the
# literal sample sequence.
if last_end != variants[index].sample_start:
seq_list.append(AISeq(
s2[last_end:variants[index].sample_start]))
last_end = variants[index].sample_end
seq_list.append(AISeq(
s2[variants[index].sample_start:
variants[index].sample_end],
start=variants[index].reference_start + 1,
end=variants[index].reference_end,
sample_start=variants[index].sample_start + 1,
sample_end=variants[index].sample_end,
frames=get_frames(variants[index].type)))
# NOTE: Perhaps use trans_open, trans_close to ...
index += 1
# Fill whatever remains up to the end of the current variant's sample range.
if last_end != variant.sample_end:
seq_list.append(AISeq(s2[last_end:variant.sample_end]))
var = var_to_protein_var(s1, s2, variant, seq_list,
weight_position=extracted.weight_position)
description.append(var)
if description[-1].type == 'delins':
for frame_shift in annotation:
if frame_shift.start >= description[-1].start:
description[-1].is_frame_shift = True
else:
index += 1
if not description:
# NOTE(review): the class `FrameShiftAnnotation` itself, not an instance,
# is placed in the list here; confirm whether `FrameShiftAnnotation()`
# was meant.
return (ProteinAllele([ProteinVar()]),
FrameShiftAnnotationList([FrameShiftAnnotation]))
# NOTE(review): several `return` statements follow each other below; they
# seem to belong to different revisions (tuple result versus a bare
# description). Confirm which return shape callers expect.
return description, annotation
return ProteinAllele([ProteinVar()])
return description
......@@ -4,75 +4,38 @@ General utility definitions.
from __future__ import (absolute_import, division, print_function,
unicode_literals)
unicode_literals)
import sys
from Bio.Data import CodonTable
from Bio.Data.IUPACData import (protein_letters_1to3,
protein_letters_1to3_extended)
from Bio.SeqUtils import seq3
PY2 = sys.version_info[0] == 2
# From BioPython.
protein_letters_1to3 = {
'A': 'Ala', 'C': 'Cys', 'D': 'Asp',
'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His',
'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met',
'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg',
'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp',
'Y': 'Tyr',
}
protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({
'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle',
'U': 'Sel', 'O': 'Pyl',
}.items()))
# From BioPython.
def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
    """Turn a one letter code protein sequence into one with three letter codes.

    The single input argument 'seq' should be a protein sequence using single
    letter codes, either as a python string or as a Seq or MutableSeq object.

    This function returns the amino acid sequence as a string using the three
    letter amino acid codes. Output follows the IUPAC standard (including
    ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
    for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
    Any unknown character (including possible gap characters), is changed into
    'Xaa'.

    e.g.

    >>> from Bio.SeqUtils import seq3
    >>> seq3("MAIVMGRWKGAR*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'

    You can set a custom translation of the codon termination code using the
    "custom_map" argument, e.g.

    >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'

    You can also set a custom translation for non-amino acid characters, such
    as '-', using the "undef_code" argument, e.g.

    >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
    'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'

    If not given, "undef_code" defaults to "Xaa", e.g.

    >>> seq3("MAIVMGRWKGA--R*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'

    This function was inspired by BioPerl's seq3.
    """
    # not doing .update() on IUPACData dict with custom_map dict
    # to preserve its initial state (may be imported in other modules)
    threecode = dict(list(protein_letters_1to3_extended.items()) +
                     list(custom_map.items()))
    #We use a default of 'Xaa' for undefined letters
    #Note this will map '-' to 'Xaa' which may be undesirable!
    return ''.join(threecode.get(aa, undef_code) for aa in str(seq))


def codon_table_string(table_id):
    """
    Return the codon table referenced by {table_id} in compressed form. The
    result consists of a string of amino acids sorted by the codon that
    translates to them. For example, the codon 'AAG' has position 3 in the
    sorted list of codons, so its translation 'K' occurs in the third position
    of the output.

    :arg table_id: ID of a codon table.
    :type table_id: int

    :returns: String representation of code table referenced by {table_id}.
    :rtype: str
    """
    table = CodonTable.unambiguous_dna_by_id[table_id]

    # On Python 3 `dict.items()` is a view that cannot be extended in place,
    # so copy the (codon, amino acid) pairs into a list first.
    codons = list(table.forward_table.items())
    # Stop codons are not part of the forward table; represent them by '*'.
    codons.extend((codon, '*') for codon in table.stop_codons)

    # Sorting the pairs orders them by codon; keep only the amino acids.
    return ''.join(amino_acid for _, amino_acid in sorted(codons))
def swig_str(s, ascii_only=True):
......
......@@ -3,8 +3,8 @@ Models for the description extractor.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import (
absolute_import, division, print_function, unicode_literals)
from . import extractor
from extractor.util import python_2_unicode_compatible, seq3, str
......@@ -19,11 +19,11 @@ WEIGHTS = {
'delins': extractor.WEIGHT_DELETION_INSERTION
}
# Maps frame shift labels to the extractor flags they are tested against.
# NOTE(review): both the '+1'/'inv+1' style and the '1'/'1inv' style labels
# are present; confirm which spelling is still needed by callers.
FS = {
    '+1': extractor.FRAME_SHIFT_1,
    '+2': extractor.FRAME_SHIFT_2,
    '1': extractor.FRAME_SHIFT_1,
    '2': extractor.FRAME_SHIFT_2,
    'inv': extractor.FRAME_SHIFT_REVERSE,
    'inv+1': extractor.FRAME_SHIFT_REVERSE_1,
    # The '+2' reverse labels previously repeated FRAME_SHIFT_REVERSE_1, which
    # made them indistinguishable from the '+1' reverse labels.
    'inv+2': extractor.FRAME_SHIFT_REVERSE_2,
    '1inv': extractor.FRAME_SHIFT_REVERSE_1,
    '2inv': extractor.FRAME_SHIFT_REVERSE_2
}
......@@ -82,15 +82,18 @@ class ISeqList(HGVSList):
pass
class FrameShiftAnnotationList(HGVSList):
pass
class AISeqList(ISeqList):
    """
    List of annotated inserted sequences.
    """
    def get_sequence(self):
        """
        Concatenate the `sequence` attribute of every item, in order.

        :returns unicode: The joined sequence.
        """
        return ''.join(item.sequence for item in self.items)
@python_2_unicode_compatible
class ISeq(object):
"""
Container for an inserted sequence.
"""
def __init__(self, sequence='', start=0, end=0, reverse=False,
def __init__(
self, sequence='', start=0, end=0, reverse=False,
weight_position=1):
"""
Initialise the class with the appropriate values.
......@@ -122,10 +125,12 @@ class ISeq(object):
return '{0}_{1}{2}'.format(self.start, self.end, inverted)
# TODO: Is this still used?
def __bool__(self):
return bool(self.sequence)
# TODO: Is this still used?
def __nonzero__(self): # Python 2.x compatibility.
return self.__bool__()
......@@ -135,18 +140,61 @@ class ISeq(object):
return len(self.sequence) * extractor.WEIGHT_BASE
inverse_weight = WEIGHTS['inv'] if self.reverse else 0
return (self.weight_position * 2 + extractor.WEIGHT_SEPARATOR +
return (
self.weight_position * 2 + extractor.WEIGHT_SEPARATOR +
inverse_weight)
@python_2_unicode_compatible
class AISeq(object):
    """
    Container for an annotated inserted sequence.
    """
    def __init__(
            self, sequence='', start=0, end=0, sample_start=0, sample_end=0,
            frames=None, weight_position=1):
        """
        Initialise the class with the appropriate values.

        :arg unicode sequence: Literal inserted sequence.
        :arg int start: Start position for a transposed sequence.
        :arg int end: End position for a transposed sequence.
        :arg int sample_start: Start position in the sample.
        :arg int sample_end: End position in the sample.
        :arg list frames: Frame shift labels; an empty list when omitted.
        :arg int weight_position: Stored on the instance unchanged.
        """
        self.sequence = sequence
        self.start = start
        self.end = end
        self.sample_start = sample_start
        self.sample_end = sample_end
        self.weight_position = weight_position
        # A fresh list per instance: a literal `[]` default would be shared
        # by every instance created without {frames}.
        self.frames = [] if frames is None else frames

        # Later checks take precedence: any frames make this a frame shift,
        # otherwise a literal sequence makes it an insertion.
        self.type = 'trans'
        if self.sequence:
            self.type = 'ins'
        if self.frames:
            self.type = 'fs'

    def __str__(self):
        """
        Give the textual form that belongs to the type of this sequence.

        :returns unicode: The literal sequence for 'ins', `start_end` for
            'trans', and `start_end` followed by the sequence and the
            '|'-separated frames for 'fs'.
        """
        if self.type == 'ins':
            return self.sequence

        if self.type == 'trans':
            return '{}_{}'.format(self.start, self.end)

        return '{}_{}{}|{}'.format(
            self.start, self.end, self.sequence, '|'.join(self.frames))
@python_2_unicode_compatible
class DNAVar(object):
"""
Container for a DNA variant.
"""
def __init__(self, start=0, start_offset=0, end=0, end_offset=0,
sample_start=0, sample_start_offset=0, sample_end=0,
sample_end_offset=0, type='none', deleted=ISeqList([ISeq()]),
def __init__(
self, start=0, start_offset=0, end=0, end_offset=0, sample_start=0,
sample_start_offset=0, sample_end=0, sample_end_offset=0,
type='none', deleted=ISeqList([ISeq()]),
inserted=ISeqList([ISeq()]), shift=0, weight_position=1):
"""
Initialise the class with the appropriate values.
......@@ -227,9 +275,10 @@ class ProteinVar(object):
Container for a protein variant.
"""
def __init__(self, s1='', s2='', start=0, end=0, sample_start=0,
sample_end=0, type='none', deleted=ISeqList([ISeq()]),
inserted=ISeqList([ISeq()]), shift=0, term=0, weight_position=1):
def __init__(
self, s1='', s2='', start=0, end=0, sample_start=0, sample_end=0,
type='none', deleted=ISeqList([ISeq()]),
inserted=AISeqList([AISeq()]), shift=0, term=0, weight_position=1):
"""
Initialise the class with the appropriate values.
......@@ -255,10 +304,10 @@ class ProteinVar(object):
self.sample_end_aa = s2[sample_end - 1]
self.type = type
self.deleted = deleted
self.inserted = inserted
self.inserted = ISeqList([ISeq(inserted.get_sequence())])
self.annotated_inserted = inserted
self.shift = shift
self.term = term
self.is_frame_shift = False
def __str__(self):
......@@ -268,15 +317,16 @@ class ProteinVar(object):
:returns unicode: The HGVS description of the raw variant stored in
this class.
"""
# TODO: ext*
if self.type == 'unknown':
return '?'
if self.type == 'none':
return '='
description = '{}{}'.format(seq3(self.start_aa), self.start)
if self.is_frame_shift:
if self.term:
return description + '{}fs*{}'.format(
seq3(self.inserted[0].sequence[0]), self.end - self.start + 2)
seq3(self.inserted[0].sequence[0]), self.term)
if self.start != self.end:
description += '_{}{}'.format(seq3(self.end_aa), self.end)
......@@ -284,7 +334,7 @@ class ProteinVar(object):
description += self.type
if self.type in ('ins', 'delins'):
return description + seq3(self.inserted)
return description + seq3(str(self.inserted)) # FIXME: str
return description
return description + seq3(self.inserted)
......@@ -305,9 +355,10 @@ class ProteinVar(object):
description += self.type
if self.type in ('ins', 'delins'):
return description + str(self.inserted)
return description + str(self.annotated_inserted)
return description
return description + '{}>{}'.format(self.deleted, self.inserted)
return description + '{}>{}'.format(
self.deleted, self.annotated_inserted)
@python_2_unicode_compatible
......@@ -315,11 +366,12 @@ class FrameShiftAnnotation(object):
"""
Container for frame shift annotation.
"""
def __init__(self, start=0, end=0, sample_start=0, sample_end=0,
def __init__(self, s2='', start=0, end=0, sample_start=0, sample_end=0,
type='none'):
"""
Initialise the class with the appropriate values.
:arg unicode s2: Sample sequence.
:arg int start: Start position.
:arg int end: End position.
:arg int sample_start: Start position.
......@@ -330,6 +382,7 @@ class FrameShiftAnnotation(object):
self.end = end
self.sample_start = sample_start
self.sample_end = sample_end
self.seq = s2[sample_start - 1:sample_end]
for fs_type in FS:
if FS[fs_type] & type:
self.type = fs_type
......@@ -338,4 +391,4 @@ class FrameShiftAnnotation(object):
def __str__(self):
"""
"""
return '{}_{}fs{}'.format(self.start, self.end, self.type)
return '{}_{}{}|{}'.format(self.start, self.end, self.seq, self.type)
......@@ -74,8 +74,10 @@ setup(
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: C++',
'Topic :: Scientific/Engineering',
],
keywords='bioinformatics'
keywords='bioinformatics',
install_requires=['biopython==1.65']
)
#!/usr/bin/env python
from __future__ import unicode_literals
import monoseq
from extractor import describe
#ref = 'MAVLWRLSAVCGALGGRALLLRTPVVRPAHISAFLQDRPIPEWCGVQHIHLSPSHHSGSKAASLHWTSERVVSVLLLGLLPAAYLNPCSAMDYSLAAALTLHGHWGLGQVVTDYVHGDALQKAAKAGLLALSALTFAGLCYFNYHDVGICKAVAMLWKL*'
#alt = 'MAVLWRLSAVCGAPTARDRRPSSVASNSSGQTCSYLSISSGPTYPRMVWSAAHTLVTEPPFWLQGCISPLD*'
#ref = 'MDYSLAAALTLHGH'
#alt = 'MTIPWRSPHFHGH'
alt = 'TCCTGGCATCAGTTACTGTGTTGACTCACTCAGTGTTGGGATCACTCACTTTCCCCCTACAGGACTCAGATCTGGGAGGCAATTACCTTCGGAGAAAAACGAATAGGAAAAACTGAAGTGTTACTTTTTTTAAAGCTGCTGAAGTTTGTTGGTTTCTCATTGTTTTTAAGCCTACTGGAGCAATAAAGTTTGAAGAACTTTTACCAGGTTTTTTTTATCGCTGCCTTGATATACACTTTTCAAAATGCTTTGGTGGGAAGAAGTAGAGGACTGTTATGAAAGAGAAGATGTTCAAAAGAAAACATTCACAAAATGGGTAAATGCACAATTTTCTAAGTTTGGGAAGCAGCATATTGAGAACCTCTTCAGTGACCTACAGGATGGGAGGCGCCTCCTAGACCTCCTCGAAGGCCTGACAGGGCAAAAACTGCCAAAAGAAAAAGGATCCACAAGAGTTCATGCCCTGAACAATGTCAACAAGGCACTGCGGGTTTTGCAGAACAATAATGTTGATTTAGTGAATATTGGAAGTACTGACATCGTAGATGGAAATCATAAACTGACTCTTGGTTTGATTTGGAATATAATCCTCCACTGGCAGGTCAAAAATGTAATGAAAAATATCATGGCTGGATTGCAACAAACCAACAGTGAAAAGATTCTCCTGAGCTGGGTCCGACAATCAACTCGTAATTATCCACAGGTTAATGTAATCAACTTCACCACCAGCTGGTCTGATGGCCTGGCTTTGAATGCTCTCATCCATAGTCATAGGCCAGACCTATTTGACTGGAATAGTGTGGTTTGCCAGCAGTCAGCCACACAACGACTGGAACATGCATTCAACATCGCCAGATATCAATTAGGCATAGAGAAACTACTCGATCCTGAAGATGTTGATACCACCTATCCAGATAAGAAGTCCATCTTAATGTACATCACATCACTCTTCCAAGTTTTGCCTCAACAAGTGAGCATTGAAGCCATCCAGGAAGTGGAAATGTTGCCAAGGCCACCTAAAGTGACTAAAGAAGAACATTTTCAGTTACATCATCAAATGCACTATTCTCAACAGATCACGGTCAGTCTAGCACAGGGATATGAGAGAACTTCTTCCCCTAAGCCTCGATTCAAGAGCTATGCCTACACACAGGCTGCTTATGTCACCACCTCTGACCCTACACGGAGCCCATTTCCTTCACAGCATTTGGAAGCTCCTGAAGACAAGTCATTTGGCAGTTCATTGATGGAGAGTGAAGTAAACCTGGACCGTTATCAAACAGCTTTAGAAGAAGTATTATCGTGGCTTCTTTCTGCTGAGGACACATTGCAAGCACAAGGAGAGATTTCTAATGATGTGGAAGTGGTGAAAGACCAGTTTCATACTCATGAGGGGTACATGATGGATTTGACAGCCCATCAGGGCCGGGTTGGTAATATTCTACAATTGGGAAGTAAGCTGATTGGAACAGGAAAATTATCAGAAGATGAAGAAACTGAAGTACAAGAGCAGATGAATCTCCTAAATTCAAGATGGGAATGCCTCAGGGTAGCTAGCATGGAAAAACAAAGCAATTTACATAGAGTTTTAATGGATCTCCAGAATCAGAAACTGAAAGAGTTGAATGACTGGCTAACAAAAACAGAAGAAAGAACAAGGAAAATGGAGGAAGAGCCTCTTGGACCTGATCTTGAAGACCTAAAACGCCAAGTACAACAACATAAGGTGCTTCAAGAAGATCTAGAACAAGAACAAGTCAGGGTCAATTCTCTCACTCACATGGTGGTGGTAGTTGATGAATCTAGTGGAGATCACGCAACTGCTGCTTTGGAAGAACAACTTAAGGTATTGGGAGATCGATGGGCAAACATCTGTAGATGGACAGAAGACCGCTGGGTTCTTTTACAAGACATCCTTCTCAAATGGCAACGTCTTACTGAAGAACAGTGCCTTTTTAGTGCATGGCTTTCAGAAAAAGAAGATGCAGTGAAC
AAGATTCACACAACTGGCTTTAAAGATCAAAATGAAATGTTATCAAGTCTTCAAAAACTGGCCGTTTTAAAAGCGGATCTAGAAAAGAAAAAGCAATCCATGGGCAAACTGTATTCACTCAAACAAGATCTTCTTTCAACACTGAAGAATAAGTCAGTGACCCAGAAGACGGAAGCATGGCTGGATAACTTTGCCCGGTGTTGGGATAATTTAGTCCAAAAACTTGAAAAGAGTACAGCACAGATTTCACAGGCTGTCACCACCACTCAGCCATCACTAACACAGACAACTGTAATGGAAACAGTAACTACGGTGACCACAAGGGAACAGATCCTGGTAAAGCATGCTCAAGAGGAACTTCCACCACCACCTCCCCAAAAGAAGAGGCAGATTACTGTGGATTCTGAAATTAGGAAAAGGTTGGATGTTGATATAACTGAACTTCACAGCTGGATTACTCGCTCAGAAGCTGTGTTGCAGAGTCCTGAATTTGCAATCTTTCGGAAGGAAGGCAACTTCTCAGACTTAAAAGAAAAAGTCAATGCCATAGAGCGAGAAAAAGCTGAGAAGTTCAGAAAACTGCAAGATGCCAGCAGATCAGCTCAGGCCCTGGTGGAACAGATGGTGAATGAGGGTGTTAATGCAGATAGCATCAAACAAGCCTCAGAACAACTGAACAGCCGGTGGATCGAATTCTGCCAGTTGCTAAGTGAGAGACTTAACTGGCTGGAGTATCAGAACAACATCATCGCTTTCTATAATCAGCTACAACAATTGGAGCAGATGACAACTACTGCTGAAAACTGGTTGAAAATCCAACCCACCACCCCATCAGAGCCAACAGCAATTAAAAGTCAGTTAAAAATTTGTAAGGATGAAGTCAACCGGCTATCAGGTCTTCAACCTCAAATTGAACGATTAAAAATTCAAAGCATAGCCCTGAAAGAGAAAGGACAAGGACCCATGTTCCTGGATGCAGACTTTGTGGCCTTTACAAATCATTTTAAGCAAGTCTTTTCTGATGTGCAGGCCAGAGAGAAAGAGCTACAGACAATTTTTGACACTTTGCCACCAATGCGCTATCAGGAGACCATGAGTGCCATCAGGACATGGGTCCAGCAGTCAGAAACCAAACTCTCCATACCTCAACTTAGTGTCACCGACTATGAAATCATGGAGCAGAGACTCGGGGAATTGCAGGCTTTACAAAGTTCTCTGCAAGAGCAACAAAGTGGCCTATACTATCTCAGCACCACTGTGAAAGAGATGTCGAAGAAAGCGCCCTCTGAAATTAGCCGGAAATATCAATCAGAATTTGAAGAAATTGAGGGACGCTGGAAGAAGCTCTCCTCCCAGCTGGTTGAGCATTGTCAAAAGCTAGAGGAGCAAATGAATAAACTCCGAAAAATTCAGAATCACATACAAACCCTGAAGAAATGGATGGCTGAAGTTGATGTTTTTCTGAAGGAGGAATGGCCTGCCCTTGGGGATTCAGAAATTCTAAAAAAGCAGCTGAAACAGTGCAGACTTTTAGTCAGTGATATTCAGACAATTCAGCCCAGTCTAAACAGTGTCAATGAAGGTGGGCAGAAGATAAAGAATGAAGCAGAGCCAGAGTTTGCTTCGAGACTTGAGACAGAACTCAAAGAACTTAACACTCAGTGGGATCACATGTGCCAACAGGTCTATGCCAGAAAGGAGGCCTTGAAGGGAGGTTTGGAGAAAACTGTAAGCCTCCAGAAAGATCTATCAGAGATGCACGAATGGATGACACAAGCTGAAGAAGAGTATCTTGAGAGAGATTTTGAATATAAAACTCCAGATGAATTACAGAAAGCAGTTGAAGAGATGAAGAGAGCTAAAGAAGAGGCCCAACAAAAAGAAGCGAAAGTGAAACTCCTTACTGAGTCTGTAAATAGTGTCATAGCTCAAGCTCCACCTGTAGCACAAGAGGCCTTAAAAAAGGAACTTGAAACTCTAACCACCAACTACCAGTGGCT
CTGCACTAGGCTGAATGGGAAATGCAAGACTTTGGAAGAAGTTTGGGCATGTTGGCATGAGTTATTGTCATACTTGGAGAAAGCAAACAAGTGGCTAAATGAAGTAGAATTTAAACTTAAAACCACTGAAAACATTCCTGGCGGAGCTGAGGAAATCTCTGAGGTGCTAGATTCACTTGAAAATTTGATGCGACATTCAGAGGATAACCCAAATCAGATTCGCATATTGGCACAGACCCTAACAGATGGCGGAGTCATGGATGAGCTAATCAATGAGGAACTTGAGACATTTAATTCTCGTTGGAGGGAACTACATGAAGAGGCTGTAAGGAGGCAAAAGTTGCTTGAACAGAGCATCCAGTCTGCCCAGGAGACTGAAAAATCCTTACACTTAATCCAGGAGTCCCTCACATTCATTGACAAGCAGTTGGCAGCTTATATTGCAGACAAGGTGGACGCAGCTCAAATGCCTCAGGAAGCCCAGAAAATCCAATCTGATTTGACAAGTCATGAGATCAGTTTAGAAGAAATGAAGAAACATAATCAGGGGAAGGAGGCTGCCCAAAGAGTCCTGTCTCAGATTGATGTTGCACAGAAAAAATTACAAGATGTCTCCATGAAGTTTCGATTATTCCAGAAACCAGCCAATTTTGAGCAGCGTCTACAAGAAAGTAAGATGATTTTAGATGAAGTGAAGATGCACTTGCCTGCATTGGAAACAAAGAGTGTGGAACAGGAAGTAGTACAGTCACAGCTAAATCATTGTGTGAACTTGTATAAAAGTCTGAGTGAAGTGAAGTCTGAAGTGGAAATGGTGATAAAGACTGGACGTCAGATTGTACAGAAAAAGCAGACGGAAAATCCCAAAGAACTTGATGAAAGAGTAACAGCTTTGAAATTGCATTATAATGAGCTGGGAGCAAAGGTAACAGAAAGAAAGCAACAGTTGGAGAAATGCTTGAAATTGTCCCGTAAGATGCGAAAGGAAATGAATGTCTTGACAGAATGGCTGGCAGCTACAGATATGGAATTGACAAAGAGATCAGCAGTTGAAGGAATGCCTAGTAATTTGGATTCTGAAGTTGCCTGGGGAAAGGCTACTCAAAAAGAGATTGAGAAACAGAAGGTGCACCTGAAGAGTATCACAGAGGTAGGAGAGGCCTTGAAAACAGTTTTGGGCAAGAAGGAGACGTTGGTGGAAGATAAACTCAGTCTTCTGAATAGTAACTGGATAGCTGTCACCTCCCGAGCAGAAGAGTGGTTAAATCTTTTGTTGGAATACCAGAAACACATGGAAACTTTTGACCAGAATGTGGACCACATCACAAAGTGGATCATTCAGGCTGACACACTTTTGGATGAATCAGAGAAAAAGAAACCCCAGCAAAAAGAAGACGTGCTTAAGCGTTTAAAGGCAGAACTGAATGACATACGCCCAAAGGTGGACTCTACACGTGACCAAGCAGCAAACTTGATGGCAAACCGCGGTGACCACTGCAGGAAATTAGTAGAGCCCCAAATCTCAGAGCTCAACCATCGATTTGCAGCCATTTCACACAGAATTAAGACTGGAAAGGCCTCCATTCCTTTGAAGGAATTGGAGCAGTTTAACTCAGATATACAAAAATTGCTTGAACCACTGGAGGCTGAAATTCAGCAGGGGGTGAATCTGAAAGAGGAAGACTTCAATAAAGATATGAATGAAGACAATGAGGGTACTGTAAAAGAATTGTTGCAAAGAGGAGACAACTTACAACAAAGAATCACAGATGAGAGAAAGCGAGAGGAAATAAAGATAAAACAGCAGCTGTTACAGACAAAACATAATGCTCTCAAGGATTTGAGGTCTCAAAGAAGAAAAAAGGCTCTAGAAATTTCTCATCAGTGGTATCAGTACAAGAGGCAGGCTGATGATCTCCTGAAATGCTTGGATGACATTGAAAAAAAATTAGCCAGCCTACCTGAGCCCAGAGATGAAAGGAAAATAAAGGAAATTGATC
GGGAATTGCAGAAGAAGAAAGAGGAGCTGAATGCAGTGCGTAGGCAAGCTGAGGGCTTGTCTGAGGATGGGGCCGCAATGGCAGTGGAGCCAACTCAGATCCAGCTCAGCAAGCGCTGGCGGGAAATTGAGAGCAAATTTGCTCAGTTTCGAAGACTCAACTTTGCACAAATTCACACTGTCCGTGAAGAAACGATGATGGTGATGACTGAAGACATGCCTTTGGAAATTTCTTATGTGCCTTCTACTTATTTGACTGAAATCACTCATGTCTCACAAGCCCTATTAGAAGTGGAACAACTTCTCAATGCTCCTGACCTCTGTGCTAAGGACTTTGAAGATCTCTTTAAGCAAGAGGAGTCTCTGAAGAATATAAAAGATAGTCTACAACAAAGCTCAGGTCGGATTGACATTATTCATAGCAAGAAGACAGCAGCATTGCAAAGTGCAACGCCTGTGGAAAGGGTGAAGCTACAGGAAGCTCTCTCCCAGCTTGATTTCCAATGGGAAAAAGTTAACAAAATGTACAAGGACCGACAAGGGCGATTTGACAGATCTGTTGAGAAATGGCGGCGTTTTCATTATGATATAAAGATATTTAATCAGTGGCTAACAGAAGCTGAACAGTTTCTCAGAAAGACACAAATTCCTGAGAATTGGGAACATGCTAAATACAAATGGTATCTTAAGGAACTCCAGGATGGCATTGGGCAGCGGCAAACTGTTGTCAGAACATTGAATGCAACTGGGGAAGAAATAATTCAGCAATCCTCAAAAACAGATGCCAGTATTCTACAGGAAAAATTGGGAAGCCTGAATCTGCGGTGGCAGGAGGTCTGCAAACAGCTGTCAGACAGAAAAAAGAGGCTAGAAGAACAAAAGAATATCTTGTCAGAATTTCAAAGAGATTTAAATGAATTTGTTTTATGGTTGGAGGAAGCAGATAACATTGCTAGTATCCCACTTGAACCTGGAAAAGAGCAGCAACTAAAAGAAAAGCTTGAGCAAGTCAAGTTACTGGTGGAAGAGTTGCCCCTGCGCCAGGGAATTCTCAAACAATTAAATGAAACTGGAGGACCCGTGCTTGTAAGTGCTCCCATAAGCCCAGAAGAGCAAGATAAACTTGAAAATAAGCTCAAGCAGACAAATCTCCAGTGGATAAAGGTTTCCAGAGCTTTACCTGAGAAACAAGGAGAAATTGAAGCTCAAATAAAAGACCTTGGGCAGCTTGAAAAAAAGCTTGAAGACCTTGAAGAGCAGTTAAATCATCTGCTGCTGTGGTTATCTCCTATTAGGAATCAGTTGGAAATTTATAACCAACCAAACCAAGAAGGACCATTTGACGTTCAGGAAACTGAAATAGCAGTTCAAGCTAAACAACCGGATGTGGAAGAGATTTTGTCTAAAGGGCAGCATTTGTACAAGGAAAAACCAGCCACTCAGCCAGTGAAGAGGAAGTTAGAAGATCTGAGCTCTGAGTGGAAGGCGGTAAACCGTTTACTTCAAGAGCTGAGGGCAAAGCAGCCTGACCTAGCTCCTGGACTGACCACTATTGGAGCCTCTCCTACTCAGACTGTTACTCTGGTGACACAACCTGTGGTTACTAAGGAAACTGCCATCTCCAAACTAGAAATGCCATCTTCCTTGATGTTGGAGGTACCTGCTCTGGCAGATTTCAACCGGGCTTGGACAGAACTTACCGACTGGCTTTCTCTGCTTGATCAAGTTATAAAATCACAGAGGGTGATGGTGGGTGACCTTGAGGATATCAACGAGATGATCATCAAGCAGAAGGCAACAATGCAGGATTTGGAACAGAGGCGTCCCCAGTTGGAAGAACTCATTACCGCTGCCCAAAATTTGAAAAACAAGACCAGCAATCAAGAGGCTAGAACAATCATTACGGATCGAATTGAAAGAATTCAGAATCAGTGGGATGAAGTACAAGAACACCTTCAGAACCGGAGGCAACAGTTGAATGAAATGTTAAAGGATTCAACA
CAATGGCTGGAAGCTAAGGAAGAAGCTGAGCAGGTCTTAGGACAGGCCAGAGCCAAGCTTGAGTCATGGAAGGAGGGTCCCTATACAGTAGATGCAATCCAAAAGAAAATCACAGAAACCAAGCAGTTGGCCAAAGACCTCCGCCAGTGGCAGACAAATGTAGATGTGGCAAATGACTTGGCCCTGAAACTTCTCCGGGATTATTCTGCAGATGATACCAGAAAAGTCCACATGATAACAGAGAATATCAATGCCTCTTGGAGAAGCATTCATAA