Commit 20b45864 authored by Jeroen F.J. Laros's avatar Jeroen F.J. Laros
Browse files

More refactoring.

parent db29b0ef
......@@ -2,3 +2,4 @@
*.egg-info
build
dist
.ipynb_checkpoints
......@@ -2,29 +2,31 @@
This library provides functions for back translation from amino acids to
nucleotides.
from backtranslate import backtranslate
from backtranslate.backtranslate import BackTranslate
# First create the reverse translation table for the organism of interest.
back_table = backtranslate.reverse_translation_table()
# Create a class instance, optionally giving the translation table id.
bt = BackTranslate()
# Find all substitutions that transform the codon 'TTG' into a stop codon.
substitutions = backtranslate.one_subst(back_table, 'TGG', '*')
substitutions = bt.with_dna('TGG', '*')
Sometimes we do not have access to the DNA sequence so we have to find
possible substitutions from the amino acids directly.
# Find all substitutions that transform a Tryptophan into a stop codon.
substitutions = backtranslate.one_subst_without_dna(back_table, 'W', '*')
substitutions = bt.without_dna('W', '*')
To find out which substitution predictions can be improved by adding codon
information, use the following function.
backtranslate.improvable_substitutions(back_table)
bt.improvable()
To get substitutions in a readable format, we can use the following:
from backtranslate.util import subst_to_hgvs
# Transform the substitutions to HGVS.
variants = backtranslate.subst_to_hgvs(substitutions, 12)
variants = subst_to_hgvs(substitutions, 12)
# Print the variants in human readable format.
print map(str, variants)
......@@ -32,7 +34,7 @@ To get substitutions in a readable format, we can use the following:
Use the command `back_translate` to find substitutions that explain an amino
acid change:
$ back_translate with_dna NM_003002.2 69 Asp
$ back_translate with_dna NM_003002.3 69 Asp
['NM_003002.2:c.207G>T', 'NM_003002.2:c.207G>C']
When using a genomic reference sequence, make sure to mention the gene name and
......
......@@ -9,7 +9,7 @@ Licensed under the MIT license, see the LICENSE file.
"""
__version_info__ = ('0', '0', '2')
__version_info__ = ('0', '0', '3')
__version__ = '.'.join(__version_info__)
......
from collections import defaultdict
from Bio.Data import CodonTable, IUPACData
from extractor.variant import Allele, DNAVar, ISeq, ISeqList
from Bio.Data import CodonTable
from Levenshtein import hamming
def _three_to_one():
def cmp_subst(subst_1, subst_2):
"""
The three letter to one letter table for amino acids including stop.
:returns dict: Three letter to one letter amino acids table.
"""
protein_letters_3to1 = dict(IUPACData.protein_letters_3to1_extended)
protein_letters_3to1.update({'Ter': '*'})
return protein_letters_3to1
Compare two substitution sets.
:arg dict subst_1: Substitution set.
:arg dict subst_2: Substitution set.
def _compare_dict(d1, d2):
:returns bool: True if `subst_1` equals `subst2`, False otherwise.
"""
"""
if len(d1) != len(d2):
if len(subst_1) != len(subst_2):
return False
for item in d1:
if d1[item] != d2[item]:
for item in subst_1:
if subst_1[item] != subst_2[item]:
return False
return True
def _one_subst(substitutions, back_table, reference_codon, amino_acid):
"""
Find single nucleotide substitutions that given a reference codon explains
an observed amino acid.
:arg dictsubstitutions: Set of single nucleotide substitutions indexed by
position.
:arg dict back_table: Reverse translation table.
:arg str reference_codon: Original codon.
:arg str amino_acid: Observed amino acid.
"""
for codon in back_table[amino_acid]:
if hamming(codon, reference_codon) == 1:
for position in range(3):
if codon[position] != reference_codon[position]:
substitutions[position].add(
(reference_codon[position], codon[position]))
def reverse_translation_table(table_id=1):
"""
Calculate a reverse translation table.
......@@ -69,94 +42,95 @@ def reverse_translation_table(table_id=1):
return back_table
def one_subst(back_table, reference_codon, amino_acid):
class BackTranslate(object):
"""
Find single nucleotide substitutions that given a reference codon explains
an observed amino acid.
:arg dict back_table: Reverse translation table.
:arg str reference_codon: Original codon.
:arg str amino_acid: Observed amino acid.
:returns dict: Set of single nucleotide substitutions indexed by position.
Back translation.
"""
substitutions = defaultdict(set)
def __init__(self, table_id=1):
"""
Initialise the class.
_one_subst(substitutions, back_table, reference_codon, amino_acid)
:arg int table_id: Translation table id.
"""
self._back_table = reverse_translation_table(table_id)
return substitutions
def _one_subst(self, substitutions, reference_codon, amino_acid):
"""
Find single nucleotide substitutions that given a reference codon
explains an observed amino acid.
def one_subst_without_dna(back_table, reference_amino_acid, amino_acid):
"""
Find single nucleotide substitutions that given a reference amino acid
explains an observed amino acid.
:arg dictsubstitutions: Set of single nucleotide substitutions indexed
by position.
:arg str reference_codon: Original codon.
:arg str amino_acid: Observed amino acid.
"""
for codon in self._back_table[amino_acid]:
if hamming(codon, reference_codon) == 1:
for position in range(3):
if codon[position] != reference_codon[position]:
substitutions[position].add(
(reference_codon[position], codon[position]))
:arg dict back_table: Reverse translation table.
:arg str reference_amino_acid: Original amino acid.
:arg str amino_acid: Observed amino acid.
:returns dict: Set of single nucleotide substitutions indexed by position.
"""
substitutions = defaultdict(set)
def with_dna(self, reference_codon, amino_acid):
"""
Find single nucleotide substitutions that given a reference codon
explains an observed amino acid.
for reference_codon in back_table[reference_amino_acid]:
_one_subst(substitutions, back_table, reference_codon, amino_acid)
:arg str reference_codon: Original codon.
:arg str amino_acid: Observed amino acid.
return substitutions
:returns dict: Set of single nucleotide substitutions indexed by
position.
"""
substitutions = defaultdict(set)
self._one_subst(substitutions, reference_codon, amino_acid)
def subst_to_hgvs(substitutions, offset=0):
"""
Translate a set of substitutions to HGVS.
return substitutions
:arg dict substitutions: Set of single nucleotide substitutions indexed by
position.
:arg int offset: Codon position in the CDS.
:returns set: Substitutions in HGVS format.
"""
variants = set()
for position in substitutions:
for substitution in substitutions[position]:
variants.add(Allele([DNAVar(
start=position + offset + 1,
end=position + offset + 1,
sample_start=position + offset + 1,
sample_end=position + offset + 1,
type='subst',
deleted=ISeqList([ISeq(sequence=substitution[0])]),
inserted=ISeqList([ISeq(sequence=substitution[1])]))]))
def without_dna(self, reference_amino_acid, amino_acid):
"""
Find single nucleotide substitutions that given a reference amino acid
explains an observed amino acid.
return variants
:arg str reference_amino_acid: Original amino acid.
:arg str amino_acid: Observed amino acid.
:returns dict: Set of single nucleotide substitutions indexed by
position.
"""
substitutions = defaultdict(set)
def improvable_substitutions(back_table):
"""
Calculate all pairs of amino acid substututions that can be improved by
looking at the underlying codon.
for reference_codon in self._back_table[reference_amino_acid]:
self._one_subst(substitutions, reference_codon, amino_acid)
:arg dict back_table: Reverse translation table.
return substitutions
:returns list: List of improvable substitutions.
"""
substitutions = set()
for reference_amino_acid in back_table:
for sample_amino_acid in back_table:
substitutions_without_dna = one_subst_without_dna(
back_table, reference_amino_acid, sample_amino_acid)
for codon in back_table[reference_amino_acid]:
substitutions_with_dna = one_subst(
back_table, codon, sample_amino_acid)
if (substitutions_with_dna and not _compare_dict(
substitutions_without_dna, substitutions_with_dna) and
reference_amino_acid != sample_amino_acid):
substitutions.add(
(reference_amino_acid, sample_amino_acid))
def improvable(self):
"""
Calculate all pairs of amino acid substututions that can be improved by
looking at the underlying codon.
return substitutions
:returns list: List of improvable substitutions.
"""
substitutions = set()
for reference_amino_acid in self._back_table:
for sample_amino_acid in self._back_table:
substitutions_without_dna = self.without_dna(
reference_amino_acid, sample_amino_acid)
for codon in self._back_table[reference_amino_acid]:
substitutions_with_dna = self.with_dna(
codon, sample_amino_acid)
if (substitutions_with_dna and not
cmp_subst(substitutions_without_dna,
substitutions_with_dna) and
reference_amino_acid != sample_amino_acid):
substitutions.add(
(reference_amino_acid, sample_amino_acid))
protein_letters_3to1 = _three_to_one()
return substitutions
......@@ -5,9 +5,8 @@ import argparse
from suds.client import Client
from . import usage, version, doc_split
from .backtranslate import (
reverse_translation_table, protein_letters_3to1, one_subst,
one_subst_without_dna, subst_to_hgvs, improvable_substitutions)
from .backtranslate import BackTranslate
from .util import protein_letters_3to1, subst_to_hgvs
URL = 'https://mutalyzer.nl/services/?wsdl'
......@@ -30,9 +29,8 @@ def with_dna(reference, position, amino_acid):
cds = str(service.runMutalyzer('{}:c.1del'.format(reference)).origCDS)
codon = cds[offset:offset + 3]
back_table = reverse_translation_table()
substitutions = one_subst(
back_table, codon, protein_letters_3to1[amino_acid])
bt = BackTranslate()
substitutions = bt.with_dna(codon, protein_letters_3to1[amino_acid])
return subst_to_hgvs(substitutions, offset)
......@@ -49,11 +47,11 @@ def without_dna(reference, position, reference_amino_acid, amino_acid):
:returns set: Variants that lead to the observed amino acid change.
"""
back_table = reverse_translation_table()
improvable = improvable_substitutions(back_table)
bt = BackTranslate()
improvable = bt.improvable()
substitutions = one_subst_without_dna(
back_table, protein_letters_3to1[reference_amino_acid],
substitutions = bt.without_dna(
protein_letters_3to1[reference_amino_acid],
protein_letters_3to1[amino_acid])
if (protein_letters_3to1[reference_amino_acid],
......@@ -70,7 +68,7 @@ def main():
input_parser = argparse.ArgumentParser(add_help=False)
input_parser.add_argument(
'reference', type=str,
help='accession number, e.g., AB026906.1(SDHD_v001)')
help='accession number, e.g., NM_003002.3')
input_parser.add_argument('position', type=int, help='position, e.g., 92')
reference_aa_parser = argparse.ArgumentParser(add_help=False)
......
......@@ -11,17 +11,17 @@ import argparse
from Bio import SeqIO
from .backtranslate import one_subst, reverse_translation_table
from .backtranslate import BackTranslate
def find_positions(sequence, offset):
"""
"""
back_table = reverse_translation_table(1)
bt = BackTranslate()
result = []
for i in range(offset, len(sequence) - ((len(sequence) - offset) % 3), 3):
stop_positions = one_subst(back_table, sequence[i:i + 3], '*')
stop_positions = bt.with_dna(sequence[i:i + 3], '*')
for position in stop_positions:
result.append((i + position + 1, stop_positions[position]))
......
from Bio.Data import IUPACData
from extractor.variant import Allele, DNAVar, ISeq, ISeqList
def _three_to_one():
"""
The three letter to one letter table for amino acids including stop.
:returns dict: Three letter to one letter amino acids table.
"""
protein_letters_3to1 = dict(IUPACData.protein_letters_3to1_extended)
protein_letters_3to1.update({'Ter': '*'})
return protein_letters_3to1
def subst_to_hgvs(substitutions, offset=0):
"""
Translate a set of substitutions to HGVS.
:arg dict substitutions: Set of single nucleotide substitutions indexed by
position.
:arg int offset: Codon position in the CDS.
:returns set: Substitutions in HGVS format.
"""
variants = set()
for position in substitutions:
for substitution in substitutions[position]:
variants.add(Allele([DNAVar(
start=position + offset + 1,
end=position + offset + 1,
sample_start=position + offset + 1,
sample_end=position + offset + 1,
type='subst',
deleted=ISeqList([ISeq(sequence=substitution[0])]),
inserted=ISeqList([ISeq(sequence=substitution[1])]))]))
return variants
protein_letters_3to1 = _three_to_one()
This diff is collapsed.
......@@ -3,8 +3,7 @@ Tests for the backtranslate.backtranslate module.
"""
from backtranslate.backtranslate import (
reverse_translation_table, one_subst, one_subst_without_dna, _compare_dict)
from backtranslate.backtranslate import BackTranslate, cmp_subst
class TestParser(object):
......@@ -12,35 +11,35 @@ class TestParser(object):
Test the backtranslate.backtranslate module.
"""
def setup(self):
self.back_table = reverse_translation_table()
self.bt = BackTranslate()
def test_with_dna_1(self):
assert _compare_dict(
one_subst(self.back_table, 'TGG', '*'),
assert cmp_subst(
self.bt.with_dna('TGG', '*'),
{1: set([('G', 'A')]), 2: set([('G', 'A')])})
def test_with_dna_2(self):
assert _compare_dict(
one_subst(self.back_table, 'GTA', 'L'),
assert cmp_subst(
self.bt.with_dna('GTA', 'L'),
{0: set([('G', 'C'), ('G', 'T')])})
def test_without_dna_1(self):
assert _compare_dict(
one_subst_without_dna(self.back_table, 'R', '*'),
assert cmp_subst(
self.bt.without_dna('R', '*'),
{0: set([('C', 'T'), ('A', 'T')])})
def test_without_dna_2(self):
assert _compare_dict(
one_subst_without_dna(self.back_table, 'N', 'K'),
assert cmp_subst(
self.bt.without_dna('N', 'K'),
{2: set([('T', 'G'), ('C', 'A'), ('C', 'G'), ('T', 'A')])})
def test_without_dna_3(self):
assert _compare_dict(
one_subst_without_dna(self.back_table, 'R', 'S'), {
assert cmp_subst(
self.bt.without_dna('R', 'S'), {
0: set([('C', 'A')]),
2: set([('G', 'C'), ('A', 'T'), ('A', 'C'), ('G', 'T')])})
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment