Skip to content
Snippets Groups Projects
Commit 73c0862f authored by Vermaat's avatar Vermaat
Browse files

Discard incomplete genes in genbank reference files

Many genbank reference files contain more than one gene, especially
slices from an assembly. Some of these genes may be incomplete in
the reference file (i.e., either start or end exceeds the outer
coordinates). We cannot really do anything with these genes, so we
discard them during parsing.
parent 51d8cc50
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,10 @@ Version 2.0.6
Release date to be decided.
- Added `getGeneLocation` webservice method. Given a gene symbol and optional
genome build, it returns the location of the gene.
genome build, it returns the location of the gene (`GitHub#28
<https://github.com/LUMC/mutalyzer/pull/28>`_).
- Discard incomplete genes in genbank reference files (`GitHub#26
<https://github.com/LUMC/mutalyzer/pull/26>`_).
Version 2.0.5
......
......@@ -546,6 +546,12 @@ class GBparser():
#if
if i.qualifiers.has_key("gene") :
if not unicode(i.location.start).isdigit() or \
not unicode(i.location.end).isdigit():
# Feature is not completely in reference. Either start
# or end is not a Bio.SeqFeature.ExactPosition.
continue
geneName = i.qualifiers["gene"][0]
if i.type == "gene" :
if not geneDict.has_key(geneName) :
......@@ -556,6 +562,14 @@ class GBparser():
myGene.location = self.__location2pos(i.location)
geneDict[geneName] = tempGene(geneName)
#if
else:
if geneName not in geneDict:
# We should have seen a gene entry for this gene
# by now. Could be that it was skipped because it
# was not completely in reference (see check
# above). In that case we just ignore any of its
# features.
continue
#if
if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
......
......@@ -6,16 +6,23 @@ Tests for the mutalyzer.parsers.genbank module.
from __future__ import unicode_literals
#import logging; logging.basicConfig()
import os
from mutalyzer.parsers import genbank
from mutalyzer.config import settings
from fixtures import REFERENCES
from fixtures import database, cache
from utils import MutalyzerTest
from utils import fix
class TestMutator(MutalyzerTest):
"""
Test the mutator module.
"""
fixtures = (database, )
def setup(self):
super(TestMutator, self).setup()
self.gb_parser = genbank.GBparser()
......@@ -35,3 +42,15 @@ class TestMutator(MutalyzerTest):
(['a b c d a b', 'a b'], (2, 2))]
for test in tests:
assert self.gb_parser._find_mismatch(test[0]) == test[1]
@fix(cache('A1BG'))
def test_only_complete_genes_included(self):
"""
Incomplete genes from the reference file should be ignored.
"""
# contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840
# (incomplete).
genbank_filename = os.path.join(settings.CACHE_DIR,
REFERENCES['A1BG']['filename'])
record = self.gb_parser.create_record(genbank_filename)
assert [g.name for g in record.geneList] == ['A1BG']
......@@ -8,7 +8,6 @@ from __future__ import unicode_literals
#import logging; logging.basicConfig()
from mutalyzer.output import Output
from mutalyzer.Retriever import GenBankRetriever
from mutalyzer.variantchecker import check_variant
from fixtures import REFERENCES
......@@ -34,7 +33,6 @@ class TestVariantchecker(MutalyzerTest):
"""
super(TestVariantchecker, self).setup()
self.output = Output(__file__)
self.retriever = GenBankRetriever(self.output)
@fix(cache('AL449423.14'))
def test_deletion_in_frame(self):
......@@ -1275,17 +1273,16 @@ class TestVariantchecker(MutalyzerTest):
check_variant('NP_064445.1:p.=', self.output)
assert len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')) == 1
@fix(cache('A1BG'))
@fix(cache('AF230870.1'))
def test_wnomrna_other(self):
"""
Warning for no mRNA field on other than currently selected transcript
should give WNOMRNA_OTHER warning.
"""
# Contains ZNF497 (v1 and v2) with no mRNA
ud = REFERENCES['A1BG']['accession']
check_variant(ud + '(A1BG_v001):c.13del', self.output)
# Contains mtmC2 and mtmB2, both without mRNA
check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
assert len(wnomrna_other) == 3
assert len(wnomrna_other) == 1
@fix(cache('A1BG'))
def test_wnomrna(self):
......@@ -1293,13 +1290,12 @@ class TestVariantchecker(MutalyzerTest):
Warning for no mRNA field on currently selected transcript should give
WNOMRNA warning.
"""
# Contains ZNF497 (v1 and v2) with no mRNA
ud = REFERENCES['A1BG']['accession']
check_variant(ud + '(ZNF497_v001):c.13del', self.output)
# Contains mtmC2 and mtmB2, both without mRNA
check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
wnomrna = self.output.getMessagesWithErrorCode('WNOMRNA')
wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
assert len(wnomrna) == 1
assert len(wnomrna_other) == 2
assert len(wnomrna_other) == 1
@fix(cache('L41870.1'))
def test_mrna_ref_adjacent_exons_warn(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment