diff --git a/CHANGES.rst b/CHANGES.rst index eaf4adf3145189a50f3a3754a5ab4f1deefbe31a..093165df7ef6d7b50d48f7f69685074a5ec280d7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,7 +10,10 @@ Version 2.0.6 Release date to be decided. - Added `getGeneLocation` webservice method. Given a gene symbol and optional - genome build, it returns the location of the gene. + genome build, it returns the location of the gene (`GitHub#28 + <https://github.com/LUMC/mutalyzer/pull/28>`_). +- Discard incomplete genes in genbank reference files (`GitHub#26 + <https://github.com/LUMC/mutalyzer/pull/26>`_). Version 2.0.5 diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 247545989e105702211e0c796a88b256edd40f3d..907ee45f3eb79dcdafc88a3d7e8bdeb1e85cff1f 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -546,6 +546,12 @@ class GBparser(): #if if i.qualifiers.has_key("gene") : + if not unicode(i.location.start).isdigit() or \ + not unicode(i.location.end).isdigit(): + # Feature is not completely in reference. Either start + # or end is not a Bio.SeqFeature.ExactPosition. + continue + geneName = i.qualifiers["gene"][0] if i.type == "gene" : if not geneDict.has_key(geneName) : @@ -556,6 +562,14 @@ class GBparser(): myGene.location = self.__location2pos(i.location) geneDict[geneName] = tempGene(geneName) #if + else: + if geneName not in geneDict: + # We should have seen a gene entry for this gene + # by now. Could be that it was skipped because it + # was not completely in reference (see check + # above). In that case we just ignore any of its + # features. + continue #if if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index f04b883971617ee9885ba3478bd667c674189650..65d10655707919f0c193d6a7ee16187081ee4632 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -6,16 +6,23 @@ Tests for the mutalyzer.parsers.genbank module. from __future__ import unicode_literals #import logging; logging.basicConfig() +import os from mutalyzer.parsers import genbank +from mutalyzer.config import settings +from fixtures import REFERENCES +from fixtures import database, cache from utils import MutalyzerTest +from utils import fix class TestMutator(MutalyzerTest): """ Test the mutator module. """ + fixtures = (database, ) + def setup(self): super(TestMutator, self).setup() self.gb_parser = genbank.GBparser() @@ -35,3 +42,15 @@ class TestMutator(MutalyzerTest): (['a b c d a b', 'a b'], (2, 2))] for test in tests: assert self.gb_parser._find_mismatch(test[0]) == test[1] + + @fix(cache('A1BG')) + def test_only_complete_genes_included(self): + """ + Incomplete genes from the reference file should be ignored. + """ + # contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840 + # (incomplete). + genbank_filename = os.path.join(settings.CACHE_DIR, + REFERENCES['A1BG']['filename']) + record = self.gb_parser.create_record(genbank_filename) + assert [g.name for g in record.geneList] == ['A1BG'] diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index 5a859b4f695f8ec1a8554fa7447ad16c051bb486..d82e9c83137d6bd9055a5b1bcd9f4710731bd55b 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -8,7 +8,6 @@ from __future__ import unicode_literals #import logging; logging.basicConfig() from mutalyzer.output import Output -from mutalyzer.Retriever import GenBankRetriever from mutalyzer.variantchecker import check_variant from fixtures import REFERENCES @@ -34,7 +33,6 @@ class TestVariantchecker(MutalyzerTest): """ super(TestVariantchecker, self).setup() self.output = Output(__file__) - self.retriever = GenBankRetriever(self.output) @fix(cache('AL449423.14')) def test_deletion_in_frame(self): @@ -1275,17 +1273,16 @@ class TestVariantchecker(MutalyzerTest): check_variant('NP_064445.1:p.=', self.output) assert len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')) == 1 - @fix(cache('A1BG')) + @fix(cache('AF230870.1')) def test_wnomrna_other(self): """ Warning for no mRNA field on other than currently selected transcript should give WNOMRNA_OTHER warning. """ - # Contains ZNF497 (v1 and v2) with no mRNA - ud = REFERENCES['A1BG']['accession'] - check_variant(ud + '(A1BG_v001):c.13del', self.output) + # Contains mtmC2 and mtmB2, both without mRNA + check_variant('AF230870.1(mtmC2_v001):c.13del', self.output) wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER') - assert len(wnomrna_other) == 3 + assert len(wnomrna_other) == 1 @fix(cache('A1BG')) def test_wnomrna(self): @@ -1293,13 +1290,12 @@ class TestVariantchecker(MutalyzerTest): Warning for no mRNA field on currently selected transcript should give WNOMRNA warning. """ - # Contains ZNF497 (v1 and v2) with no mRNA - ud = REFERENCES['A1BG']['accession'] - check_variant(ud + '(ZNF497_v001):c.13del', self.output) + # Contains mtmC2 and mtmB2, both without mRNA + check_variant('AF230870.1(mtmC2_v001):c.13del', self.output) wnomrna = self.output.getMessagesWithErrorCode('WNOMRNA') wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER') assert len(wnomrna) == 1 - assert len(wnomrna_other) == 2 + assert len(wnomrna_other) == 1 @fix(cache('L41870.1')) def test_mrna_ref_adjacent_exons_warn(self):