Discard incomplete genes in genbank reference files

Many genbank reference files contain more than one gene, especially slices from an assembly. Some of these genes may be incomplete in the reference file (i.e., either start or end exceeds the outer coordinates). We cannot really do anything with these genes, so we discard them during parsing.

Discard incomplete genes in genbank reference files
73c0862f · Vermaat · 51d8cc50 · 73c0862f · 73c0862f · 73c0862f
Commit 73c0862f authored 10 years ago by Vermaat
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -10,7 +10,10 @@ Version 2.0.6
 Release date to be decided.

 - Added `getGeneLocation` webservice method. Given a gene symbol and optional
-  genome build, it returns the location of the gene.
+  genome build, it returns the location of the gene (`GitHub#28
+  <https://github.com/LUMC/mutalyzer/pull/28>`_).
+- Discard incomplete genes in genbank reference files (`GitHub#26
+  <https://github.com/LUMC/mutalyzer/pull/26>`_).


 Version 2.0.5

--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -546,6 +546,12 @@ class GBparser():
                #if

                if i.qualifiers.has_key("gene") :
+                    if not unicode(i.location.start).isdigit() or \
+                       not unicode(i.location.end).isdigit():
+                        # Feature is not completely in reference. Either start
+                        # or end is not a Bio.SeqFeature.ExactPosition.
+                        continue
+
                    geneName = i.qualifiers["gene"][0]
                    if i.type == "gene" :
                        if not geneDict.has_key(geneName) :
@@ -556,6 +562,14 @@ class GBparser():
                            myGene.location = self.__location2pos(i.location)
                            geneDict[geneName] = tempGene(geneName)
                        #if
+                    else:
+                        if geneName not in geneDict:
+                            # We should have seen a gene entry for this gene
+                            # by now. Could be that it was skipped because it
+                            # was not completely in reference (see check
+                            # above). In that case we just ignore any of its
+                            # features.
+                            continue
                    #if

                    if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",

--- a/tests/test_parsers_genbank.py
+++ b/tests/test_parsers_genbank.py
@@ -6,16 +6,23 @@ Tests for the mutalyzer.parsers.genbank module.
 from __future__ import unicode_literals

 #import logging; logging.basicConfig()
+import os

 from mutalyzer.parsers import genbank
+from mutalyzer.config import settings

+from fixtures import REFERENCES
+from fixtures import database, cache
 from utils import MutalyzerTest
+from utils import fix


 class TestMutator(MutalyzerTest):
    """
    Test the mutator module.
    """
+    fixtures = (database, )
+
    def setup(self):
        super(TestMutator, self).setup()
        self.gb_parser = genbank.GBparser()
@@ -35,3 +42,15 @@ class TestMutator(MutalyzerTest):
                 (['a b c d a b', 'a b'], (2, 2))]
        for test in tests:
            assert self.gb_parser._find_mismatch(test[0]) == test[1]
+
+    @fix(cache('A1BG'))
+    def test_only_complete_genes_included(self):
+        """
+        Incomplete genes from the reference file should be ignored.
+        """
+        # contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840
+        # (incomplete).
+        genbank_filename = os.path.join(settings.CACHE_DIR,
+                                        REFERENCES['A1BG']['filename'])
+        record = self.gb_parser.create_record(genbank_filename)
+        assert [g.name for g in record.geneList] == ['A1BG']
--- a/tests/test_variantchecker.py
+++ b/tests/test_variantchecker.py
@@ -8,7 +8,6 @@ from __future__ import unicode_literals
 #import logging; logging.basicConfig()

 from mutalyzer.output import Output
-from mutalyzer.Retriever import GenBankRetriever
 from mutalyzer.variantchecker import check_variant

 from fixtures import REFERENCES
@@ -34,7 +33,6 @@ class TestVariantchecker(MutalyzerTest):
        """
        super(TestVariantchecker, self).setup()
        self.output = Output(__file__)
-        self.retriever = GenBankRetriever(self.output)

    @fix(cache('AL449423.14'))
    def test_deletion_in_frame(self):
@@ -1275,17 +1273,16 @@ class TestVariantchecker(MutalyzerTest):
        check_variant('NP_064445.1:p.=', self.output)
        assert len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')) == 1

-    @fix(cache('A1BG'))
+    @fix(cache('AF230870.1'))
    def test_wnomrna_other(self):
        """
        Warning for no mRNA field on other than currently selected transcript
        should give WNOMRNA_OTHER warning.
        """
-        # Contains ZNF497 (v1 and v2) with no mRNA
-        ud = REFERENCES['A1BG']['accession']
-        check_variant(ud + '(A1BG_v001):c.13del', self.output)
+        # Contains mtmC2 and mtmB2, both without mRNA
+        check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
        wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
-        assert len(wnomrna_other) == 3
+        assert len(wnomrna_other) == 1

    @fix(cache('A1BG'))
    def test_wnomrna(self):
@@ -1293,13 +1290,12 @@ class TestVariantchecker(MutalyzerTest):
        Warning for no mRNA field on currently selected transcript should give
        WNOMRNA warning.
        """
-        # Contains ZNF497 (v1 and v2) with no mRNA
-        ud = REFERENCES['A1BG']['accession']
-        check_variant(ud + '(ZNF497_v001):c.13del', self.output)
+        # Contains mtmC2 and mtmB2, both without mRNA
+        check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
        wnomrna = self.output.getMessagesWithErrorCode('WNOMRNA')
        wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
        assert len(wnomrna) == 1
-        assert len(wnomrna_other) == 2
+        assert len(wnomrna_other) == 1

    @fix(cache('L41870.1'))
    def test_mrna_ref_adjacent_exons_warn(self):