From 73c0862f8f90d1a84fb95e008a234bef0ec8028e Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Fri, 23 Jan 2015 13:09:26 +0100
Subject: [PATCH] Discard incomplete genes in genbank reference files

Many genbank reference files contain more than one gene, especially
slices from an assembly. Some of these genes may be incomplete in
the reference file (i.e., either start or end exceeds the outer
coordinates). We cannot really do anything with these genes, so we
discard them during parsing.
---
 CHANGES.rst                   |  5 ++++-
 mutalyzer/parsers/genbank.py  | 14 ++++++++++++++
 tests/test_parsers_genbank.py | 19 +++++++++++++++++++
 tests/test_variantchecker.py  | 18 +++++++-----------
 4 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index eaf4adf3..093165df 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -10,7 +10,10 @@ Version 2.0.6
 Release date to be decided.
 
 - Added `getGeneLocation` webservice method. Given a gene symbol and optional
-  genome build, it returns the location of the gene.
+  genome build, it returns the location of the gene (`GitHub#28
+  <https://github.com/LUMC/mutalyzer/pull/28>`_).
+- Discard incomplete genes in genbank reference files (`GitHub#26
+  <https://github.com/LUMC/mutalyzer/pull/26>`_).
 
 
 Version 2.0.5
diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py
index 24754598..907ee45f 100644
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -546,6 +546,12 @@ class GBparser():
                 #if
 
                 if i.qualifiers.has_key("gene") :
+                    if not unicode(i.location.start).isdigit() or \
+                       not unicode(i.location.end).isdigit():
+                        # Feature is not completely in reference. Either start
+                        # or end is not a Bio.SeqFeature.ExactPosition.
+                        continue
+
                     geneName = i.qualifiers["gene"][0]
                     if i.type == "gene" :
                         if not geneDict.has_key(geneName) :
@@ -556,6 +562,14 @@ class GBparser():
                             myGene.location = self.__location2pos(i.location)
                             geneDict[geneName] = tempGene(geneName)
                         #if
+                    else:
+                        if geneName not in geneDict:
+                            # We should have seen a gene entry for this gene
+                            # by now. Could be that it was skipped because it
+                            # was not completely in reference (see check
+                            # above). In that case we just ignore any of its
+                            # features.
+                            continue
                     #if
 
                     if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py
index f04b8839..65d10655 100644
--- a/tests/test_parsers_genbank.py
+++ b/tests/test_parsers_genbank.py
@@ -6,16 +6,23 @@ Tests for the mutalyzer.parsers.genbank module.
 from __future__ import unicode_literals
 
 #import logging; logging.basicConfig()
+import os
 
 from mutalyzer.parsers import genbank
+from mutalyzer.config import settings
 
+from fixtures import REFERENCES
+from fixtures import database, cache
 from utils import MutalyzerTest
+from utils import fix
 
 
 class TestMutator(MutalyzerTest):
     """
     Test the mutator module.
     """
+    fixtures = (database, )
+
     def setup(self):
         super(TestMutator, self).setup()
         self.gb_parser = genbank.GBparser()
@@ -35,3 +42,15 @@ class TestMutator(MutalyzerTest):
                  (['a b c d a b', 'a b'], (2, 2))]
         for test in tests:
             assert self.gb_parser._find_mismatch(test[0]) == test[1]
+
+    @fix(cache('A1BG'))
+    def test_only_complete_genes_included(self):
+        """
+        Incomplete genes from the reference file should be ignored.
+        """
+        # contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840
+        # (incomplete).
+        genbank_filename = os.path.join(settings.CACHE_DIR,
+                                        REFERENCES['A1BG']['filename'])
+        record = self.gb_parser.create_record(genbank_filename)
+        assert [g.name for g in record.geneList] == ['A1BG']
diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py
index 5a859b4f..d82e9c83 100644
--- a/tests/test_variantchecker.py
+++ b/tests/test_variantchecker.py
@@ -8,7 +8,6 @@ from __future__ import unicode_literals
 #import logging; logging.basicConfig()
 
 from mutalyzer.output import Output
-from mutalyzer.Retriever import GenBankRetriever
 from mutalyzer.variantchecker import check_variant
 
 from fixtures import REFERENCES
@@ -34,7 +33,6 @@ class TestVariantchecker(MutalyzerTest):
         """
         super(TestVariantchecker, self).setup()
         self.output = Output(__file__)
-        self.retriever = GenBankRetriever(self.output)
 
     @fix(cache('AL449423.14'))
     def test_deletion_in_frame(self):
@@ -1275,17 +1273,16 @@ class TestVariantchecker(MutalyzerTest):
         check_variant('NP_064445.1:p.=', self.output)
         assert len(self.output.getMessagesWithErrorCode('ENOTIMPLEMENTED')) == 1
 
-    @fix(cache('A1BG'))
+    @fix(cache('AF230870.1'))
     def test_wnomrna_other(self):
         """
         Warning for no mRNA field on other than currently selected transcript
         should give WNOMRNA_OTHER warning.
         """
-        # Contains ZNF497 (v1 and v2) with no mRNA
-        ud = REFERENCES['A1BG']['accession']
-        check_variant(ud + '(A1BG_v001):c.13del', self.output)
+        # Contains mtmC2 and mtmB2, both without mRNA
+        check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
         wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
-        assert len(wnomrna_other) == 3
+        assert len(wnomrna_other) == 1
 
     @fix(cache('A1BG'))
     def test_wnomrna(self):
@@ -1293,13 +1290,12 @@ class TestVariantchecker(MutalyzerTest):
         Warning for no mRNA field on currently selected transcript should give
         WNOMRNA warning.
         """
-        # Contains ZNF497 (v1 and v2) with no mRNA
-        ud = REFERENCES['A1BG']['accession']
-        check_variant(ud + '(ZNF497_v001):c.13del', self.output)
+        # Contains mtmC2 and mtmB2, both without mRNA
+        check_variant('AF230870.1(mtmC2_v001):c.13del', self.output)
         wnomrna = self.output.getMessagesWithErrorCode('WNOMRNA')
         wnomrna_other = self.output.getMessagesWithErrorCode('WNOMRNA_OTHER')
         assert len(wnomrna) == 1
-        assert len(wnomrna_other) == 2
+        assert len(wnomrna_other) == 1
 
     @fix(cache('L41870.1'))
     def test_mrna_ref_adjacent_exons_warn(self):
-- 
GitLab