Merge pull request #138 from mutalyzer/genbank-incomplete-genes

Add gene feature to genbank file without version

Merge pull request #138 from mutalyzer/genbank-incomplete-genes
bd705d16 · Vermaat · 17a825d3 · 8fac2dc7 · bd705d16 · bd705d16
Commit bd705d16 authored 9 years ago by Vermaat
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -57,12 +57,14 @@ class GBparser():
    """
    @todo: documentation
    """
-    def __location2pos(self, location):
+    def __location2pos(self, location, require_exact=True):
        """
        Convert a location object to a tuple of integers.
        @arg location: A location object (see the BioPython documentation)
        @type location: location object
+        @arg require_exact: Require exact positions.
+        @type require_exact: bool
        @return: A tuple of integers
        @rtype: list
@@ -70,10 +72,10 @@ class GBparser():
        ret = []
-        if not unicode(location.start).isdigit() or \
+        if require_exact:
-           not unicode(location.end).isdigit() :
+            if not unicode(location.start).isdigit() or \
-            return None
+               not unicode(location.end).isdigit() :
-        #if
+                return None
        ret.append(location.start.position + 1)
        ret.append(location.end.position)
@@ -81,12 +83,14 @@ class GBparser():
        return ret
    #__location2pos
-    def __locationList2posList(self, locationList):
+    def __location2posList(self, location, require_exact=True):
        """
-        Convert a list of locations to a list of integers.
+        Convert a location object to a list of integers.
-        @arg locationList: A list of locations (see the BioPython documentation)
+        @arg location: A location object (see the BioPython documentation)
-        @type locationList: list (location objects)
+        @type location: location object
+        @arg require_exact: Require exact positions.
+        @type require_exact: bool
        @return: A list (of even length) of integers
        @rtype: list (integers)
@@ -94,25 +98,29 @@ class GBparser():
        ret = []
-        if not unicode(locationList.location.start).isdigit() or \
+        if require_exact:
-           not unicode(locationList.location.end).isdigit() :
+            if not unicode(location.start).isdigit() or \
-            return None
+               not unicode(location.end).isdigit() :
+                return None
        #if
-        for i in locationList.sub_features :
+        for part in location.parts[::location.strand]:
-            if i.ref : # This is a workaround for a bug in BioPython.
+            pos = self.__location2pos(part, require_exact=require_exact)
-                ret = None
+            if not pos:
-                break
+                return None
-            #if
-            temp = self.__location2pos(i.location)
+            ret.append(pos[0])
-            if temp :
+            ret.append(pos[1])
-                ret.append(temp[0])
-                ret.append(temp[1])
            #if
        #for
+        if not ret:
+            # No subfeatures found, in that case just use the feature itself
+            # as if it were its only subfeature.
+            ret = self.__location2pos(location, require_exact=require_exact)
        return ret
-    #__locationList2posList
+    #__location2posList
    def _find_mismatch(self, sentences):
        """
@@ -227,11 +235,20 @@ class GBparser():
                        accession, int(version), match_version=False)[0]
                except ncbi.NoLinkError:
                    pass
-            i.positionList = self.__locationList2posList(i)
+            i.original_location = i.location
+            if i.ref:
+                # This is a workaround for a bug in BioPython.
+                # But seriously I have no idea for which bug and couldn't find
+                # any hints in the commit history. So I just copied it over
+                # with the last changes to this code, but it can probably be
+                # removed.
+                i.positionList = None
+            else:
+                i.positionList = self.__location2posList(i.location)
            i.location = self.__location2pos(i.location) #FIXME
            #if not i.positionList : # FIXME ???
            #    i.positionList = i.location
-            if i.positionList or i.location :
+            if i.positionList :
                i.usable = True
            else :
                i.usable = False
@@ -305,6 +322,13 @@ class GBparser():
        mrnaList = mrna.positionList
        if not mrnaList :
            mrnaList = mrna.location
+        if not mrnaList :
+            # If the mRNA doesn't have exact positions (e.g., it's annotated
+            # at `join(<1..11,214..548,851..4143)`), we still want to use the
+            # part that is in this reference for matching.
+            mrnaList = self.__location2posList(mrna.original_location,
+                                               require_exact=False)
        cdsList = cds.positionList
        if not cdsList :
            cdsList = cds.location
@@ -493,12 +517,6 @@ class GBparser():
                #if
                if i.qualifiers.has_key("gene") :
-                    if not unicode(i.location.start).isdigit() or \
-                       not unicode(i.location.end).isdigit():
-                        # Feature is not completely in reference. Either start
-                        # or end is not a Bio.SeqFeature.ExactPosition.
-                        continue
                    geneName = i.qualifiers["gene"][0]
                    if i.type == "gene" :
                        if not geneDict.has_key(geneName) :
@@ -509,14 +527,6 @@ class GBparser():
                            myGene.location = self.__location2pos(i.location)
                            geneDict[geneName] = tempGene(geneName)
                        #if
-                    else:
-                        if geneName not in geneDict:
-                            # We should have seen a gene entry for this gene
-                            # by now. Could be that it was skipped because it
-                            # was not completely in reference (see check
-                            # above). In that case we just ignore any of its
-                            # features.
-                            continue
                    #if
                    if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
@@ -537,8 +547,15 @@ class GBparser():
                myGene = geneDict[j]
                self.link(myGene.rnaList, myGene.cdsList)
                for i in myGene.rnaList :
+                    myRealGene = record.findGene(i.gene)
+                    version = myRealGene.newLocusTag()
+                    # TODO: Here we discard transcripts that are not complete
+                    # in this reference, but it might be nicer to still keep
+                    # them so that we can (for example) show them in the
+                    # legend. Of course they should still not be allowed to be
+                    # selected in the variant description.
+                    # (Same for leftover CDS features below.)
                    if i.usable :
-                        myRealGene = record.findGene(i.gene)
                        if i.locus_tag :
                            # Note: We use the last three characters of the
                            # locus_tag as a unique transcript version id.
@@ -550,14 +567,13 @@ class GBparser():
                            # underscore. Or prepended with a letter. We
                            # really want a number, so 'fix' this by only
                            # looking for a numeric part.
+                            # (Same for leftover CDS features below.)
                            try:
                                version = LOCUS_TAG_VERSION.findall(
                                    i.locus_tag)[0].zfill(3)
                            except IndexError:
-                                version = '000'
+                                pass
-                            myTranscript = Locus(version)
+                        myTranscript = Locus(version)
-                        else :
-                            myTranscript = Locus(myRealGene.newLocusTag())
                        myTranscript.mRNA = PList()
                        myTranscript.mRNA.positionList = i.positionList
                        myTranscript.mRNA.location = i.location
@@ -580,38 +596,33 @@ class GBparser():
                        myRealGene.transcriptList.append(myTranscript)
                    #if
                #for
+                # We now look for leftover CDS entries that were not linked to
+                # any transcript. We add them and the RNA will be constructed
+                # for them later.
+                # This does mean that these transcripts always come last (and
+                # are shown last in for example the legend).
                for i in myGene.cdsList :
-                    if not i.linked and \
+                    if not i.linked:
-                       (i.usable or not geneDict[myGene.name].rnaList) :
                        myRealGene = record.findGene(i.gene)
-                        if i.locus_tag :
+                        version = myRealGene.newLocusTag()
-                            # Note: We use the last three characters of the
+                        if i.usable:
-                            # locus_tag as a unique transcript version id.
+                            if i.locus_tag :
-                            # This is also used to for the protein-transcript
+                                try:
-                            # link table.
+                                    version = LOCUS_TAG_VERSION.findall(
-                            # Normally, locus_tag ends with three digits, but
+                                        i.locus_tag)[0].zfill(3)
-                            # for some (e.g. mobA on NC_011228, a plasmid) it
+                                except IndexError:
-                            # ends with two digits prepended with an
+                                    pass
-                            # underscore. Or prepended with a letter. We
-                            # really want a number, so 'fix' this by only
-                            # looking for a numeric part.
-                            try:
-                                version = LOCUS_TAG_VERSION.findall(
-                                    i.locus_tag)[0].zfill(3)
-                            except IndexError:
-                                version = '000'
                            myTranscript = Locus(version)
-                        else :
+                            myTranscript.CDS = PList()
-                            myTranscript = Locus(myRealGene.newLocusTag())
+                            myTranscript.CDS.positionList = i.positionList
-                        myTranscript.CDS = PList()
+                            myTranscript.CDS.location = i.location
-                        myTranscript.CDS.positionList = i.positionList
+                            myTranscript.proteinID = i.protein_id
-                        myTranscript.CDS.location = i.location
+                            myTranscript.proteinProduct = i.product
-                        myTranscript.proteinID = i.protein_id
+                            if i.qualifiers.has_key("transl_table") :
-                        myTranscript.proteinProduct = i.product
+                                myTranscript.txTable = \
-                        if i.qualifiers.has_key("transl_table") :
+                                    int(i.qualifiers["transl_table"][0])
-                            myTranscript.txTable = \
+                            myRealGene.transcriptList.append(myTranscript)
-                                int(i.qualifiers["transl_table"][0])
-                        myRealGene.transcriptList.append(myTranscript)
                        #if
                    #if
                #for
@@ -655,9 +666,10 @@ class GBparser():
                #if
            #if
        #else
-        for i in record.geneList :
-            if not i.transcriptList :
+        # Discard genes for which we haven't constructed any transcripts.
-                record.geneList.remove(i)
+        record.geneList = [gene for gene in record.geneList
+                           if gene.transcriptList]
        return record
    #create_record

--- a/tests/data/UD_143772172095.gb.bz2
+++ b/tests/data/UD_143772172095.gb.bz2
--- a/tests/data/UD_144959560058.gb.bz2
+++ b/tests/data/UD_144959560058.gb.bz2
--- a/tests/data/references.yml
+++ b/tests/data/references.yml
@@ -44,6 +44,31 @@ COL1A1:
    - XP_005257115
  - - NM_000088
    - NP_000079
+PIK3R2:
+  accession: UD_144959560058
+  checksum: f696ee19bba83e899ed8c0f2c2f2ebc4
+  filename: UD_144959560058.gb.bz2
+  links:
+  - - XM_005259824
+    - XP_005259881
+  - - XM_005259825
+    - XP_005259882
+  - - NM_015016
+    - NP_055831
+  - - XM_005259822
+    - XP_005259879
+  - - XM_005259828
+    - null
+  - - XM_005259823
+    - XP_005259880
+  - - XM_005259827
+    - XP_005259884
+  - - XM_005259826
+    - XP_005259883
+  - - NR_073517
+    - null
+  - - NM_005027
+    - NP_005018
 DMD:
  accession: UD_139262478721
  checksum: d41d8cd98f00b204e9800998ecf8427e

--- a/tests/test_parsers_genbank.py
+++ b/tests/test_parsers_genbank.py
@@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected):
    assert parser._find_mismatch(products) == expected
+@with_references('AB026906.1')
+def test_include_cds_without_mrna(settings, references, parser):
+    """
+    Annotated CDS without mRNA feature should be included since Mutalyzer can
+    construct the RNA from the CDS.
+    """
+    # Contains one gene with only a CDS annotated, no mRNA.
+    accession = references[0].accession
+    filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
+    record = parser.create_record(filename)
+    assert record.geneList[0].transcriptList[0].name == '001'
 @with_references('A1BG')
-def test_only_complete_genes_included(settings, references, parser):
+def test_only_complete_mrna_included(settings, references, parser):
    """
-    Incomplete genes from the reference file should be ignored.
+    Incomplete transcripts from the reference file should be ignored.
    """
-    # contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840
+    # Contains A1BG (two complete transcripts) and A1BG-AS1, ZNF497,
-    # (incomplete).
+    # LOC100419840 (no complete transcripts).
    accession = references[0].accession
    filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
    record = parser.create_record(filename)
    assert [g.name for g in record.geneList] == ['A1BG']
+    assert len(record.geneList[0].transcriptList) == 2
+@with_references('PIK3R2')
+def test_complete_and_incomplete_mrna(settings, references, parser):
+    """
+    Incomplete transcripts from the reference file should be ignored, but the
+    gene should be included if it contains another complete transcript.
+    """
+    # Contains MAST3 without complete transcripts and PIK3R2 with one complete
+    # and one incomplete transcript.
+    accession = references[0].accession
+    filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
+    record = parser.create_record(filename)
+    assert [g.name for g in record.geneList] == ['PIK3R2']
+    assert len(record.geneList[0].transcriptList) == 1
 @with_references('ADAC')
 def test_no_version(settings, references, parser):