diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index 266def8fb4348ec65dd7e72bf1705eba6e46824d..65df3512ccfead230e902bb1353be2d3be0558dd 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -57,12 +57,14 @@ class GBparser(): """ @todo: documentation """ - def __location2pos(self, location): + def __location2pos(self, location, require_exact=True): """ Convert a location object to a tuple of integers. @arg location: A location object (see the BioPython documentation) @type location: location object + @arg require_exact: Require exact positions. + @type require_exact: bool @return: A tuple of integers @rtype: list @@ -70,10 +72,10 @@ class GBparser(): ret = [] - if not unicode(location.start).isdigit() or \ - not unicode(location.end).isdigit() : - return None - #if + if require_exact: + if not unicode(location.start).isdigit() or \ + not unicode(location.end).isdigit() : + return None ret.append(location.start.position + 1) ret.append(location.end.position) @@ -81,12 +83,14 @@ class GBparser(): return ret #__location2pos - def __locationList2posList(self, locationList): + def __location2posList(self, location, require_exact=True): """ - Convert a list of locations to a list of integers. + Convert a location object to a list of integers. - @arg locationList: A list of locations (see the BioPython documentation) - @type locationList: list (location objects) + @arg location: A location object (see the BioPython documentation) + @type location: location object + @arg require_exact: Require exact positions. + @type require_exact: bool @return: A list (of even length) of integers @rtype: list (integers) @@ -94,25 +98,29 @@ class GBparser(): ret = [] - if not unicode(locationList.location.start).isdigit() or \ - not unicode(locationList.location.end).isdigit() : - return None + if require_exact: + if not unicode(location.start).isdigit() or \ + not unicode(location.end).isdigit() : + return None #if - for i in locationList.sub_features : - if i.ref : # This is a workaround for a bug in BioPython. - ret = None - break - #if - temp = self.__location2pos(i.location) - if temp : - ret.append(temp[0]) - ret.append(temp[1]) + for part in location.parts[::location.strand]: + pos = self.__location2pos(part, require_exact=require_exact) + if not pos: + return None + + ret.append(pos[0]) + ret.append(pos[1]) #if #for + if not ret: + # No subfeatures found, in that case just use the feature itself + # as if it were its only subfeature. + ret = self.__location2pos(location, require_exact=require_exact) + return ret - #__locationList2posList + #__location2posList def _find_mismatch(self, sentences): """ @@ -227,11 +235,20 @@ class GBparser(): accession, int(version), match_version=False)[0] except ncbi.NoLinkError: pass - i.positionList = self.__locationList2posList(i) + i.original_location = i.location + if i.ref: + # This is a workaround for a bug in BioPython. + # But seriously I have no idea for which bug and couldn't find + # any hints in the commit history. So I just copied it over + # with the last changes to this code, but it can probably be + # removed. + i.positionList = None + else: + i.positionList = self.__location2posList(i.location) i.location = self.__location2pos(i.location) #FIXME #if not i.positionList : # FIXME ??? # i.positionList = i.location - if i.positionList or i.location : + if i.positionList : i.usable = True else : i.usable = False @@ -305,6 +322,13 @@ class GBparser(): mrnaList = mrna.positionList if not mrnaList : mrnaList = mrna.location + if not mrnaList : + # If the mRNA doesn't have exact positions (e.g., it's annotated + # at `join(<1..11,214..548,851..4143)`), we still want to use the + # part that is in this reference for matching. + mrnaList = self.__location2posList(mrna.original_location, + require_exact=False) + cdsList = cds.positionList if not cdsList : cdsList = cds.location @@ -493,12 +517,6 @@ class GBparser(): #if if i.qualifiers.has_key("gene") : - if not unicode(i.location.start).isdigit() or \ - not unicode(i.location.end).isdigit(): - # Feature is not completely in reference. Either start - # or end is not a Bio.SeqFeature.ExactPosition. - continue - geneName = i.qualifiers["gene"][0] if i.type == "gene" : if not geneDict.has_key(geneName) : @@ -509,14 +527,6 @@ class GBparser(): myGene.location = self.__location2pos(i.location) geneDict[geneName] = tempGene(geneName) #if - else: - if geneName not in geneDict: - # We should have seen a gene entry for this gene - # by now. Could be that it was skipped because it - # was not completely in reference (see check - # above). In that case we just ignore any of its - # features. - continue #if if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", @@ -537,8 +547,15 @@ class GBparser(): myGene = geneDict[j] self.link(myGene.rnaList, myGene.cdsList) for i in myGene.rnaList : + myRealGene = record.findGene(i.gene) + version = myRealGene.newLocusTag() + # TODO: Here we discard transcripts that are not complete + # in this reference, but it might be nicer to still keep + # them so that we can (for example) show them in the + # legend. Of course they should still not be allowed to be + # selected in the variant description. + # (Same for leftover CDS features below.) if i.usable : - myRealGene = record.findGene(i.gene) if i.locus_tag : # Note: We use the last three characters of the # locus_tag as a unique transcript version id. @@ -550,14 +567,13 @@ class GBparser(): # underscore. Or prepended with a letter. We # really want a number, so 'fix' this by only # looking for a numeric part. + # (Same for leftover CDS features below.) try: version = LOCUS_TAG_VERSION.findall( i.locus_tag)[0].zfill(3) except IndexError: - version = '000' - myTranscript = Locus(version) - else : - myTranscript = Locus(myRealGene.newLocusTag()) + pass + myTranscript = Locus(version) myTranscript.mRNA = PList() myTranscript.mRNA.positionList = i.positionList myTranscript.mRNA.location = i.location @@ -580,38 +596,33 @@ class GBparser(): myRealGene.transcriptList.append(myTranscript) #if #for + + # We now look for leftover CDS entries that were not linked to + # any transcript. We add them and the RNA will be constructed + # for them later. + # This does mean that these transcripts always come last (and + # are shown last in for example the legend). for i in myGene.cdsList : - if not i.linked and \ - (i.usable or not geneDict[myGene.name].rnaList) : + if not i.linked: myRealGene = record.findGene(i.gene) - if i.locus_tag : - # Note: We use the last three characters of the - # locus_tag as a unique transcript version id. - # This is also used to for the protein-transcript - # link table. - # Normally, locus_tag ends with three digits, but - # for some (e.g. mobA on NC_011228, a plasmid) it - # ends with two digits prepended with an - # underscore. Or prepended with a letter. We - # really want a number, so 'fix' this by only - # looking for a numeric part. - try: - version = LOCUS_TAG_VERSION.findall( - i.locus_tag)[0].zfill(3) - except IndexError: - version = '000' + version = myRealGene.newLocusTag() + if i.usable: + if i.locus_tag : + try: + version = LOCUS_TAG_VERSION.findall( + i.locus_tag)[0].zfill(3) + except IndexError: + pass myTranscript = Locus(version) - else : - myTranscript = Locus(myRealGene.newLocusTag()) - myTranscript.CDS = PList() - myTranscript.CDS.positionList = i.positionList - myTranscript.CDS.location = i.location - myTranscript.proteinID = i.protein_id - myTranscript.proteinProduct = i.product - if i.qualifiers.has_key("transl_table") : - myTranscript.txTable = \ - int(i.qualifiers["transl_table"][0]) - myRealGene.transcriptList.append(myTranscript) + myTranscript.CDS = PList() + myTranscript.CDS.positionList = i.positionList + myTranscript.CDS.location = i.location + myTranscript.proteinID = i.protein_id + myTranscript.proteinProduct = i.product + if i.qualifiers.has_key("transl_table") : + myTranscript.txTable = \ + int(i.qualifiers["transl_table"][0]) + myRealGene.transcriptList.append(myTranscript) #if #if #for @@ -655,9 +666,10 @@ class GBparser(): #if #if #else - for i in record.geneList : - if not i.transcriptList : - record.geneList.remove(i) + + # Discard genes for which we haven't constructed any transcripts. + record.geneList = [gene for gene in record.geneList + if gene.transcriptList] return record #create_record diff --git a/tests/data/UD_143772172095.gb.bz2 b/tests/data/UD_143772172095.gb.bz2 index 30f97e92f1f2b746cb17d66fb9b9c69f83dc1c0d..bfcffb1973cb4988447e6e979bafa5212e5241c1 100644 Binary files a/tests/data/UD_143772172095.gb.bz2 and b/tests/data/UD_143772172095.gb.bz2 differ diff --git a/tests/data/UD_144959560058.gb.bz2 b/tests/data/UD_144959560058.gb.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..72e9dd0f00e3dda68e2e7d6d21d32719938d67bf Binary files /dev/null and b/tests/data/UD_144959560058.gb.bz2 differ diff --git a/tests/data/references.yml b/tests/data/references.yml index 6ab753a8cb1337c8818bc44ad60551c93e8127d5..4a18ca699fe470aa7304d66729cb9035fc6deaff 100644 --- a/tests/data/references.yml +++ b/tests/data/references.yml @@ -44,6 +44,31 @@ COL1A1: - XP_005257115 - - NM_000088 - NP_000079 +PIK3R2: + accession: UD_144959560058 + checksum: f696ee19bba83e899ed8c0f2c2f2ebc4 + filename: UD_144959560058.gb.bz2 + links: + - - XM_005259824 + - XP_005259881 + - - XM_005259825 + - XP_005259882 + - - NM_015016 + - NP_055831 + - - XM_005259822 + - XP_005259879 + - - XM_005259828 + - null + - - XM_005259823 + - XP_005259880 + - - XM_005259827 + - XP_005259884 + - - XM_005259826 + - XP_005259883 + - - NR_073517 + - null + - - NM_005027 + - NP_005018 DMD: accession: UD_139262478721 checksum: d41d8cd98f00b204e9800998ecf8427e diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index f997e89c9999d2c98fe691859b0b4b58e40ca06d..4c27c9a2780b99977f0914f6914109561987ca88 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected): assert parser._find_mismatch(products) == expected +@with_references('AB026906.1') +def test_include_cds_without_mrna(settings, references, parser): + """ + Annotated CDS without mRNA feature should be included since Mutalyzer can + construct the RNA from the CDS. + """ + # Contains one gene with only a CDS annotated, no mRNA. + accession = references[0].accession + filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) + record = parser.create_record(filename) + assert record.geneList[0].transcriptList[0].name == '001' + + @with_references('A1BG') -def test_only_complete_genes_included(settings, references, parser): +def test_only_complete_mrna_included(settings, references, parser): """ - Incomplete genes from the reference file should be ignored. + Incomplete transcripts from the reference file should be ignored. """ - # contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840 - # (incomplete). + # Contains A1BG (two complete transcripts) and A1BG-AS1, ZNF497, + # LOC100419840 (no complete transcripts). accession = references[0].accession filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) record = parser.create_record(filename) assert [g.name for g in record.geneList] == ['A1BG'] + assert len(record.geneList[0].transcriptList) == 2 + + +@with_references('PIK3R2') +def test_complete_and_incomplete_mrna(settings, references, parser): + """ + Incomplete transcripts from the reference file should be ignored, but the + gene should be included if it contains another complete transcript. + """ + # Contains MAST3 without complete transcripts and PIK3R2 with one complete + # and one incomplete transcript. + accession = references[0].accession + filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) + record = parser.create_record(filename) + assert [g.name for g in record.geneList] == ['PIK3R2'] + assert len(record.geneList[0].transcriptList) == 1 + @with_references('ADAC') def test_no_version(settings, references, parser):