Skip to content
Snippets Groups Projects
Commit 8fac2dc7 authored by Vermaat's avatar Vermaat
Browse files

Keep incomplete genes with complete features

With this change the genbank parser no longer discards incomplete genes
directly but keeps them as long as they have complete features
annotated.

For example, the PIK3R2 gene is annotated on NC_000019.9 (or a slice) as
4973..>22328 with two RNA entries. One of these, however, is complete so
it would be a shame to discard the entire gene.
parent c1ea8bc3
No related branches found
No related tags found
No related merge requests found
...@@ -57,12 +57,14 @@ class GBparser(): ...@@ -57,12 +57,14 @@ class GBparser():
""" """
@todo: documentation @todo: documentation
""" """
def __location2pos(self, location): def __location2pos(self, location, require_exact=True):
""" """
Convert a location object to a tuple of integers. Convert a location object to a tuple of integers.
@arg location: A location object (see the BioPython documentation) @arg location: A location object (see the BioPython documentation)
@type location: location object @type location: location object
@arg require_exact: Require exact positions.
@type require_exact: bool
@return: A tuple of integers @return: A tuple of integers
@rtype: list @rtype: list
...@@ -70,10 +72,10 @@ class GBparser(): ...@@ -70,10 +72,10 @@ class GBparser():
ret = [] ret = []
if not unicode(location.start).isdigit() or \ if require_exact:
not unicode(location.end).isdigit() : if not unicode(location.start).isdigit() or \
return None not unicode(location.end).isdigit() :
#if return None
ret.append(location.start.position + 1) ret.append(location.start.position + 1)
ret.append(location.end.position) ret.append(location.end.position)
...@@ -81,12 +83,14 @@ class GBparser(): ...@@ -81,12 +83,14 @@ class GBparser():
return ret return ret
#__location2pos #__location2pos
def __locationList2posList(self, locationList): def __location2posList(self, location, require_exact=True):
""" """
Convert a list of locations to a list of integers. Convert a location object to a list of integers.
@arg locationList: A list of locations (see the BioPython documentation) @arg location: A location object (see the BioPython documentation)
@type locationList: list (location objects) @type location: location object
@arg require_exact: Require exact positions.
@type require_exact: bool
@return: A list (of even length) of integers @return: A list (of even length) of integers
@rtype: list (integers) @rtype: list (integers)
...@@ -94,25 +98,29 @@ class GBparser(): ...@@ -94,25 +98,29 @@ class GBparser():
ret = [] ret = []
if not unicode(locationList.location.start).isdigit() or \ if require_exact:
not unicode(locationList.location.end).isdigit() : if not unicode(location.start).isdigit() or \
return None not unicode(location.end).isdigit() :
return None
#if #if
for i in locationList.sub_features : for part in location.parts[::location.strand]:
if i.ref : # This is a workaround for a bug in BioPython. pos = self.__location2pos(part, require_exact=require_exact)
ret = None if not pos:
break return None
#if
temp = self.__location2pos(i.location) ret.append(pos[0])
if temp : ret.append(pos[1])
ret.append(temp[0])
ret.append(temp[1])
#if #if
#for #for
if not ret:
# No subfeatures found, in that case just use the feature itself
# as if it were its only subfeature.
ret = self.__location2pos(location, require_exact=require_exact)
return ret return ret
#__locationList2posList #__location2posList
def _find_mismatch(self, sentences): def _find_mismatch(self, sentences):
""" """
...@@ -227,11 +235,20 @@ class GBparser(): ...@@ -227,11 +235,20 @@ class GBparser():
accession, int(version), match_version=False)[0] accession, int(version), match_version=False)[0]
except ncbi.NoLinkError: except ncbi.NoLinkError:
pass pass
i.positionList = self.__locationList2posList(i) i.original_location = i.location
if i.ref:
# This is a workaround for a bug in BioPython.
# But seriously I have no idea for which bug and couldn't find
# any hints in the commit history. So I just copied it over
# with the last changes to this code, but it can probably be
# removed.
i.positionList = None
else:
i.positionList = self.__location2posList(i.location)
i.location = self.__location2pos(i.location) #FIXME i.location = self.__location2pos(i.location) #FIXME
#if not i.positionList : # FIXME ??? #if not i.positionList : # FIXME ???
# i.positionList = i.location # i.positionList = i.location
if i.positionList or i.location : if i.positionList :
i.usable = True i.usable = True
else : else :
i.usable = False i.usable = False
...@@ -305,6 +322,13 @@ class GBparser(): ...@@ -305,6 +322,13 @@ class GBparser():
mrnaList = mrna.positionList mrnaList = mrna.positionList
if not mrnaList : if not mrnaList :
mrnaList = mrna.location mrnaList = mrna.location
if not mrnaList :
# If the mRNA doesn't have exact positions (e.g., it's annotated
# at `join(<1..11,214..548,851..4143)`), we still want to use the
# part that is in this reference for matching.
mrnaList = self.__location2posList(mrna.original_location,
require_exact=False)
cdsList = cds.positionList cdsList = cds.positionList
if not cdsList : if not cdsList :
cdsList = cds.location cdsList = cds.location
...@@ -493,12 +517,6 @@ class GBparser(): ...@@ -493,12 +517,6 @@ class GBparser():
#if #if
if i.qualifiers.has_key("gene") : if i.qualifiers.has_key("gene") :
if not unicode(i.location.start).isdigit() or \
not unicode(i.location.end).isdigit():
# Feature is not completely in reference. Either start
# or end is not a Bio.SeqFeature.ExactPosition.
continue
geneName = i.qualifiers["gene"][0] geneName = i.qualifiers["gene"][0]
if i.type == "gene" : if i.type == "gene" :
if not geneDict.has_key(geneName) : if not geneDict.has_key(geneName) :
...@@ -509,14 +527,6 @@ class GBparser(): ...@@ -509,14 +527,6 @@ class GBparser():
myGene.location = self.__location2pos(i.location) myGene.location = self.__location2pos(i.location)
geneDict[geneName] = tempGene(geneName) geneDict[geneName] = tempGene(geneName)
#if #if
else:
if geneName not in geneDict:
# We should have seen a gene entry for this gene
# by now. Could be that it was skipped because it
# was not completely in reference (see check
# above). In that case we just ignore any of its
# features.
continue
#if #if
if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
...@@ -537,8 +547,15 @@ class GBparser(): ...@@ -537,8 +547,15 @@ class GBparser():
myGene = geneDict[j] myGene = geneDict[j]
self.link(myGene.rnaList, myGene.cdsList) self.link(myGene.rnaList, myGene.cdsList)
for i in myGene.rnaList : for i in myGene.rnaList :
myRealGene = record.findGene(i.gene)
version = myRealGene.newLocusTag()
# TODO: Here we discard transcripts that are not complete
# in this reference, but it might be nicer to still keep
# them so that we can (for example) show them in the
# legend. Of course they should still not be allowed to be
# selected in the variant description.
# (Same for leftover CDS features below.)
if i.usable : if i.usable :
myRealGene = record.findGene(i.gene)
if i.locus_tag : if i.locus_tag :
# Note: We use the last three characters of the # Note: We use the last three characters of the
# locus_tag as a unique transcript version id. # locus_tag as a unique transcript version id.
...@@ -550,14 +567,13 @@ class GBparser(): ...@@ -550,14 +567,13 @@ class GBparser():
# underscore. Or prepended with a letter. We # underscore. Or prepended with a letter. We
# really want a number, so 'fix' this by only # really want a number, so 'fix' this by only
# looking for a numeric part. # looking for a numeric part.
# (Same for leftover CDS features below.)
try: try:
version = LOCUS_TAG_VERSION.findall( version = LOCUS_TAG_VERSION.findall(
i.locus_tag)[0].zfill(3) i.locus_tag)[0].zfill(3)
except IndexError: except IndexError:
version = '000' pass
myTranscript = Locus(version) myTranscript = Locus(version)
else :
myTranscript = Locus(myRealGene.newLocusTag())
myTranscript.mRNA = PList() myTranscript.mRNA = PList()
myTranscript.mRNA.positionList = i.positionList myTranscript.mRNA.positionList = i.positionList
myTranscript.mRNA.location = i.location myTranscript.mRNA.location = i.location
...@@ -580,38 +596,33 @@ class GBparser(): ...@@ -580,38 +596,33 @@ class GBparser():
myRealGene.transcriptList.append(myTranscript) myRealGene.transcriptList.append(myTranscript)
#if #if
#for #for
# We now look for leftover CDS entries that were not linked to
# any transcript. We add them and the RNA will be constructed
# for them later.
# This does mean that these transcripts always come last (and
# are shown last in for example the legend).
for i in myGene.cdsList : for i in myGene.cdsList :
if not i.linked and \ if not i.linked:
(i.usable or not geneDict[myGene.name].rnaList) :
myRealGene = record.findGene(i.gene) myRealGene = record.findGene(i.gene)
if i.locus_tag : version = myRealGene.newLocusTag()
# Note: We use the last three characters of the if i.usable:
# locus_tag as a unique transcript version id. if i.locus_tag :
# This is also used to for the protein-transcript try:
# link table. version = LOCUS_TAG_VERSION.findall(
# Normally, locus_tag ends with three digits, but i.locus_tag)[0].zfill(3)
# for some (e.g. mobA on NC_011228, a plasmid) it except IndexError:
# ends with two digits prepended with an pass
# underscore. Or prepended with a letter. We
# really want a number, so 'fix' this by only
# looking for a numeric part.
try:
version = LOCUS_TAG_VERSION.findall(
i.locus_tag)[0].zfill(3)
except IndexError:
version = '000'
myTranscript = Locus(version) myTranscript = Locus(version)
else : myTranscript.CDS = PList()
myTranscript = Locus(myRealGene.newLocusTag()) myTranscript.CDS.positionList = i.positionList
myTranscript.CDS = PList() myTranscript.CDS.location = i.location
myTranscript.CDS.positionList = i.positionList myTranscript.proteinID = i.protein_id
myTranscript.CDS.location = i.location myTranscript.proteinProduct = i.product
myTranscript.proteinID = i.protein_id if i.qualifiers.has_key("transl_table") :
myTranscript.proteinProduct = i.product myTranscript.txTable = \
if i.qualifiers.has_key("transl_table") : int(i.qualifiers["transl_table"][0])
myTranscript.txTable = \ myRealGene.transcriptList.append(myTranscript)
int(i.qualifiers["transl_table"][0])
myRealGene.transcriptList.append(myTranscript)
#if #if
#if #if
#for #for
...@@ -655,9 +666,10 @@ class GBparser(): ...@@ -655,9 +666,10 @@ class GBparser():
#if #if
#if #if
#else #else
for i in record.geneList :
if not i.transcriptList : # Discard genes for which we haven't constructed any transcripts.
record.geneList.remove(i) record.geneList = [gene for gene in record.geneList
if gene.transcriptList]
return record return record
#create_record #create_record
......
File added
...@@ -44,6 +44,31 @@ COL1A1: ...@@ -44,6 +44,31 @@ COL1A1:
- XP_005257115 - XP_005257115
- - NM_000088 - - NM_000088
- NP_000079 - NP_000079
PIK3R2:
accession: UD_144959560058
checksum: f696ee19bba83e899ed8c0f2c2f2ebc4
filename: UD_144959560058.gb.bz2
links:
- - XM_005259824
- XP_005259881
- - XM_005259825
- XP_005259882
- - NM_015016
- NP_055831
- - XM_005259822
- XP_005259879
- - XM_005259828
- null
- - XM_005259823
- XP_005259880
- - XM_005259827
- XP_005259884
- - XM_005259826
- XP_005259883
- - NR_073517
- null
- - NM_005027
- NP_005018
DMD: DMD:
accession: UD_139262478721 accession: UD_139262478721
checksum: d41d8cd98f00b204e9800998ecf8427e checksum: d41d8cd98f00b204e9800998ecf8427e
......
...@@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected): ...@@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected):
assert parser._find_mismatch(products) == expected assert parser._find_mismatch(products) == expected
@with_references('AB026906.1')
def test_include_cds_without_mrna(settings, references, parser):
"""
Annotated CDS without mRNA feature should be included since Mutalyzer can
construct the RNA from the CDS.
"""
# Contains one gene with only a CDS annotated, no mRNA.
accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename)
assert record.geneList[0].transcriptList[0].name == '001'
@with_references('A1BG') @with_references('A1BG')
def test_only_complete_genes_included(settings, references, parser): def test_only_complete_mrna_included(settings, references, parser):
""" """
Incomplete genes from the reference file should be ignored. Incomplete transcripts from the reference file should be ignored.
""" """
# contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840 # Contains A1BG (two complete transcripts) and A1BG-AS1, ZNF497,
# (incomplete). # LOC100419840 (no complete transcripts).
accession = references[0].accession accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename) record = parser.create_record(filename)
assert [g.name for g in record.geneList] == ['A1BG'] assert [g.name for g in record.geneList] == ['A1BG']
assert len(record.geneList[0].transcriptList) == 2
@with_references('PIK3R2')
def test_complete_and_incomplete_mrna(settings, references, parser):
"""
Incomplete transcripts from the reference file should be ignored, but the
gene should be included if it contains another complete transcript.
"""
# Contains MAST3 without complete transcripts and PIK3R2 with one complete
# and one incomplete transcript.
accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename)
assert [g.name for g in record.geneList] == ['PIK3R2']
assert len(record.geneList[0].transcriptList) == 1
@with_references('ADAC') @with_references('ADAC')
def test_no_version(settings, references, parser): def test_no_version(settings, references, parser):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment