Skip to content
Snippets Groups Projects
Commit bd705d16 authored by Vermaat's avatar Vermaat
Browse files

Merge pull request #138 from mutalyzer/genbank-incomplete-genes

Add gene feature to genbank file without version
parents 17a825d3 8fac2dc7
No related branches found
No related tags found
No related merge requests found
...@@ -57,12 +57,14 @@ class GBparser(): ...@@ -57,12 +57,14 @@ class GBparser():
""" """
@todo: documentation @todo: documentation
""" """
def __location2pos(self, location): def __location2pos(self, location, require_exact=True):
""" """
Convert a location object to a tuple of integers. Convert a location object to a tuple of integers.
@arg location: A location object (see the BioPython documentation) @arg location: A location object (see the BioPython documentation)
@type location: location object @type location: location object
@arg require_exact: Require exact positions.
@type require_exact: bool
@return: A tuple of integers @return: A tuple of integers
@rtype: list @rtype: list
...@@ -70,10 +72,10 @@ class GBparser(): ...@@ -70,10 +72,10 @@ class GBparser():
ret = [] ret = []
if not unicode(location.start).isdigit() or \ if require_exact:
not unicode(location.end).isdigit() : if not unicode(location.start).isdigit() or \
return None not unicode(location.end).isdigit() :
#if return None
ret.append(location.start.position + 1) ret.append(location.start.position + 1)
ret.append(location.end.position) ret.append(location.end.position)
...@@ -81,12 +83,14 @@ class GBparser(): ...@@ -81,12 +83,14 @@ class GBparser():
return ret return ret
#__location2pos #__location2pos
def __locationList2posList(self, locationList): def __location2posList(self, location, require_exact=True):
""" """
Convert a list of locations to a list of integers. Convert a location object to a list of integers.
@arg locationList: A list of locations (see the BioPython documentation) @arg location: A location object (see the BioPython documentation)
@type locationList: list (location objects) @type location: location object
@arg require_exact: Require exact positions.
@type require_exact: bool
@return: A list (of even length) of integers @return: A list (of even length) of integers
@rtype: list (integers) @rtype: list (integers)
...@@ -94,25 +98,29 @@ class GBparser(): ...@@ -94,25 +98,29 @@ class GBparser():
ret = [] ret = []
if not unicode(locationList.location.start).isdigit() or \ if require_exact:
not unicode(locationList.location.end).isdigit() : if not unicode(location.start).isdigit() or \
return None not unicode(location.end).isdigit() :
return None
#if #if
for i in locationList.sub_features : for part in location.parts[::location.strand]:
if i.ref : # This is a workaround for a bug in BioPython. pos = self.__location2pos(part, require_exact=require_exact)
ret = None if not pos:
break return None
#if
temp = self.__location2pos(i.location) ret.append(pos[0])
if temp : ret.append(pos[1])
ret.append(temp[0])
ret.append(temp[1])
#if #if
#for #for
if not ret:
# No subfeatures found, in that case just use the feature itself
# as if it were its only subfeature.
ret = self.__location2pos(location, require_exact=require_exact)
return ret return ret
#__locationList2posList #__location2posList
def _find_mismatch(self, sentences): def _find_mismatch(self, sentences):
""" """
...@@ -227,11 +235,20 @@ class GBparser(): ...@@ -227,11 +235,20 @@ class GBparser():
accession, int(version), match_version=False)[0] accession, int(version), match_version=False)[0]
except ncbi.NoLinkError: except ncbi.NoLinkError:
pass pass
i.positionList = self.__locationList2posList(i) i.original_location = i.location
if i.ref:
# This is a workaround for a bug in BioPython.
# But seriously I have no idea for which bug and couldn't find
# any hints in the commit history. So I just copied it over
# with the last changes to this code, but it can probably be
# removed.
i.positionList = None
else:
i.positionList = self.__location2posList(i.location)
i.location = self.__location2pos(i.location) #FIXME i.location = self.__location2pos(i.location) #FIXME
#if not i.positionList : # FIXME ??? #if not i.positionList : # FIXME ???
# i.positionList = i.location # i.positionList = i.location
if i.positionList or i.location : if i.positionList :
i.usable = True i.usable = True
else : else :
i.usable = False i.usable = False
...@@ -305,6 +322,13 @@ class GBparser(): ...@@ -305,6 +322,13 @@ class GBparser():
mrnaList = mrna.positionList mrnaList = mrna.positionList
if not mrnaList : if not mrnaList :
mrnaList = mrna.location mrnaList = mrna.location
if not mrnaList :
# If the mRNA doesn't have exact positions (e.g., it's annotated
# at `join(<1..11,214..548,851..4143)`), we still want to use the
# part that is in this reference for matching.
mrnaList = self.__location2posList(mrna.original_location,
require_exact=False)
cdsList = cds.positionList cdsList = cds.positionList
if not cdsList : if not cdsList :
cdsList = cds.location cdsList = cds.location
...@@ -493,12 +517,6 @@ class GBparser(): ...@@ -493,12 +517,6 @@ class GBparser():
#if #if
if i.qualifiers.has_key("gene") : if i.qualifiers.has_key("gene") :
if not unicode(i.location.start).isdigit() or \
not unicode(i.location.end).isdigit():
# Feature is not completely in reference. Either start
# or end is not a Bio.SeqFeature.ExactPosition.
continue
geneName = i.qualifiers["gene"][0] geneName = i.qualifiers["gene"][0]
if i.type == "gene" : if i.type == "gene" :
if not geneDict.has_key(geneName) : if not geneDict.has_key(geneName) :
...@@ -509,14 +527,6 @@ class GBparser(): ...@@ -509,14 +527,6 @@ class GBparser():
myGene.location = self.__location2pos(i.location) myGene.location = self.__location2pos(i.location)
geneDict[geneName] = tempGene(geneName) geneDict[geneName] = tempGene(geneName)
#if #if
else:
if geneName not in geneDict:
# We should have seen a gene entry for this gene
# by now. Could be that it was skipped because it
# was not completely in reference (see check
# above). In that case we just ignore any of its
# features.
continue
#if #if
if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
...@@ -537,8 +547,15 @@ class GBparser(): ...@@ -537,8 +547,15 @@ class GBparser():
myGene = geneDict[j] myGene = geneDict[j]
self.link(myGene.rnaList, myGene.cdsList) self.link(myGene.rnaList, myGene.cdsList)
for i in myGene.rnaList : for i in myGene.rnaList :
myRealGene = record.findGene(i.gene)
version = myRealGene.newLocusTag()
# TODO: Here we discard transcripts that are not complete
# in this reference, but it might be nicer to still keep
# them so that we can (for example) show them in the
# legend. Of course they should still not be allowed to be
# selected in the variant description.
# (Same for leftover CDS features below.)
if i.usable : if i.usable :
myRealGene = record.findGene(i.gene)
if i.locus_tag : if i.locus_tag :
# Note: We use the last three characters of the # Note: We use the last three characters of the
# locus_tag as a unique transcript version id. # locus_tag as a unique transcript version id.
...@@ -550,14 +567,13 @@ class GBparser(): ...@@ -550,14 +567,13 @@ class GBparser():
# underscore. Or prepended with a letter. We # underscore. Or prepended with a letter. We
# really want a number, so 'fix' this by only # really want a number, so 'fix' this by only
# looking for a numeric part. # looking for a numeric part.
# (Same for leftover CDS features below.)
try: try:
version = LOCUS_TAG_VERSION.findall( version = LOCUS_TAG_VERSION.findall(
i.locus_tag)[0].zfill(3) i.locus_tag)[0].zfill(3)
except IndexError: except IndexError:
version = '000' pass
myTranscript = Locus(version) myTranscript = Locus(version)
else :
myTranscript = Locus(myRealGene.newLocusTag())
myTranscript.mRNA = PList() myTranscript.mRNA = PList()
myTranscript.mRNA.positionList = i.positionList myTranscript.mRNA.positionList = i.positionList
myTranscript.mRNA.location = i.location myTranscript.mRNA.location = i.location
...@@ -580,38 +596,33 @@ class GBparser(): ...@@ -580,38 +596,33 @@ class GBparser():
myRealGene.transcriptList.append(myTranscript) myRealGene.transcriptList.append(myTranscript)
#if #if
#for #for
# We now look for leftover CDS entries that were not linked to
# any transcript. We add them and the RNA will be constructed
# for them later.
# This does mean that these transcripts always come last (and
# are shown last in for example the legend).
for i in myGene.cdsList : for i in myGene.cdsList :
if not i.linked and \ if not i.linked:
(i.usable or not geneDict[myGene.name].rnaList) :
myRealGene = record.findGene(i.gene) myRealGene = record.findGene(i.gene)
if i.locus_tag : version = myRealGene.newLocusTag()
# Note: We use the last three characters of the if i.usable:
# locus_tag as a unique transcript version id. if i.locus_tag :
# This is also used to for the protein-transcript try:
# link table. version = LOCUS_TAG_VERSION.findall(
# Normally, locus_tag ends with three digits, but i.locus_tag)[0].zfill(3)
# for some (e.g. mobA on NC_011228, a plasmid) it except IndexError:
# ends with two digits prepended with an pass
# underscore. Or prepended with a letter. We
# really want a number, so 'fix' this by only
# looking for a numeric part.
try:
version = LOCUS_TAG_VERSION.findall(
i.locus_tag)[0].zfill(3)
except IndexError:
version = '000'
myTranscript = Locus(version) myTranscript = Locus(version)
else : myTranscript.CDS = PList()
myTranscript = Locus(myRealGene.newLocusTag()) myTranscript.CDS.positionList = i.positionList
myTranscript.CDS = PList() myTranscript.CDS.location = i.location
myTranscript.CDS.positionList = i.positionList myTranscript.proteinID = i.protein_id
myTranscript.CDS.location = i.location myTranscript.proteinProduct = i.product
myTranscript.proteinID = i.protein_id if i.qualifiers.has_key("transl_table") :
myTranscript.proteinProduct = i.product myTranscript.txTable = \
if i.qualifiers.has_key("transl_table") : int(i.qualifiers["transl_table"][0])
myTranscript.txTable = \ myRealGene.transcriptList.append(myTranscript)
int(i.qualifiers["transl_table"][0])
myRealGene.transcriptList.append(myTranscript)
#if #if
#if #if
#for #for
...@@ -655,9 +666,10 @@ class GBparser(): ...@@ -655,9 +666,10 @@ class GBparser():
#if #if
#if #if
#else #else
for i in record.geneList :
if not i.transcriptList : # Discard genes for which we haven't constructed any transcripts.
record.geneList.remove(i) record.geneList = [gene for gene in record.geneList
if gene.transcriptList]
return record return record
#create_record #create_record
......
No preview for this file type
File added
...@@ -44,6 +44,31 @@ COL1A1: ...@@ -44,6 +44,31 @@ COL1A1:
- XP_005257115 - XP_005257115
- - NM_000088 - - NM_000088
- NP_000079 - NP_000079
PIK3R2:
accession: UD_144959560058
checksum: f696ee19bba83e899ed8c0f2c2f2ebc4
filename: UD_144959560058.gb.bz2
links:
- - XM_005259824
- XP_005259881
- - XM_005259825
- XP_005259882
- - NM_015016
- NP_055831
- - XM_005259822
- XP_005259879
- - XM_005259828
- null
- - XM_005259823
- XP_005259880
- - XM_005259827
- XP_005259884
- - XM_005259826
- XP_005259883
- - NR_073517
- null
- - NM_005027
- NP_005018
DMD: DMD:
accession: UD_139262478721 accession: UD_139262478721
checksum: d41d8cd98f00b204e9800998ecf8427e checksum: d41d8cd98f00b204e9800998ecf8427e
......
...@@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected): ...@@ -37,17 +37,47 @@ def test_product_lists_mismatch(parser, products, expected):
assert parser._find_mismatch(products) == expected assert parser._find_mismatch(products) == expected
@with_references('AB026906.1')
def test_include_cds_without_mrna(settings, references, parser):
"""
Annotated CDS without mRNA feature should be included since Mutalyzer can
construct the RNA from the CDS.
"""
# Contains one gene with only a CDS annotated, no mRNA.
accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename)
assert record.geneList[0].transcriptList[0].name == '001'
@with_references('A1BG') @with_references('A1BG')
def test_only_complete_genes_included(settings, references, parser): def test_only_complete_mrna_included(settings, references, parser):
""" """
Incomplete genes from the reference file should be ignored. Incomplete transcripts from the reference file should be ignored.
""" """
# contains A1BG (complete) and A1BG-AS1, ZNF497, LOC100419840 # Contains A1BG (two complete transcripts) and A1BG-AS1, ZNF497,
# (incomplete). # LOC100419840 (no complete transcripts).
accession = references[0].accession accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename) record = parser.create_record(filename)
assert [g.name for g in record.geneList] == ['A1BG'] assert [g.name for g in record.geneList] == ['A1BG']
assert len(record.geneList[0].transcriptList) == 2
@with_references('PIK3R2')
def test_complete_and_incomplete_mrna(settings, references, parser):
"""
Incomplete transcripts from the reference file should be ignored, but the
gene should be included if it contains another complete transcript.
"""
# Contains MAST3 without complete transcripts and PIK3R2 with one complete
# and one incomplete transcript.
accession = references[0].accession
filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
record = parser.create_record(filename)
assert [g.name for g in record.geneList] == ['PIK3R2']
assert len(record.geneList[0].transcriptList) == 1
@with_references('ADAC') @with_references('ADAC')
def test_no_version(settings, references, parser): def test_no_version(settings, references, parser):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment