def _get_attributes(feature):
    """
    Get attributes for a feature (gene, transcript, or protein).

    The result starts from the XML attributes of the node itself and is
    completed with values that may be stored elsewhere in the node:

      - 'symbol': if not an attribute of the node, it is set to the result
        of _get_content(feature, 'symbol').
      - coordinates: if the node has no 'start' attribute, the attributes
        of the first <coordinates> element found below the node are merged
        into the result (overwriting attributes with the same name).
      - 'accession': if the node has a 'transcript_id' attribute, its value
        is also stored under 'accession'.

    @arg feature: a minidom node
    @type feature: object

    @return: A dictionary with pairing of attribute names and values.
    @rtype: dictionary
    """
    attributes = _attr2dict(feature.attributes)

    if 'symbol' not in attributes:
        attributes['symbol'] = _get_content(feature, 'symbol')

    if 'start' not in attributes:
        # NOTE(review): getElementsByTagName matches descendants at any
        # depth, not only direct children. If a feature lacks its own
        # <coordinates> element, the first one of a nested feature would be
        # used instead -- confirm that this cannot happen for valid LRG
        # documents.
        coordinates = feature.getElementsByTagName('coordinates')
        if coordinates:
            attributes.update(_attr2dict(coordinates[0].attributes))

    # Expose the identifier under 'accession' as well, presumably so callers
    # can use a single key for both attribute spellings (verify against the
    # callers).
    if 'transcript_id' in attributes:
        attributes['accession'] = attributes['transcript_id']

    return attributes
#_get_attributes
@@ -300,7 +326,7 @@ def _transcriptPopulator(trName, trData): transcript.transcriptProduct = trData.get("transLongName") if trData.has_key("transAttr"): tA = trData["transAttr"] - transcript.transcriptID = tA.get("transcript_id") + transcript.transcriptID = tA.get("accession") transcript.location = [tA.get("start"), tA.get("end")] if trData.has_key("proteinAttr"): @@ -399,7 +425,8 @@ def getLrgAnnotation(data): for mapp in data.getElementsByTagName("mapping"): mapattr = _attr2dict(mapp.attributes) # only the most recent mapping - if not(mapattr.has_key("most_recent")): continue + if ret['mapping'] and not mapattr.has_key("most_recent"): + continue # check if span exists for span in mapp.getElementsByTagName("mapping_span"): spanattr = _attr2dict(span.attributes) @@ -456,18 +483,18 @@ def getFeaturesAnnotation(data): if not data.getElementsByTagName("features"): return ret feature = data.getElementsByTagName("features")[0] for gene in feature.getElementsByTagName("gene"): - geneAttr = _attr2dict(gene.attributes) + geneAttr = _get_attributes(gene) geneLongName = _get_content(gene, "long_name") transcripts = {"noFixedId": []} for transcript in gene.getElementsByTagName("transcript"): - transAttr = _attr2dict(transcript.attributes) + transAttr = _get_attributes(transcript) transLongName = _get_content(transcript, "long_name") # Check if the transcript has a protein product proteinProduct =\ transcript.getElementsByTagName("protein_product") if proteinProduct: protein = proteinProduct[0] - proteinAttr = _attr2dict(protein.attributes) + proteinAttr = _get_attributes(protein) proteinLongName = _get_content(protein, "long_name") else: proteinAttr = {} diff --git a/tests/test_variantchecker.py b/tests/test_variantchecker.py index fe94045718fe1c54a11ba410a609188352614add..0d154709fabc31d0896100c8d72ffbab0cfdf9d8 100644 --- a/tests/test_variantchecker.py +++ b/tests/test_variantchecker.py @@ -526,6 +526,18 @@ class TestVariantchecker(): 
def test_lrg_reference_new(self):
    """
    A reference in the new LRG format should be usable without errors.

    All LRG sequences are in the new format by now, so this is essentially
    the same check as the previous test. LRG_218 is used because it was
    not yet in our cache, which makes it easier to test the new format.
    """
    variant = 'LRG_218:c.1786_1788delAAT'
    check_variant(variant, self.output)

    # Summary() yields three values; only the error count matters here.
    errors, _, _ = self.output.Summary()
    assert_equal(errors, 0)