From a89fcf463979747843829bbcd581818dc25248a3 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Wed, 26 Mar 2014 14:27:47 +0100 Subject: [PATCH] In LRG, only support one CDS per transcript (ignore all others) The LRG model allows for more than one CDS per transcript (and each CDS can be coupled with more than one transcript). The Mutalyzer gene model does not allow that. Previously, multiple CDSs per transcript were treated incorrectly (I think), so now we just ignore any CDS except the first per transcript. Unfortunately, it is a bit more involved to give good warnings for this, so that is not implemented. --- mutalyzer/parsers/lrg.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mutalyzer/parsers/lrg.py b/mutalyzer/parsers/lrg.py index 9b5d0895..d8f742c8 100644 --- a/mutalyzer/parsers/lrg.py +++ b/mutalyzer/parsers/lrg.py @@ -192,7 +192,16 @@ def create_record(data): # ending position, keep the possibility in mind that multiple CDS # regions are given CDSPList = GenRecord.PList() - for CDS in tData.getElementsByTagName("coding_region"): + for cds_id, CDS in enumerate(tData.getElementsByTagName("coding_region")): + if cds_id > 0: + # Todo: For now, we only support one CDS per transcript and + # ignore all others. + # By the way, I don't think the loop and sorting of CDS + # positions makes any sense here, but I leave it in place + # and just ignore everything except the first iteration. + #translationName = CDS.getElementsByTagName("translation")[0].getAttribute("name").encode("utf8")[1:] + #print 'Ignoring transcript %s translation %s' % (transcriptName, translationName) + continue coordinates = _get_coordinates(CDS, lrg_id) CDSPList.positionList.extend(\ [int(coordinates["start"]), int(coordinates["end"])]) -- GitLab