diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py index 77a4e948d9cec2d87a6187f2ff7ec81a710de04d..ae4e071bd795c26a5ba0d5bb3d986d77df501849 100644 --- a/mutalyzer/entrypoints/admin.py +++ b/mutalyzer/entrypoints/admin.py @@ -113,6 +113,20 @@ def import_mapview(assembly_name_or_alias, mapview_file, encoding, raise UserError(unicode(e)) +def import_lrgmap(assembly_name_or_alias, lrgmap_file, encoding): + """ + Import transcript mappings from an EBI LRG transcripts map file. + """ + lrgmap_file = codecs.getreader(encoding)(lrgmap_file) + + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: + raise UserError('Not a valid assembly: %s' % assembly_name_or_alias) + + mapping.import_from_lrgmap_file(assembly, lrgmap_file) + + def import_gene(assembly_name_or_alias, gene): """ Import transcript mappings for a gene from the UCSC database. @@ -313,6 +327,22 @@ def main(): help='use only entries with this group label (example: ' 'GRCh37.p2-Primary Assembly)') + # Subparser 'assemblies import-lrgmap'. + p = s.add_parser( + 'import-lrgmap', + help='import mappings from EBI LRG transcripts map file', + parents=[assembly_parser], + description=import_lrgmap.__doc__.split('\n\n')[0]) + p.set_defaults(func=import_lrgmap) + p.add_argument( + 'lrgmap_file', metavar='FILE', type=argparse.FileType('rb'), + help='EBI LRG transcript map file (example: ' + 'list_LRGs_transcripts_GRCh37.txt)') + p.add_argument( + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) + # Subparser 'assemblies import-gene'. p = s.add_parser( 'import-gene', help='import mappings by gene from UCSC database', diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 31e67e4d210c543b76831ec67344a70c2db49184..d71f3aeaec8e31e989413872fb3f2e83f335bd62 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -1053,3 +1053,74 @@ def import_from_mapview_file(assembly, mapview_file, group_label): session.add(mapping) session.commit() + + +def import_from_lrgmap_file(assembly, lrgmap_file): + """ + Import transcript mappings from an EBI LRG transcripts map file. + + All positions are one-based, inclusive, and that is what we also use in + our database. + """ + columns = ['transcript', 'gene', 'chromosome', 'strand', 'start', 'stop', + 'exons', 'protein', 'cds_start', 'cds_stop'] + + chromosomes = assembly.chromosomes.all() + + def read_mappings(lrgmap_file): + for line in lrgmap_file: + if line.startswith('#'): + continue + record = dict(zip(columns, line.rstrip('\r\n').split('\t'))) + + record['start'] = int(record['start']) + record['stop'] = int(record['stop']) + try: + record['cds_start'] = int(record['cds_start']) + except ValueError: + record['cds_start'] = None + try: + record['cds_stop'] = int(record['cds_stop']) + except ValueError: + record['cds_stop'] = None + record['exons'] = [[int(pos) for pos in exon.split('-')] + for exon in record['exons'].split(',')] + + try: + yield build_mapping(record) + except ValueError: + pass + + def build_mapping(record): + # Only use records on chromosomes we know. + try: + chromosome = next(c for c in chromosomes if + c.name == 'chr' + record['chromosome']) + except StopIteration: + raise ValueError() + + accession, transcript = record['transcript'].split('t') + transcript = int(transcript) + + orientation = 'reverse' if record['strand'] == '-1' else 'forward' + + if record['cds_start']: + cds = record['cds_start'], record['cds_stop'] + else: + cds = None + + # TODO: Also take protein into account. For example, in LRG_32 (TP53) + # some transcripts occur twice (with different CDSs and different + # protein numbers). + # https://humgenprojects.lumc.nl/trac/mutalyzer/ticket/150 + return TranscriptMapping.create_or_update( + chromosome, 'lrg', accession, record['gene'], orientation, + record['start'], record['stop'], + [start for start, _ in record['exons']], + [stop for _, stop in record['exons']], + 'ebi', transcript=transcript, cds=cds, select_transcript=True) + + for mapping in read_mappings(lrgmap_file): + session.add(mapping) + + session.commit() diff --git a/tests/data/hg19.lrgmap.subset.txt b/tests/data/hg19.lrgmap.subset.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f83d3dad55853dbf1e6c915e9e70f6665869f82 --- /dev/null +++ b/tests/data/hg19.lrgmap.subset.txt @@ -0,0 +1,30 @@ +# Last modified: 10-02-2016@22:00:03 +# LRG_TRANSCRIPT HGNC_SYMBOL CHROMOSOME STRAND TRANSCRIPT_START TRANSCRIPT_STOP EXONS_COORDS LRG_PROTEIN CDS_START CDS_STOP +LRG_1t1 COL1A1 17 -1 48261457 48279000 48261457-48263009,48263139-48263381,48263678-48263868,48264001-48264283,48264376-48264483,48264845-48264898,48265237-48265344,48265457-48265510,48265891-48265998,48266103-48266156,48266264-48266371,48266529-48266636,48266738-48266899,48267040-48267093,48267220-48267273,48267362-48267469,48267688-48267741,48267904-48267957,48268178-48268285,48268744-48268851,48269149-48269247,48269341-48269385,48269836-48269889,48270001-48270054,48270158-48270211,48270355-48270408,48271304-48271402,48271491-48271544,48271710-48271808,48271934-48271987,48272082-48272189,48272408-48272461,48272593-48272691,48272795-48272839,48272928-48273026,48273284-48273337,48273516-48273560,48273675-48273728,48273845-48273889,48273978-48274031,48274371-48274424,48274541-48274594,48275093-48275146,48275310-48275363,48275522-48275566,48275794-48275865,48276587-48276688,48276779-48276814,48276917-48276951,48277114-48277308,48278772-48279000 LRG_1p1 48262863 48278874 +LRG_2t1 COL1A2 7 1 94023873 94060544 94023873-94024413,94027060-94027070,94027694-94027708,94028361-94028396,94029508-94029600,94030879-94030932,94033868-94033912,94034005-94034058,94034151-94034204,94034511-94034564,94034985-94035038,94035562-94035615,94037159-94037203,94037495-94037548,94037648-94037692,94038082-94038135,94038634-94038732,94038876-94038920,94039035-94039133,94039554-94039607,94039732-94039839,94040201-94040254,94040368-94040466,94041380-94041433,94041896-94041994,94042395-94042448,94043002-94043055,94043206-94043259,94043534-94043587,94044538-94044582,94045717-94045815,94047036-94047143,94047811-94047864,94048810-94048863,94049545-94049598,94049703-94049756,94049853-94049960,94050321-94050374,94051211-94051264,94052269-94052430,94053648-94053755,94054429-94054536,94054922-94054975,94055062-94055169,94055310-94055363,94055735-94055842,94056320-94056373,94056500-94056607,94056939-94057197,94057605-94057789,94058500-94058742,94059559-94060544 LRG_2p1 94024344 94059705 +LRG_3t1 COL3A1 2 1 189839099 189877472 189839099-189839294,189849486-189849688,189849923-189849973,189850391-189850504,189851785-189851865,189852807-189852860,189853316-189853369,189854122-189854175,189854822-189854875,189855033-189855086,189855730-189855783,189856213-189856257,189856395-189856448,189856910-189856954,189857613-189857666,189858087-189858185,189858764-189858808,189858960-189859058,189859267-189859320,189859450-189859557,189859772-189859825,189860418-189860516,189860851-189860904,189861124-189861222,189861891-189861944,189862062-189862115,189862426-189862479,189862992-189863045,189863400-189863444,189864011-189864109,189864196-189864303,189864568-189864621,189866123-189866176,189866262-189866315,189867024-189867077,189867681-189867788,189868137-189868190,189868460-189868513,189868708-189868869,189868983-189869090,189870076-189870183,189870932-189870985,189871071-189871178,189871663-189871716,189872226-189872333,189872611-189872664,189872761-189872868,189873650-189873947,189874904-189875091,189875374-189875616,189876354-189877472 LRG_3p1 189839216 189876500 +LRG_4t1 CRTAP 3 1 33155450 33189265 33155450-33156040,33161836-33161985,33165900-33166071,33171431-33171559,33174047-33174192,33175674-33175757,33183887-33189265 LRG_4p1 33155570 33183940 +LRG_5t1 P3H1 1 -1 43212006 43232755 43212006-43212523,43212943-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p1 43212368 43232642 +LRG_5t2 P3H1 1 -1 43212006 43232755 43212006-43212523,43212924-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p2 43212504 43232642 +LRG_5t3 P3H1 1 -1 43212006 43232755 43212006-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p3 43212583 43232642 +LRG_6t1 ATP1A2 1 1 160085548 160113381 160085548-160085663,160090696-160090800,160090982-160091041,160093003-160093206,160093733-160093846,160094086-160094220,160094926-160095043,160097342-160097610,160098442-160098640,160098770-160098879,160099056-160099190,160099892-160100081,160100212-160100387,160104274-160104410,160104935-160105085,160105224-160105392,160105629-160105783,160106037-160106160,160106360-160106505,160106691-160106821,160109430-160109531,160109683-160109774,160111084-160113381 LRG_6p1 160085652 160111112 +LRG_160t1 PIGA X -1 15337573 15353676 15337573-15339894,15342787-15342993,15343142-15343274,15344036-15344168,15349338-15350114,15353623-15353676 LRG_160p1 15339628 15350052 +LRG_161t1 PMS2 7 -1 6012870 6048737 6012870-6013173,6017219-6017388,6018227-6018327,6022455-6022622,6026390-6027251,6029431-6029586,6031604-6031688,6035165-6035264,6036957-6037054,6038739-6038906,6042084-6042267,6043321-6043423,6043603-6043689,6045523-6045662,6048628-6048737 LRG_161p1 6013030 6048650 +LRG_162t1 PRKDC 8 -1 48685669 48872743 48685669-48686938,48689405-48689544,48690247-48690435,48691020-48691221,48691289-48691360,48691565-48691654,48694723-48694815,48694939-48695159,48696303-48696370,48697674-48697878,48701467-48701610,48701712-48701799,48706851-48707062,48710798-48710958,48711771-48711951,48713354-48713547,48715867-48716041,48719698-48719887,48730011-48730122,48731963-48732071,48733280-48733504,48734165-48734353,48736419-48736557,48739217-48739422,48740729-48740908,48743166-48743297,48744375-48744487,48746757-48746957,48748899-48749088,48749773-48749980,48751709-48751807,48752577-48752750,48761715-48761864,48761940-48762064,48765234-48765345,48766644-48766775,48767783-48767934,48769717-48769860,48771077-48771196,48771410-48771547,48772172-48772320,48773460-48773532,48774623-48774688,48774934-48775102,48775960-48776138,48777117-48777324,48790285-48790412,48792052-48792219,48793977-48794081,48794473-48794658,48798505-48798708,48800108-48800266,48801079-48801211,48801575-48801783,48802818-48803041,48805700-48805947,48809721-48809854,48811030-48811129,48812933-48813027,48815129-48815355,48817429-48817536,48824970-48825122,48826461-48826624,48827888-48827978,48830837-48830943,48839754-48839913,48840331-48840450,48841652-48841738,48842413-48842572,48843232-48843347,48845580-48845732,48846525-48846650,48847569-48847618,48848292-48848460,48848913-48849077,48852111-48852257,48855769-48855926,48856413-48856443,48856534-48856589,48866180-48866279,48866367-48866479,48866898-48867006,48868434-48868508,48869731-48869823,48869915-48869991,48872533-48872743 LRG_162p1 48686734 48872686 +LRG_163t1 RMRP 9 -1 35657748 35658015 35657748-35658015 +LRG_164t1 STIM1 11 1 3876933 4114440 3876933-3877639,3988782-3988912,4045103-4045217,4076756-4076867,4080511-4080626,4091256-4091433,4095732-4095909,4103414-4103581,4104112-4104212,4104493-4104728,4107707-4107773,4112512-4114440 LRG_164p1 3877501 4113028 +LRG_165t1 STXBP2 19 1 7701991 7712759 7701991-7702072,7703612-7703661,7703905-7703986,7704617-7704693,7705617-7705695,7705786-7705889,7706591-7706739,7706920-7707004,7707089-7707219,7707315-7707422,7707652-7707709,7707869-7707934,7708051-7708131,7709500-7709638,7710083-7710192,7711135-7711230,7712048-7712133,7712240-7712397,7712611-7712759 LRG_165p1 7702036 7712696 +LRG_166t1 TAP1 6 -1 32812986 32821748 32812986-32813562,32814845-32814981,32815290-32815452,32815696-32815869,32816429-32816617,32816767-32816895,32818097-32818294,32818721-32818926,32819886-32820016,32820165-32820279,32820816-32821748 LRG_166p1 32813356 32821593 +LRG_167t1 TAP2 6 -1 32793187 32806547 32793187-32796811,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p1 32796632 32806010 +LRG_167t2 TAP2 6 -1 32789610 32806547 32789610-32790095,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p2 32790066 32806010 +LRG_168t1 THBD 20 -1 23026270 23030301 23026270-23030301 LRG_168p1 23028414 23030141 +LRG_341t2 UNC119 17 -1 26873725 26879646 26873725-26874867,26875017-26875119,26875610-26875723,26879356-26879646 LRG_341p2 26874642 26879575 +LRG_343t1 TERT 5 -1 1253282 1295162 1253282-1253946,1254483-1254620,1255402-1255526,1258713-1258774,1260589-1260715,1264519-1264707,1266579-1266650,1268635-1268748,1271234-1271319,1272300-1272395,1278756-1278911,1279406-1279585,1280273-1280453,1282544-1282739,1293428-1294781,1294886-1295162 LRG_343p1 1253843 1295104 +LRG_344t1 KRAS 12 -1 25357723 25403865 25357723-25362845,25368371-25368494,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p1 25368375 25398318 +LRG_344t2 KRAS 12 -1 25357723 25403865 25357723-25362845,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p2 25362729 25398318 +LRG_345t1 NOP10 15 -1 34633917 34635362 34633917-34634309,34635221-34635362 LRG_345p1 34634169 34635274 +LRG_346t1 NHP2 5 -1 177576464 177580961 177576464-177576839,177577889-177577994,177580492-177580561,177580659-177580961 LRG_346p1 177576714 177580818 +LRG_347t1 TERC 3 -1 169482398 169482848 169482398-169482848 +LRG_348t1 CR2 1 1 207627645 207663240 207627645-207627821,207639871-207640257,207641869-207642060,207642145-207642247,207642495-207642577,207643040-207643447,207644085-207644261,207644342-207644432,207644768-207644844,207646117-207646524,207646890-207647066,207647146-207647230,207647586-207647668,207648169-207648561,207649579-207649764,207651230-207651415,207652602-207652625,207653323-207653398,207658809-207658917,207662487-207663240 LRG_348p1 207627764 207658899 +LRG_349t1 MASP1 3 -1 186964142 187009810 186964142-186965173,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p1 186965121 187009420 +LRG_349t2 MASP1 3 -1 186933873 187009810 186933873-186938049,186938823-186938922,186940915-186940982,186943112-186943297,186944195-186944308,186947548-186947685,186959269-186959343,186961272-186961409,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p2 186937859 187009420 diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 8ce9d64418db2a5566fbcacc87f9c32d792d989c..aaebbc8d82c145c69886042f5b28415e7b1b2dfa 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -448,3 +448,73 @@ def test_import_mapview(hg19): assert new.orientation == 'forward' assert new.reference_type == 'refseq' assert new.source == 'ncbi' + + +def test_import_lrgmap(hg19): + original_count = TranscriptMapping.query.count() + + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', + 'hg19.lrgmap.subset.txt') + lrgmap = codecs.open(path, encoding='utf-8') + lrgmap_count = sum(1 for line in lrgmap if not line.startswith('#')) + lrgmap.seek(0) + + mapping.import_from_lrgmap_file(hg19, lrgmap) + + # Two transcripts were already in, the rest is new: + # - LRG_1 + # - LRG_348 + assert TranscriptMapping.query.count() == original_count + lrgmap_count - 2 + + # No changes here. + unchanged = TranscriptMapping.query.filter_by(accession='LRG_1').one() + assert unchanged.start == 48261457 + assert unchanged.stop == 48279000 + assert unchanged.exon_starts == [ + 48261457, 48263139, 48263678, 48264001, 48264376, 48264845, 48265237, + 48265457, 48265891, 48266103, 48266264, 48266529, 48266738, 48267040, + 48267220, 48267362, 48267688, 48267904, 48268178, 48268744, 48269149, + 48269341, 48269836, 48270001, 48270158, 48270355, 48271304, 48271491, + 48271710, 48271934, 48272082, 48272408, 48272593, 48272795, 48272928, + 48273284, 48273516, 48273675, 48273845, 48273978, 48274371, 48274541, + 48275093, 48275310, 48275522, 48275794, 48276587, 48276779, 48276917, + 48277114, 48278772] + assert unchanged.exon_stops == [ + 48263009, 48263381, 48263868, 48264283, 48264483, 48264898, 48265344, + 48265510, 48265998, 48266156, 48266371, 48266636, 48266899, 48267093, + 48267273, 48267469, 48267741, 48267957, 48268285, 48268851, 48269247, + 48269385, 48269889, 48270054, 48270211, 48270408, 48271402, 48271544, + 48271808, 48271987, 48272189, 48272461, 48272691, 48272839, 48273026, + 48273337, 48273560, 48273728, 48273889, 48274031, 48274424, 48274594, + 48275146, 48275363, 48275566, 48275865, 48276688, 48276814, 48276951, + 48277308, 48279000] + assert unchanged.cds == (48262863, 48278874) + + # We made some artificial changes to the lrgmap file here. + updated = TranscriptMapping.query.filter_by(accession='LRG_348').one() + assert updated.start == 207627645 + assert updated.stop == 207663240 + assert updated.exon_starts == [ + 207627645, 207639871, 207641869, 207642145, 207642495, 207643040, + 207644085, 207644342, 207644768, 207646117, 207646890, 207647146, + 207647586, 207648169, 207649579, 207651230, 207652602, 207653323, + 207658809, 207662487] + assert updated.exon_stops == [ + 207627821, 207640257, 207642060, 207642247, 207642577, 207643447, + 207644261, 207644432, 207644844, 207646524, 207647066, 207647230, + 207647668, 207648561, 207649764, 207651415, 207652625, 207653398, + 207658917, 207663240] + assert updated.cds == (207627764, 207658899) + + # This is a new entry. + new = TranscriptMapping.query.filter_by(accession='LRG_163').one() + assert new.version is None + assert new.start == 35657748 + assert new.stop == 35658015 + assert new.exon_starts == [35657748] + assert new.exon_stops == [35658015] + assert new.cds is None + assert new.gene == 'RMRP' + assert new.orientation == 'reverse' + assert new.reference_type == 'lrg' + assert new.source == 'ebi'