Skip to content
Snippets Groups Projects
Commit 736f0bc1 authored by Vermaat's avatar Vermaat
Browse files

Import LRG transcript mappings from EBI map files

parent d2987465
No related branches found
No related tags found
No related merge requests found
......@@ -113,6 +113,20 @@ def import_mapview(assembly_name_or_alias, mapview_file, encoding,
raise UserError(unicode(e))
def import_lrgmap(assembly_name_or_alias, lrgmap_file, encoding):
"""
Import transcript mappings from an EBI LRG transcripts map file.
"""
lrgmap_file = codecs.getreader(encoding)(lrgmap_file)
try:
assembly = Assembly.by_name_or_alias(assembly_name_or_alias)
except NoResultFound:
raise UserError('Not a valid assembly: %s' % assembly_name_or_alias)
mapping.import_from_lrgmap_file(assembly, lrgmap_file)
def import_gene(assembly_name_or_alias, gene):
"""
Import transcript mappings for a gene from the UCSC database.
......@@ -313,6 +327,22 @@ def main():
help='use only entries with this group label (example: '
'GRCh37.p2-Primary Assembly)')
# Subparser 'assemblies import-lrgmap'.
p = s.add_parser(
'import-lrgmap',
help='import mappings from EBI LRG transcripts map file',
parents=[assembly_parser],
description=import_lrgmap.__doc__.split('\n\n')[0])
p.set_defaults(func=import_lrgmap)
p.add_argument(
'lrgmap_file', metavar='FILE', type=argparse.FileType('rb'),
help='EBI LRG transcript map file (example: '
'list_LRGs_transcripts_GRCh37.txt)')
p.add_argument(
'--encoding', metavar='ENCODING', type=_cli_string,
default=default_encoding,
help='input file encoding (default: %s)' % default_encoding)
# Subparser 'assemblies import-gene'.
p = s.add_parser(
'import-gene', help='import mappings by gene from UCSC database',
......
......@@ -1053,3 +1053,74 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
session.add(mapping)
session.commit()
def import_from_lrgmap_file(assembly, lrgmap_file):
"""
Import transcript mappings from an EBI LRG transcripts map file.
All positions are one-based, inclusive, and that is what we also use in
our database.
"""
columns = ['transcript', 'gene', 'chromosome', 'strand', 'start', 'stop',
'exons', 'protein', 'cds_start', 'cds_stop']
chromosomes = assembly.chromosomes.all()
def read_mappings(lrgmap_file):
for line in lrgmap_file:
if line.startswith('#'):
continue
record = dict(zip(columns, line.rstrip('\r\n').split('\t')))
record['start'] = int(record['start'])
record['stop'] = int(record['stop'])
try:
record['cds_start'] = int(record['cds_start'])
except ValueError:
record['cds_start'] = None
try:
record['cds_stop'] = int(record['cds_stop'])
except ValueError:
record['cds_stop'] = None
record['exons'] = [[int(pos) for pos in exon.split('-')]
for exon in record['exons'].split(',')]
try:
yield build_mapping(record)
except ValueError:
pass
def build_mapping(record):
# Only use records on chromosomes we know.
try:
chromosome = next(c for c in chromosomes if
c.name == 'chr' + record['chromosome'])
except StopIteration:
raise ValueError()
accession, transcript = record['transcript'].split('t')
transcript = int(transcript)
orientation = 'reverse' if record['strand'] == '-1' else 'forward'
if record['cds_start']:
cds = record['cds_start'], record['cds_stop']
else:
cds = None
# TODO: Also take protein into account. For example, in LRG_32 (TP53)
# some transcripts occur twice (with different CDSs and different
# protein numbers).
# https://humgenprojects.lumc.nl/trac/mutalyzer/ticket/150
return TranscriptMapping.create_or_update(
chromosome, 'lrg', accession, record['gene'], orientation,
record['start'], record['stop'],
[start for start, _ in record['exons']],
[stop for _, stop in record['exons']],
'ebi', transcript=transcript, cds=cds, select_transcript=True)
for mapping in read_mappings(lrgmap_file):
session.add(mapping)
session.commit()
# Last modified: 10-02-2016@22:00:03
# LRG_TRANSCRIPT HGNC_SYMBOL CHROMOSOME STRAND TRANSCRIPT_START TRANSCRIPT_STOP EXONS_COORDS LRG_PROTEIN CDS_START CDS_STOP
LRG_1t1 COL1A1 17 -1 48261457 48279000 48261457-48263009,48263139-48263381,48263678-48263868,48264001-48264283,48264376-48264483,48264845-48264898,48265237-48265344,48265457-48265510,48265891-48265998,48266103-48266156,48266264-48266371,48266529-48266636,48266738-48266899,48267040-48267093,48267220-48267273,48267362-48267469,48267688-48267741,48267904-48267957,48268178-48268285,48268744-48268851,48269149-48269247,48269341-48269385,48269836-48269889,48270001-48270054,48270158-48270211,48270355-48270408,48271304-48271402,48271491-48271544,48271710-48271808,48271934-48271987,48272082-48272189,48272408-48272461,48272593-48272691,48272795-48272839,48272928-48273026,48273284-48273337,48273516-48273560,48273675-48273728,48273845-48273889,48273978-48274031,48274371-48274424,48274541-48274594,48275093-48275146,48275310-48275363,48275522-48275566,48275794-48275865,48276587-48276688,48276779-48276814,48276917-48276951,48277114-48277308,48278772-48279000 LRG_1p1 48262863 48278874
LRG_2t1 COL1A2 7 1 94023873 94060544 94023873-94024413,94027060-94027070,94027694-94027708,94028361-94028396,94029508-94029600,94030879-94030932,94033868-94033912,94034005-94034058,94034151-94034204,94034511-94034564,94034985-94035038,94035562-94035615,94037159-94037203,94037495-94037548,94037648-94037692,94038082-94038135,94038634-94038732,94038876-94038920,94039035-94039133,94039554-94039607,94039732-94039839,94040201-94040254,94040368-94040466,94041380-94041433,94041896-94041994,94042395-94042448,94043002-94043055,94043206-94043259,94043534-94043587,94044538-94044582,94045717-94045815,94047036-94047143,94047811-94047864,94048810-94048863,94049545-94049598,94049703-94049756,94049853-94049960,94050321-94050374,94051211-94051264,94052269-94052430,94053648-94053755,94054429-94054536,94054922-94054975,94055062-94055169,94055310-94055363,94055735-94055842,94056320-94056373,94056500-94056607,94056939-94057197,94057605-94057789,94058500-94058742,94059559-94060544 LRG_2p1 94024344 94059705
LRG_3t1 COL3A1 2 1 189839099 189877472 189839099-189839294,189849486-189849688,189849923-189849973,189850391-189850504,189851785-189851865,189852807-189852860,189853316-189853369,189854122-189854175,189854822-189854875,189855033-189855086,189855730-189855783,189856213-189856257,189856395-189856448,189856910-189856954,189857613-189857666,189858087-189858185,189858764-189858808,189858960-189859058,189859267-189859320,189859450-189859557,189859772-189859825,189860418-189860516,189860851-189860904,189861124-189861222,189861891-189861944,189862062-189862115,189862426-189862479,189862992-189863045,189863400-189863444,189864011-189864109,189864196-189864303,189864568-189864621,189866123-189866176,189866262-189866315,189867024-189867077,189867681-189867788,189868137-189868190,189868460-189868513,189868708-189868869,189868983-189869090,189870076-189870183,189870932-189870985,189871071-189871178,189871663-189871716,189872226-189872333,189872611-189872664,189872761-189872868,189873650-189873947,189874904-189875091,189875374-189875616,189876354-189877472 LRG_3p1 189839216 189876500
LRG_4t1 CRTAP 3 1 33155450 33189265 33155450-33156040,33161836-33161985,33165900-33166071,33171431-33171559,33174047-33174192,33175674-33175757,33183887-33189265 LRG_4p1 33155570 33183940
LRG_5t1 P3H1 1 -1 43212006 43232755 43212006-43212523,43212943-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p1 43212368 43232642
LRG_5t2 P3H1 1 -1 43212006 43232755 43212006-43212523,43212924-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p2 43212504 43232642
LRG_5t3 P3H1 1 -1 43212006 43232755 43212006-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p3 43212583 43232642
LRG_6t1 ATP1A2 1 1 160085548 160113381 160085548-160085663,160090696-160090800,160090982-160091041,160093003-160093206,160093733-160093846,160094086-160094220,160094926-160095043,160097342-160097610,160098442-160098640,160098770-160098879,160099056-160099190,160099892-160100081,160100212-160100387,160104274-160104410,160104935-160105085,160105224-160105392,160105629-160105783,160106037-160106160,160106360-160106505,160106691-160106821,160109430-160109531,160109683-160109774,160111084-160113381 LRG_6p1 160085652 160111112
LRG_160t1 PIGA X -1 15337573 15353676 15337573-15339894,15342787-15342993,15343142-15343274,15344036-15344168,15349338-15350114,15353623-15353676 LRG_160p1 15339628 15350052
LRG_161t1 PMS2 7 -1 6012870 6048737 6012870-6013173,6017219-6017388,6018227-6018327,6022455-6022622,6026390-6027251,6029431-6029586,6031604-6031688,6035165-6035264,6036957-6037054,6038739-6038906,6042084-6042267,6043321-6043423,6043603-6043689,6045523-6045662,6048628-6048737 LRG_161p1 6013030 6048650
LRG_162t1 PRKDC 8 -1 48685669 48872743 48685669-48686938,48689405-48689544,48690247-48690435,48691020-48691221,48691289-48691360,48691565-48691654,48694723-48694815,48694939-48695159,48696303-48696370,48697674-48697878,48701467-48701610,48701712-48701799,48706851-48707062,48710798-48710958,48711771-48711951,48713354-48713547,48715867-48716041,48719698-48719887,48730011-48730122,48731963-48732071,48733280-48733504,48734165-48734353,48736419-48736557,48739217-48739422,48740729-48740908,48743166-48743297,48744375-48744487,48746757-48746957,48748899-48749088,48749773-48749980,48751709-48751807,48752577-48752750,48761715-48761864,48761940-48762064,48765234-48765345,48766644-48766775,48767783-48767934,48769717-48769860,48771077-48771196,48771410-48771547,48772172-48772320,48773460-48773532,48774623-48774688,48774934-48775102,48775960-48776138,48777117-48777324,48790285-48790412,48792052-48792219,48793977-48794081,48794473-48794658,48798505-48798708,48800108-48800266,48801079-48801211,48801575-48801783,48802818-48803041,48805700-48805947,48809721-48809854,48811030-48811129,48812933-48813027,48815129-48815355,48817429-48817536,48824970-48825122,48826461-48826624,48827888-48827978,48830837-48830943,48839754-48839913,48840331-48840450,48841652-48841738,48842413-48842572,48843232-48843347,48845580-48845732,48846525-48846650,48847569-48847618,48848292-48848460,48848913-48849077,48852111-48852257,48855769-48855926,48856413-48856443,48856534-48856589,48866180-48866279,48866367-48866479,48866898-48867006,48868434-48868508,48869731-48869823,48869915-48869991,48872533-48872743 LRG_162p1 48686734 48872686
LRG_163t1 RMRP 9 -1 35657748 35658015 35657748-35658015
LRG_164t1 STIM1 11 1 3876933 4114440 3876933-3877639,3988782-3988912,4045103-4045217,4076756-4076867,4080511-4080626,4091256-4091433,4095732-4095909,4103414-4103581,4104112-4104212,4104493-4104728,4107707-4107773,4112512-4114440 LRG_164p1 3877501 4113028
LRG_165t1 STXBP2 19 1 7701991 7712759 7701991-7702072,7703612-7703661,7703905-7703986,7704617-7704693,7705617-7705695,7705786-7705889,7706591-7706739,7706920-7707004,7707089-7707219,7707315-7707422,7707652-7707709,7707869-7707934,7708051-7708131,7709500-7709638,7710083-7710192,7711135-7711230,7712048-7712133,7712240-7712397,7712611-7712759 LRG_165p1 7702036 7712696
LRG_166t1 TAP1 6 -1 32812986 32821748 32812986-32813562,32814845-32814981,32815290-32815452,32815696-32815869,32816429-32816617,32816767-32816895,32818097-32818294,32818721-32818926,32819886-32820016,32820165-32820279,32820816-32821748 LRG_166p1 32813356 32821593
LRG_167t1 TAP2 6 -1 32793187 32806547 32793187-32796811,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p1 32796632 32806010
LRG_167t2 TAP2 6 -1 32789610 32806547 32789610-32790095,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p2 32790066 32806010
LRG_168t1 THBD 20 -1 23026270 23030301 23026270-23030301 LRG_168p1 23028414 23030141
LRG_341t2 UNC119 17 -1 26873725 26879646 26873725-26874867,26875017-26875119,26875610-26875723,26879356-26879646 LRG_341p2 26874642 26879575
LRG_343t1 TERT 5 -1 1253282 1295162 1253282-1253946,1254483-1254620,1255402-1255526,1258713-1258774,1260589-1260715,1264519-1264707,1266579-1266650,1268635-1268748,1271234-1271319,1272300-1272395,1278756-1278911,1279406-1279585,1280273-1280453,1282544-1282739,1293428-1294781,1294886-1295162 LRG_343p1 1253843 1295104
LRG_344t1 KRAS 12 -1 25357723 25403865 25357723-25362845,25368371-25368494,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p1 25368375 25398318
LRG_344t2 KRAS 12 -1 25357723 25403865 25357723-25362845,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p2 25362729 25398318
LRG_345t1 NOP10 15 -1 34633917 34635362 34633917-34634309,34635221-34635362 LRG_345p1 34634169 34635274
LRG_346t1 NHP2 5 -1 177576464 177580961 177576464-177576839,177577889-177577994,177580492-177580561,177580659-177580961 LRG_346p1 177576714 177580818
LRG_347t1 TERC 3 -1 169482398 169482848 169482398-169482848
LRG_348t1 CR2 1 1 207627645 207663240 207627645-207627821,207639871-207640257,207641869-207642060,207642145-207642247,207642495-207642577,207643040-207643447,207644085-207644261,207644342-207644432,207644768-207644844,207646117-207646524,207646890-207647066,207647146-207647230,207647586-207647668,207648169-207648561,207649579-207649764,207651230-207651415,207652602-207652625,207653323-207653398,207658809-207658917,207662487-207663240 LRG_348p1 207627764 207658899
LRG_349t1 MASP1 3 -1 186964142 187009810 186964142-186965173,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p1 186965121 187009420
LRG_349t2 MASP1 3 -1 186933873 187009810 186933873-186938049,186938823-186938922,186940915-186940982,186943112-186943297,186944195-186944308,186947548-186947685,186959269-186959343,186961272-186961409,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p2 186937859 187009420
......@@ -448,3 +448,73 @@ def test_import_mapview(hg19):
assert new.orientation == 'forward'
assert new.reference_type == 'refseq'
assert new.source == 'ncbi'
def test_import_lrgmap(hg19):
original_count = TranscriptMapping.query.count()
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data',
'hg19.lrgmap.subset.txt')
lrgmap = codecs.open(path, encoding='utf-8')
lrgmap_count = sum(1 for line in lrgmap if not line.startswith('#'))
lrgmap.seek(0)
mapping.import_from_lrgmap_file(hg19, lrgmap)
# Two transcripts were already in, the rest is new:
# - LRG_1
# - LRG_348
assert TranscriptMapping.query.count() == original_count + lrgmap_count - 2
# No changes here.
unchanged = TranscriptMapping.query.filter_by(accession='LRG_1').one()
assert unchanged.start == 48261457
assert unchanged.stop == 48279000
assert unchanged.exon_starts == [
48261457, 48263139, 48263678, 48264001, 48264376, 48264845, 48265237,
48265457, 48265891, 48266103, 48266264, 48266529, 48266738, 48267040,
48267220, 48267362, 48267688, 48267904, 48268178, 48268744, 48269149,
48269341, 48269836, 48270001, 48270158, 48270355, 48271304, 48271491,
48271710, 48271934, 48272082, 48272408, 48272593, 48272795, 48272928,
48273284, 48273516, 48273675, 48273845, 48273978, 48274371, 48274541,
48275093, 48275310, 48275522, 48275794, 48276587, 48276779, 48276917,
48277114, 48278772]
assert unchanged.exon_stops == [
48263009, 48263381, 48263868, 48264283, 48264483, 48264898, 48265344,
48265510, 48265998, 48266156, 48266371, 48266636, 48266899, 48267093,
48267273, 48267469, 48267741, 48267957, 48268285, 48268851, 48269247,
48269385, 48269889, 48270054, 48270211, 48270408, 48271402, 48271544,
48271808, 48271987, 48272189, 48272461, 48272691, 48272839, 48273026,
48273337, 48273560, 48273728, 48273889, 48274031, 48274424, 48274594,
48275146, 48275363, 48275566, 48275865, 48276688, 48276814, 48276951,
48277308, 48279000]
assert unchanged.cds == (48262863, 48278874)
# We made some artificial changes to the lrgmap file here.
updated = TranscriptMapping.query.filter_by(accession='LRG_348').one()
assert updated.start == 207627645
assert updated.stop == 207663240
assert updated.exon_starts == [
207627645, 207639871, 207641869, 207642145, 207642495, 207643040,
207644085, 207644342, 207644768, 207646117, 207646890, 207647146,
207647586, 207648169, 207649579, 207651230, 207652602, 207653323,
207658809, 207662487]
assert updated.exon_stops == [
207627821, 207640257, 207642060, 207642247, 207642577, 207643447,
207644261, 207644432, 207644844, 207646524, 207647066, 207647230,
207647668, 207648561, 207649764, 207651415, 207652625, 207653398,
207658917, 207663240]
assert updated.cds == (207627764, 207658899)
# This is a new entry.
new = TranscriptMapping.query.filter_by(accession='LRG_163').one()
assert new.version is None
assert new.start == 35657748
assert new.stop == 35658015
assert new.exon_starts == [35657748]
assert new.exon_stops == [35658015]
assert new.cds is None
assert new.gene == 'RMRP'
assert new.orientation == 'reverse'
assert new.reference_type == 'lrg'
assert new.source == 'ebi'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment