Newer
Older
"""
Tests for the mutalyzer.mapping module.
"""
from mutalyzer.db.models import TranscriptMapping
from mutalyzer import mapping
# Some example positional coding/chromosomal mappings we use in the tests.
# (coding position, chromosomal position) pairs used by the LRG_1t1 tests
# (reference on the reverse strand of NC_000017.10).
LRG_1_T1_POSITIONS = [
    ('-150', 48279024),
    ('-126', 48279000),
    ('-1', 48278875),
    ('1', 48278874),
    ('103', 48278772),
    ('103+5', 48278767),
    ('104-5', 48277313),
    ('104', 48277308),
    ('870', 48273878),
    ('4248', 48263139),
    ('4249', 48263009),
    ('4395', 48262863),
    ('*1', 48262862),
    ('*1406', 48261457),
    ('*1407', 48261456),
    ('*1417', 48261446),
]
# (coding position, chromosomal position) pairs used by the LRG_348t1 tests
# (reference on the forward strand of NC_000001.10).
LRG_348_T1_POSITIONS = [
    ('-150', 207627614),
    ('-119', 207627645),
    ('-1', 207627763),
    ('1', 207627764),
    ('58', 207627821),
    ('58+5', 207627826),
    ('59-5', 207639866),
    ('59', 207639871),
    ('3279', 207658899),
    ('*1', 207658900),
    ('*772', 207663240),
    ('*780', 207663248),
]
# Module-level pytest marker: every test in this module requests the
# 'hg19_transcript_mappings' fixture.
pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings')
@pytest.fixture
def converter(output, hg19):
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def test_converter(converter):
    """
    Basic round trip: coding description to chromosomal and back.
    """
    variant = 'NM_003002.2:c.274G>T'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000011.9:g.111959695G>T'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
    # Fix for r536 disabled the -u and +d convention, so we expect the n.
    # notation here and no longer 'NR_028383.1:c.1-u2173C>A'.
    assert 'NR_028383.1:n.-2173C>A' in descriptions
def test_converter_non_coding(converter):
    """
    Round trip starting from a variant on a non-coding transcript.
    """
    variant = 'NR_028383.1:n.-2173C>A'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000011.9:g.111959695G>T'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert 'NM_003002.2:c.274G>T' in descriptions
    # Fix for r536 disabled the -u and +d convention, so we expect the n.
    # notation here and no longer 'NR_028383.1:c.1-u2173C>A'.
    assert variant in descriptions
def test_converter_compound(converter):
    """
    Round trip with a compound (allele) variant.
    """
    variant = 'NM_003002.2:c.[274G>T;278A>G]'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000011.9:g.[111959695G>T;111959699A>G]'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
    assert 'NR_028383.1:n.[-2173C>A;-2177T>C]' in descriptions
def test_hla_cluster(converter):
    """
    Convert to primary assembly.

    Transcript NM_000500.5 is mapped to different chromosome locations,
    but we only want to see the primary assembly mapping to chromosome 6.

    See also bug #58.
    """
    # Todo: This test is bogus now that the fixture contains just the
    #   mapping to chromosome 6. We probably only get this mapping from our
    #   current source (NCBI seq_gene.md) anyway, so it is unclear where
    #   the other mappings came from in the past (not investigated).
    variant = 'NM_000500.5:c.92C>T'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000006.11:g.32006291C>T'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
def test_converter_del_length_reverse(converter):
    """
    Position converter on a deletion denoted by length, for transcripts
    located on the reverse strand.
    """
    chromosomal = 'NC_000022.10:g.51016285_51017117del123456789'
    descriptions = converter.chrom2c(chromosomal, 'list')

    # Fix for r536 disabled the -u and +d convention, so we no longer expect
    # 'NM_001145134.1:c.-138-u21_60del123456789' and
    # 'NR_021492.1:c.1-u5170_1-u4338del123456789'.
    assert 'NM_001145134.1:c.-159_60del123456789' in descriptions
    assert 'NR_021492.1:n.-5170_-4338del123456789' in descriptions
def test_S_Venkata_Suresh_Kumar(converter):
    """
    Correct mapping information on genes where the CDS start or stop lies
    exactly on the border of an exon.

    Bug reported February 24, 2012 by S Venkata Suresh Kumar.
    """
    chromosomal = 'NC_000001.10:g.115259837_115259837delT'
    descriptions = converter.chrom2c(chromosomal, 'list')

    assert 'NM_001007553.1:c.3863delA' not in descriptions
    assert 'NM_001007553.1:c.*953delA' in descriptions
    assert 'NM_001130523.1:c.*953delA' in descriptions
def test_S_Venkata_Suresh_Kumar_more(converter):
    """
    Another check of mapping information on genes where the CDS start or
    stop lies exactly on the border of an exon.

    Bug reported March 21, 2012 by S Venkata Suresh Kumar.
    """
    chromosomal = 'NC_000001.10:g.160012314_160012329del16'
    descriptions = converter.chrom2c(chromosomal, 'list')

    assert 'NM_002241.4:c.-27250-7_-27242del16' not in descriptions
    assert 'NM_002241.4:c.1-7_9del16' in descriptions
def test_range_order_forward_correct(converter):
    """
    Regular position converter call on the forward strand, in both
    directions. See Trac #95.
    """
    variant = 'NM_003002.2:c.-1_274del'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000011.9:g.111957631_111959695del'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
def test_range_order_forward_incorrect_c2chrom(output, converter):
    """
    A range in the wrong order on the forward strand yields no result and
    exactly one ERANGE message. See Trac #95.
    """
    chromosomal = converter.c2chrom('NM_003002.2:c.274_-1del')
    assert chromosomal is None

    range_errors = output.getMessagesWithErrorCode('ERANGE')
    assert len(range_errors) == 1
def test_range_order_reverse_correct(converter):
    """
    Regular position converter call on the reverse strand, in both
    directions. See Trac #95.
    """
    variant = 'NM_001162505.1:c.-1_40del'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000020.10:g.48770135_48770175del'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
def test_range_order_reverse_incorrect_c2chrom(output, converter):
    """
    A range in the wrong order on the reverse strand yields no result and
    exactly one ERANGE message. See Trac #95.
    """
    chromosomal = converter.c2chrom('NM_001162505.1:c.40_-1del')
    assert chromosomal is None

    range_errors = output.getMessagesWithErrorCode('ERANGE')
    assert len(range_errors) == 1
def test_range_order_incorrect_chrom2c(output, converter):
    """
    A chromosomal range in the wrong order yields no result and exactly one
    ERANGE message. See Trac #95.
    """
    chromosomal = 'NC_000011.9:g.111959695_111957631del'
    descriptions = converter.chrom2c(chromosomal, 'list')
    assert descriptions is None

    range_errors = output.getMessagesWithErrorCode('ERANGE')
    assert len(range_errors) == 1
def test_delins_large_ins_c2chrom(converter):
    """
    Delins with a multi-base insertion, c. to chromosomal (and back).
    """
    variant = 'NM_003002.2:c.274delinsTAAA'

    chromosomal = converter.c2chrom(variant)
    assert chromosomal == 'NC_000011.9:g.111959695delinsTAAA'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert variant in descriptions
def test_delins_large_ins_explicit_c2chrom(converter):
    """
    Delins with a multi-base insertion and an explicit deleted sequence,
    c. to chromosomal (and back).
    """
    chromosomal = converter.c2chrom('NM_003002.2:c.274delGinsTAAA')
    # The explicitly deleted base is not repeated in the result.
    assert chromosomal == 'NC_000011.9:g.111959695delinsTAAA'

    descriptions = converter.chrom2c(chromosomal, 'list')
    assert 'NM_003002.2:c.274delinsTAAA' in descriptions
def test_delins_large_ins_chrom2c(converter):
    """
    Delins with a multi-base insertion, chromosomal to c.
    """
    chromosomal = 'NC_000011.9:g.111959695delinsTAAA'
    descriptions = converter.chrom2c(chromosomal, 'list')

    assert 'NM_003002.2:c.274delinsTAAA' in descriptions
def test_delins_large_ins_explicit_chrom2c(converter):
    """
    Delins with a multi-base insertion and an explicit deleted sequence,
    chromosomal to c.
    """
    chromosomal = 'NC_000011.9:g.111959695delGinsTAAA'
    descriptions = converter.chrom2c(chromosomal, 'list')

    # The explicitly deleted base is not repeated in the result.
    assert 'NM_003002.2:c.274delinsTAAA' in descriptions
def test_chrm_chrom2c(converter):
    """
    Mitochondrial m. description to c.
    """
    mitochondrial = 'NC_012920.1:m.12030del'
    descriptions = converter.chrom2c(mitochondrial, 'list')

    assert 'NC_012920.1(ND4_v001):c.1271del' in descriptions
def test_chrm_name_chrom2c(converter):
    """
    Mitochondrial m. description given by chromosome name, to c.
    """
    # The chromosome name is first passed through correctChrVariant.
    corrected = converter.correctChrVariant('chrM:m.12030del')
    descriptions = converter.chrom2c(corrected, 'list')

    assert 'NC_012920.1(ND4_v001):c.1271del' in descriptions
def test_chrm_c2chrom(converter):
    """
    Mitochondrial c. description to m.
    """
    variant = 'NC_012920.1(ND4_v001):c.1271del'
    mitochondrial = converter.c2chrom(variant)

    assert mitochondrial == 'NC_012920.1:m.12030del'
def test_nm_without_selector_chrom2c(converter):
    """
    NM reference without transcript selection, c. to g.

    Despite the test name, this exercises c2chrom.
    """
    variant = 'NM_017780.2:c.109A>T'
    chromosomal = converter.c2chrom(variant)

    assert chromosomal == 'NC_000008.10:g.61654100A>T'
def test_nm_with_selector_chrom2c(converter):
    """
    NM reference with transcript selection, c. to g.

    Despite the test name, this exercises c2chrom.
    """
    variant = 'NM_017780.2(CHD7_v001):c.109A>T'
    chromosomal = converter.c2chrom(variant)

    assert chromosomal == 'NC_000008.10:g.61654100A>T'
def test_nm_c2chrom_no_selector(converter):
    """
    Converting to an NM reference should never result in transcript
    selection.

    Despite the test name, this exercises chrom2c.
    """
    corrected = converter.correctChrVariant('NC_000008.10:g.61654100A>T')
    descriptions = converter.chrom2c(corrected, 'list')

    assert 'NM_017780.2:c.109A>T' in descriptions
def test_incorrect_selector_c2chrom(output, converter):
    """
    An incorrect selector yields exactly one EACCNOTINDB message.
    """
    variant = 'NM_017780.2(CHD8):c.109A>T'
    converter.c2chrom(variant)

    not_in_db = output.getMessagesWithErrorCode('EACCNOTINDB')
    assert len(not_in_db) == 1
def test_incorrect_selector_version_c2chrom(output, converter):
    """
    An incorrect selector version yields exactly one EACCNOTINDB message.
    """
    variant = 'NM_017780.2(CHD7_v002):c.109A>T'
    converter.c2chrom(variant)

    not_in_db = output.getMessagesWithErrorCode('EACCNOTINDB')
    assert len(not_in_db) == 1
def test_no_selector_version_c2chrom(converter):
    """
    A selector without a selector version is accepted.
    """
    variant = 'NM_017780.2(CHD7):c.109A>T'
    chromosomal = converter.c2chrom(variant)

    assert chromosomal == 'NC_000008.10:g.61654100A>T'
def test_incorrect_selector_no_selector_version_c2chrom(output, converter):
    """
    An incorrect selector without a selector version yields exactly one
    EACCNOTINDB message.
    """
    # NOTE(review): this input is identical to the one used in
    # test_incorrect_selector_c2chrom -- confirm whether one of the two was
    # meant to include a selector version.
    variant = 'NM_017780.2(CHD8):c.109A>T'
    converter.c2chrom(variant)

    not_in_db = output.getMessagesWithErrorCode('EACCNOTINDB')
    assert len(not_in_db) == 1
def test_ins_seq_chrom2c(converter):
    """
    Insertion of a sequence (chrom2c).
    """
    chromosomal = 'NC_000011.9:g.111957482_111957483insGAT'
    descriptions = converter.chrom2c(chromosomal, 'list')

    assert 'NM_003002.2:c.-150_-149insGAT' in descriptions
    assert 'NM_012459.2:c.10_11insATC' in descriptions
def test_ins_seq_seq(converter):
    """
    Insertion of two sequences (chrom2c).
    """
    chromosomal = 'NC_000011.9:g.111957482_111957483ins[GAT;AAA]'
    descriptions = converter.chrom2c(chromosomal, 'list')

    assert 'NM_003002.2:c.-150_-149ins[GAT;AAA]' in descriptions
    assert 'NM_012459.2:c.10_11ins[TTT;ATC]' in descriptions
def test_ins_seq_c2chrom_reverse(converter):
    """
    Insertion of a sequence on the reverse strand (c2chrom).
    """
    variant = 'NM_012459.2:c.10_11insATC'
    chromosomal = converter.c2chrom(variant)

    assert chromosomal == 'NC_000011.9:g.111957482_111957483insGAT'
def test_ins_seq_seq_c2chrom_reverse(converter):
    """
    Insertion of two sequences on the reverse strand (c2chrom).
    """
    variant = 'NM_012459.2:c.10_11ins[TTT;ATC]'
    chromosomal = converter.c2chrom(variant)

    assert chromosomal == 'NC_000011.9:g.111957482_111957483ins[GAT;AAA]'
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
@pytest.mark.parametrize('coding,chromosomal', LRG_1_T1_POSITIONS)
def test_lrg_1t1_c2chrom(converter, coding, chromosomal):
    """
    Conversion from an LRG reference on the reverse strand.
    """
    variant = 'LRG_1t1:c.%sdel' % coding
    expected = 'NC_000017.10:g.%ddel' % chromosomal

    assert converter.c2chrom(variant) == expected
@pytest.mark.parametrize('coding,chromosomal', LRG_1_T1_POSITIONS)
def test_lrg_1t1_chrom2c(converter, coding, chromosomal):
    """
    Conversion to an LRG reference on the reverse strand.
    """
    variant = 'NC_000017.10:g.%ddel' % chromosomal
    descriptions = converter.chrom2c(variant, 'list')

    expected = 'LRG_1t1:c.%sdel' % coding
    assert expected in descriptions
@pytest.mark.parametrize('coding,chromosomal', LRG_348_T1_POSITIONS)
def test_lrg_348t1_c2chrom(converter, coding, chromosomal):
    """
    Conversion from an LRG reference on the forward strand.
    """
    variant = 'LRG_348t1:c.%sdel' % coding
    expected = 'NC_000001.10:g.%ddel' % chromosomal

    assert converter.c2chrom(variant) == expected
@pytest.mark.parametrize('coding,chromosomal', LRG_348_T1_POSITIONS)
def test_lrg_348t1_chrom2c(converter, coding, chromosomal):
    """
    Conversion to an LRG reference on the forward strand.
    """
    variant = 'NC_000001.10:g.%ddel' % chromosomal
    descriptions = converter.chrom2c(variant, 'list')

    expected = 'LRG_348t1:c.%sdel' % coding
    assert expected in descriptions
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
def test_import_mapview(hg19):
    """
    Import transcript mappings from a mapview file and check the result.

    The number of 'RNA' lines in the file carrying the requested group label
    is counted first. The file is then imported and we check the resulting
    number of transcript mappings as well as one unchanged, one updated and
    one newly created entry.
    """
    original_count = TranscriptMapping.query.count()
    group_label = 'GRCh37.p13-Primary Assembly'
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data',
                        'hg19.chr11.111771755-112247252.seq_gene.sorted.md')

    # Use a context manager so the file is closed again, even when counting
    # or importing raises.
    with codecs.open(path, encoding='utf-8') as mapview:
        # Split every line just once; column 12 is compared to the group
        # label and column 11 to 'RNA'.
        mapview_count = sum(
            1 for fields in (line.split('\t') for line in mapview)
            if fields[12] == group_label and fields[11] == 'RNA')

        # Counting consumed the file, so rewind before importing.
        mapview.seek(0)
        mapping.import_from_mapview_file(hg19, mapview, group_label)

    # Two transcripts were already in, the rest is new:
    # - NR_028383.1
    # - NM_012459.2
    assert TranscriptMapping.query.count() == original_count + mapview_count - 2

    # No changes here.
    unchanged = TranscriptMapping.query.filter_by(accession='NM_012459').one()
    assert unchanged.start == 111955524
    assert unchanged.stop == 111957522
    assert unchanged.exon_starts == [111955524, 111957364]
    assert unchanged.exon_stops == [111956186, 111957522]
    assert unchanged.cds == (111956019, 111957492)

    # We made some artificial changes to the mapview file here.
    updated = TranscriptMapping.query.filter_by(accession='NR_028383').one()
    assert updated.start == 111955524
    assert updated.stop == 111957525
    assert updated.exon_starts == [111955524, 111956700, 111957364]
    assert updated.exon_stops == [111956180, 111957034, 111957525]

    # This is a new entry.
    new = TranscriptMapping.query.filter_by(accession='NM_000317').one()
    assert new.version == 2
    assert new.start == 112097088
    assert new.stop == 112104696
    assert new.exon_starts == [112097088, 112099317, 112100931, 112101349,
                               112103886, 112104155]
    assert new.exon_stops == [112097249, 112099396, 112100953, 112101405,
                              112103956, 112104696]
    assert new.cds == (112097167, 112104278)
    assert new.gene == 'PTS'
    assert new.orientation == 'forward'
    assert new.reference_type == 'refseq'
    assert new.source == 'ncbi'