variantchecker.py 90.3 KB
Newer Older
1
"""
2
3
4
The HGVS variant nomenclature checker.

Entrypoint is the check_variant() function.
5

6
7
8
9
10
11
Notes about naming positions:
* CDS -> use start/stop
* splice sites or exons -> acceptor/donor
* translation -> begin/end
* any range of bases -> first/last
* interbase position (if two numbers are used) -> before/after
Vermaat's avatar
Vermaat committed
12
13
14
15

Notes about string representations:
* All variant descriptions and their parts are unicode strings
* All reference sequences (and their mutated version) are Bio.Seq.Seq objects
16
17
"""

18

Vermaat's avatar
Vermaat committed
19
20
21
from __future__ import unicode_literals

from operator import attrgetter
22

Vermaat's avatar
Vermaat committed
23
from Bio.Data import CodonTable
24
from Bio.Alphabet import IUPAC
25
26
from Bio.Alphabet import DNAAlphabet
from Bio.Alphabet import ProteinAlphabet
Vermaat's avatar
Vermaat committed
27
from Bio.Alphabet import _verify_alphabet
28

29
from mutalyzer import util
30
from mutalyzer.db.models import Assembly
31
from mutalyzer.grammar import Grammar
Vermaat's avatar
Vermaat committed
32
from mutalyzer.mutator import Mutator
Vermaat's avatar
Vermaat committed
33
from mutalyzer.mapping import Converter
34
35
from mutalyzer import Retriever
from mutalyzer import GenRecord
Mihai's avatar
Mihai committed
36
from mutalyzer.nc_db import get_nc_record, get_chromosome_ids
37
from datetime import datetime
38

39
# Exceptions used (privately) in this module.
Vermaat's avatar
Vermaat committed
40
class _VariantError(Exception): pass
41
42
class _RawVariantError(_VariantError): pass
class _UnknownPositionError(_RawVariantError): pass
43
44
45
46
class _NotDNAError(_RawVariantError): pass
class _PositionsNotConsecutiveError(_RawVariantError): pass
class _LengthMismatchError(_RawVariantError): pass
class _ReferenceMismatchError(_RawVariantError): pass
Vermaat's avatar
Vermaat committed
47
class _RangeInsertionError(_RawVariantError): pass
48
class _OffsetSignError(_RawVariantError):
49
50
51
52
    def __init__(self, main, offset, acceptor):
        self.main = main
        self.offset = offset
        self.acceptor = acceptor
53
class _OffsetNotFromBoundaryError(_RawVariantError):
54
55
    def __init__(self, main):
        self.main = main
56
57
58
59
60
61
class _InvalidExonError(_RawVariantError):
    def __init__(self, exon):
        self.exon = exon
class _InvalidIntronError(_RawVariantError):
    def __init__(self, intron):
        self.intron = intron
Vermaat's avatar
Vermaat committed
62
63


Vermaat's avatar
Vermaat committed
64
def _is_coding_intronic(loc):
65
    """
Vermaat's avatar
Vermaat committed
66
    Check whether a location is an intronic c. position.
67

68
69
    @arg loc: A location from the Parser module.
    @type loc: pyparsing.ParseResults
70

Vermaat's avatar
Vermaat committed
71
    @return: True if the location is c. intronic, False otherwise.
72
73
74
75
76
77
78
79
80
    @rtype: boolean
    """
    if not loc:
        return False
    if not loc.PtLoc:
        return False
    if not loc.PtLoc.Offset:
        return False
    return True
Vermaat's avatar
Vermaat committed
81
#_is_coding_intronic
82

83
84

def _check_intronic_position(main, offset, transcript):
85
    """
86
87
    Check whether a c. position is really in an intron: The main coordinate
    must be a splice site and the offset coordinate must have the correct
88
    sign. Raise _RawVariantError exception if this check fails.
Vermaat's avatar
Vermaat committed
89

90
    @arg main: Main coordinate of the position.
91
    @type main: int
92
    @arg offset: Offset coordinate of the position.
93
    @type offset: int
94
95
    @arg transcript: Transcript under scrutiny.
    @type transcript: object
96
97
98
99

    @raise _OffsetNotFromBoundary: An offset from a non-boundary position
                                   was used.
    @raise _OffsetSignError: Offset from exon boundary has the wrong sign.
100
101

    @todo: Check if the offset is really in the flanking intron.
102
    """
103
104
105
106
107
108
109
110
111
112
    main_g = transcript.CM.x2g(main, 0)
    sites = transcript.CM.RNA

    if offset:
        oriented_offset = offset * transcript.CM.orientation
        try:
            i = sites.index(main_g)
            if not i % 2:
                # Splice acceptor, so sign must be -.
                if oriented_offset > 0:
113
114
115
116
                    raise _OffsetSignError(
                        transcript.CM.int2main(main),
                        transcript.CM.int2offset((main, offset)),
                        True)
117
118
119
            else:
                # Splice donor, so sign must be +.
                if oriented_offset < 0:
120
121
122
123
                    raise _OffsetSignError(
                        transcript.CM.int2main(main),
                        transcript.CM.int2offset((main, offset)),
                        False)
124
125
        except ValueError:
            # The main coordinate is not a splice site.
126
            raise _OffsetNotFromBoundaryError(transcript.CM.int2main(main))
127
#_check_intronic_position
128

129

Vermaat's avatar
Vermaat committed
130
def _check_argument(argument, reference, first, last, output):
131
    """
132
133
    Do several checks for the optional argument of a variant. Raise a
    _RawVariantError exception if the checks fail.
134

Vermaat's avatar
Vermaat committed
135
136
    @arg argument: The optional argument.
    @type argument: unicode
Vermaat's avatar
Vermaat committed
137
    @arg reference: The reference sequence.
Vermaat's avatar
Vermaat committed
138
    @type reference: Bio.Seq.Seq
Vermaat's avatar
Vermaat committed
139
140
141
142
143
144
    @arg first: Start position of the variant.
    @type first: int
    @arg last: End position of the variant.
    @type last: int
    @arg output: The Output object.
    @type output: mutalyzer.Output.Output
145

146
147
    @raise _LengthMismatchError: The argument is a length, but it does not
                                 match the given range length.
Vermaat's avatar
Vermaat committed
148
149
150
    @raise _NotDNAError: The argument should be DNA, but it is not.
    @raise _ReferenceMismatchError: The argument is DNA, but it does not
                                    match the given reference.
151
    """
Vermaat's avatar
Vermaat committed
152
153
    if not argument:
        # The argument is optional, if it is not present, it is correct.
154
        return
Vermaat's avatar
Vermaat committed
155
156
157
158
159

    if argument.isdigit():
        # If it is a digit (3_9del7 for example), the digit must be equal to
        # the length of the given range.
        length = int(argument)
160
        interval = last - first + 1
Vermaat's avatar
Vermaat committed
161
162
163
164
        if length != interval:
            output.addMessage(__file__, 3, 'EARGLEN',
                              'The length (%i) differed from that of the ' \
                              'range (%i).' % (length, interval))
165
            raise _LengthMismatchError()
Vermaat's avatar
Vermaat committed
166
167
168
169
170
    else:
        # If it is not a digit, it muse be DNA.
        if not util.is_dna(argument):
            output.addMessage(__file__, 4, 'ENODNA',
                              'Invalid letters in argument.')
171
            raise _NotDNAError()
Vermaat's avatar
Vermaat committed
172
        # And the DNA must match the reference sequence.
Vermaat's avatar
Vermaat committed
173
174
        reference_slice = unicode(reference[first - 1:last])
        if reference_slice != argument:
Vermaat's avatar
Vermaat committed
175
176
177
178
179
180
            # Todo: Be more informative.
            output.addMessage(__file__, 3, 'EREF',
                              '%s not found at position %s, found %s ' \
                              'instead.' % (argument,
                                            util.format_range(first, last),
                                            reference_slice))
181
            raise _ReferenceMismatchError()
Vermaat's avatar
Vermaat committed
182
#_check_argument
183

Laros's avatar
Laros committed
184

185
def _add_batch_output(O):
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
186
    """
187
    Format the results to a batch output.
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
188

189
190
    Filter the mutalyzer output and reformat it for use in the batch system
    as output object 'batchDone'.
191

192
193
    @arg O: The Output object
    @type O: Modules.Output.Output
Gerben Stouten's avatar
Gerben Stouten committed
194

195
    @todo: More documentation.
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
196
    """
Gerben Stouten's avatar
Gerben Stouten committed
197
198
199
200
201
202
203
    goi, toi = O.getOutput("geneSymbol")[-1] # Two strings [can be empty]
    tList   = []                             # Temporary List
    tDescr  = []                             # Temporary Descr

    reference = O.getOutput("reference")[-1]
    recordType = O.getOutput("recordType")[0]
    descriptions = O.getOutput("NewDescriptions")
204
        #iName, jName, mType, cDescr, pDescr, gAcc, cAcc, pAcc,
Gerben Stouten's avatar
Gerben Stouten committed
205
206
207
208
209
        #fullDescr, fullpDescr

    if len(descriptions) == 0:
        #No descriptions generated [unlikely]
        return
Gerben Stouten's avatar
Gerben Stouten committed
210
211
212
    if O.Summary()[0]:
        #There were errors during the run, return.
        return
Gerben Stouten's avatar
Gerben Stouten committed
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
    for descr in descriptions:
        if goi in descr[0] and toi in descr[1]: # Gene and Transcript
            if tDescr:
                # Already inserted a value in the tDescr
                tDescr, tList = [], descriptions
                break
            tDescr = descr

    tList = descriptions

    var = O.getOutput("variant")[-1]

    # Generate output
    outputline = ""
    if tDescr: #Filtering worked, only one Description left
        (gName, trName, mType, cDescr,
            pDescr, gAcc, cAcc, pAcc, fullD, fullpD) = tDescr

        gene = "%s_v%.3i" % (gName, int(trName))

        outputline += "%s\t%s\t%s\t" % (reference, gene, var)

        #Add genomic Description
        outputline += "%s\t" % O.getOutput("gDescription")[0]

        #Add coding Description & protein Description
        outputline += "%s\t%s\t" % (cDescr, pDescr)

        gc = cDescr and "%s:%s" % (gene, cDescr)
        gp = pDescr and "%s:%s" % (gene, pDescr)

        #Add mutation with GeneSymbols
        outputline += "%s\t%s\t" % (gc, gp)

        #Add References, should get genomic ref from parsed data
        if recordType == "LRG":
            gAcc = reference
        if recordType == "GB":
            geno = ["NC", "NG", "AC", "NT", "NW", "NZ", "NS"]
            for g in geno:
                if reference.startswith(g):
                    gAcc = reference
                    break
        outputline += "%s\t%s\t%s\t" % (gAcc or "", cAcc or "", pAcc or "")
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
257
258

    else:
Gerben Stouten's avatar
Gerben Stouten committed
259
260
261
262
263
264
        outputline += "\t"*11

    #Add list of affected transcripts "|" seperator
    if tList:
        outputline += "%s\t" % "|".join(e[-2] for e in tList)
        outputline += "%s\t" % "|".join(e[-1] for e in tList)
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
265
    else:
Gerben Stouten's avatar
Gerben Stouten committed
266
267
        outputline += "\t"*2

268
269
270
271
272
273
274
275
276
277
278
    # Add effects on restriction sites as two columns (created and deleted).
    # The value for each column is a semicolon-separated list of
    # comma-separated lists: for each raw variant, a list of restriction
    # sites.
    sites_created = []
    sites_deleted = []
    for variant in O.getOutput("restrictionSites"):
        sites_created.append(','.join(variant[0]))
        sites_deleted.append(','.join(variant[1]))
    outputline += "%s\t%s\t" % (';'.join(sites_created), ';'.join(sites_deleted))

Gerben Stouten's avatar
Gerben Stouten committed
279
280
281
282
283
    #Link naar additional info:
    #outputline+="http://localhost/mutalyzer2/redirect?mutationName=%s" %\
    #        "todovariant"

    O.addOutput("batchDone", outputline)
284
#_add_batch_output
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
285

Laros's avatar
Laros committed
286

287
def apply_substitution(position, original, substitute, mutator, record, O):
288
    """
289
290
291
292
293
294
    Do a semantic check for a substitution, do the actual substitution and
    give it a name.

    @arg position: Genomic location of the substitution.
    @type position: int
    @arg original: Nucleotide in the reference sequence.
Vermaat's avatar
Vermaat committed
295
    @type original: unicode
296
    @arg substitute: Nucleotide in the mutated sequence.
Vermaat's avatar
Vermaat committed
297
    @type substitute: unicode
Vermaat's avatar
Vermaat committed
298
299
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
300
301
302
303
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
304

305
    @raise _NotDNAError: Invalid (non-DNA) letter in input.
306
    """
Vermaat's avatar
Vermaat committed
307
    if not util.is_dna(substitute):
308
309
        O.addMessage(__file__, 4, 'ENODNA', 'Invalid letter in input')
        raise _NotDNAError()
Laros's avatar
Laros committed
310

311
    if original == substitute:
312
        O.addMessage(__file__, 2, 'WNOCHANGE',
313
314
                     'No mutation given (%c>%c) at position %i.' % \
                     (original, substitute, position))
315
        return
316

317
    mutator.substitution(position, substitute)
318

Vermaat's avatar
Vermaat committed
319
    record.name(position, position, 'subst', unicode(mutator.orig[position - 1]),
320
321
                substitute, None)
#apply_substitution
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
322

323

324
325
def apply_deletion_duplication(first, last, type, mutator, record, O,
                               first_fuzzy=False, last_fuzzy=False):
326
    """
327
328
329
    Do a semantic check for a deletion or duplication, do the actual
    deletion/duplication and give it a name.

330
331
332
333
    @arg first: Genomic start position of the del/dup.
    @type first: int
    @arg last: Genomic end position of the del/dup.
    @type last: int
334
    @arg type: The variant type (del or dup).
Vermaat's avatar
Vermaat committed
335
    @type type: unicode
Vermaat's avatar
Vermaat committed
336
337
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
338
339
340
341
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
342
343
344
345
346
347
348

    @kwarg first_fuzzy: Denotes that the start position is fuzzy (e.g. in the
        case of an unknown offset in c. notation).
    @type first_fuzzy: bool
    @kwarg last_fuzzy: Denotes that the end position is fuzzy (e.g. in the
        case of an unknown offset in c. notation).
    @type last_fuzzy: bool
349
    """
Vermaat's avatar
Vermaat committed
350
    reverse_roll, forward_roll = util.roll(mutator.orig, first, last)
351

352
    # In the case of RNA, check if we roll over a splice site. If so, make
Vermaat's avatar
Vermaat committed
353
354
355
356
357
    # the roll shorter, just up to the splice site. (Effectively this always
    # means we roll over two splice sites, since they are adjacent.)
    # We only have to consider the forward roll, since RNA reference
    # sequences are always orientated in correspondence with the transcript.
    original_forward_roll = forward_roll
358
    if record.record.molType != 'g':
359
360
        # Todo: Do we assume .geneList[0].transcriptList[0] is the selected
        # transcript here?? Why not use record.current_transcript?
Vermaat's avatar
Vermaat committed
361
362
363
        splice_sites = record.record.geneList[0].transcriptList[0] \
                       .mRNA.positionList
        for acceptor, donor in util.grouper(splice_sites):
364
365
366
            # Note that acceptor and donor splice sites both point to the
            # first, respectively last, position of the exon, so they are
            # both at different sides of the boundary.
Vermaat's avatar
Vermaat committed
367
368
            if last < acceptor and last + forward_roll >= acceptor:
                forward_roll = acceptor - 1 - last
369
                break
Vermaat's avatar
Vermaat committed
370
371
            if last <= donor and last + forward_roll > donor:
                forward_roll = donor - last
372
373
                break

374
375
376
377
378
    # Did we select a transcript on the reverse strand?
    transcript = record.current_transcript()
    reverse_strand = transcript and transcript.CM.orientation == -1

    if forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
379
380
381
        new_first = first + forward_roll
        new_stop = last + forward_roll
        O.addMessage(__file__, 2, 'WROLLFORWARD',
382
            'Sequence "%s" at position %s was given, however, ' \
Vermaat's avatar
Vermaat committed
383
384
            'the HGVS notation prescribes that on the forward strand ' \
            'it should be "%s" at position %s.' % (
Vermaat's avatar
Vermaat committed
385
            util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
Vermaat's avatar
Vermaat committed
386
            util.format_range(first, last),
Vermaat's avatar
Vermaat committed
387
            util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])),
Vermaat's avatar
Vermaat committed
388
            util.format_range(new_first, new_stop)))
389

390
    if forward_roll != original_forward_roll and not reverse_strand:
391
        # The original roll was decreased because it crossed a splice site.
Vermaat's avatar
Vermaat committed
392
393
        incorrect_first = first + original_forward_roll
        incorrect_stop = last + original_forward_roll
394
395
396
        O.addMessage(__file__, 1, 'IROLLBACK',
            'Sequence "%s" at position %s was not corrected to "%s" at ' \
            'position %s, since they reside in different exons.' % (
Vermaat's avatar
Vermaat committed
397
            util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
Vermaat's avatar
Vermaat committed
398
            util.format_range(first, last),
Vermaat's avatar
Vermaat committed
399
            util.visualise_sequence(unicode(mutator.orig[incorrect_first - 1:incorrect_stop])),
Vermaat's avatar
Vermaat committed
400
            util.format_range(incorrect_first, incorrect_stop)))
401

402
    if reverse_roll and reverse_strand:
Vermaat's avatar
Vermaat committed
403
404
405
406
407
408
        new_first = first - reverse_roll
        new_stop = last - reverse_roll
        O.addMessage(__file__, 2, 'WROLLREVERSE',
            'Sequence "%s" at position %s was given, however, ' \
            'the HGVS notation prescribes that on the reverse strand ' \
            'it should be "%s" at position %s.' % (
Vermaat's avatar
Vermaat committed
409
            util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
Vermaat's avatar
Vermaat committed
410
            util.format_range(first, last),
Vermaat's avatar
Vermaat committed
411
            util.visualise_sequence(unicode(mutator.orig[new_first - 1:new_stop])),
Vermaat's avatar
Vermaat committed
412
413
414
415
            util.format_range(new_first, new_stop)))

    # We don't go through the trouble of visualising the *corrected* variant
    # and are happy with visualising what the user gave us.
416
    if type == 'del':
417
        mutator.deletion(first, last)
Vermaat's avatar
Vermaat committed
418
    else:
419
        mutator.duplication(first, last)
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
420

421
422
423
    record.name(first, last, type, '', '', (reverse_roll, forward_roll),
                start_fuzzy=first_fuzzy,
                stop_fuzzy=last_fuzzy)
424
425
426
#apply_deletion_duplication


427
def apply_inversion(first, last, mutator, record, O):
428
    """
429
430
431
    Do a semantic check for an inversion, do the actual inversion, and give
    it a name.

432
433
434
435
    @arg first: Genomic start position of the inversion.
    @type first: int
    @arg last: Genomic end position of the inversion.
    @type last: int
Vermaat's avatar
Vermaat committed
436
437
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
438
439
440
441
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
442
    """
Vermaat's avatar
Vermaat committed
443
    snoop = util.palinsnoop(unicode(mutator.orig[first - 1:last]))
Laros's avatar
Laros committed
444

445
446
    if snoop:
        # We have a reverse-complement-palindromic prefix.
447
        if snoop == -1 :
448
449
450
451
452
            # Actually, not just a prefix, but the entire selected sequence is
            # a 'palindrome'.
            O.addMessage(__file__, 2, 'WNOCHANGE',
                'Sequence "%s" at position %i_%i is a palindrome ' \
                '(its own reverse complement).' % (
Vermaat's avatar
Vermaat committed
453
                util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
454
                first, last))
455
            return
456
457
458
459
460
461
        else:
            O.addMessage(__file__, 2, 'WNOTMINIMAL',
                'Sequence "%s" at position %i_%i is a partial ' \
                'palindrome (the first %i nucleotide(s) are the reverse ' \
                'complement of the last one(s)), the HGVS notation ' \
                'prescribes that it should be "%s" at position %i_%i.' % (
Vermaat's avatar
Vermaat committed
462
                util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
463
                first, last, snoop,
464
                util.visualise_sequence(
Vermaat's avatar
Vermaat committed
465
                    unicode(mutator.orig[first + snoop - 1: last - snoop])),
466
467
468
                first + snoop, last - snoop))
            first += snoop
            last -= snoop
469

470
    mutator.inversion(first, last)
471

472
    if first == last:
473
        O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \
474
            '%i is actually a substitution.' % first)
Vermaat's avatar
Vermaat committed
475
476
        record.name(first, first, 'subst', unicode(mutator.orig[first - 1]),
                    util.reverse_complement(unicode(mutator.orig[first - 1])), None)
477
    else :
478
        record.name(first, last, 'inv', '', '', None)
479
#apply_inversion
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
480

481

Mihai's avatar
Mihai committed
482
def apply_insertion(before, after, s, mutator, record, O, original_reftype):
483
    """
484
485
    Do a semantic check for an insertion, do the actual insertion, and give
    it a name.
486

487
488
489
490
    @arg before: Genomic position before the insertion.
    @type before: int
    @arg after: Genomic position after the insertion.
    @type after: int
491
    @arg s: Nucleotides to be inserted.
Vermaat's avatar
Vermaat committed
492
    @type s: nucleotide
Vermaat's avatar
Vermaat committed
493
494
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
495
496
497
498
499
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output

500
501
502
    @raise _NotDNAError: Invalid (non-DNA) letter in input.
    @raise _PositionsNotConsecutiveError: Positions {before} and {after} are
                                          not consecutive.
503
    """
504
    if before + 1 != after:
505
        O.addMessage(__file__, 3, 'EINSRANGE',
506
            '%i and %i are not consecutive positions.' % (before, after))
507
        raise _PositionsNotConsecutiveError()
508

Vermaat's avatar
Vermaat committed
509
    if not s or not util.is_dna(s):
510
511
        O.addMessage(__file__, 3, 'EUNKVAR', 'Although the syntax of this ' \
            'variant is correct, the effect can not be analysed.')
512
        raise _NotDNAError()
Laros's avatar
Laros committed
513

514
515
    insertion_length = len(s)

Vermaat's avatar
Vermaat committed
516
517
    # We don't go through the trouble of visualising the *corrected* variant
    # and are happy with visualising what the user gave us.
518
    mutator.insertion(before, s)
Vermaat's avatar
Vermaat committed
519

520
521
    new_before = mutator.shift(before)
    new_stop = mutator.shift(before) + insertion_length
522

Vermaat's avatar
Vermaat committed
523
    reverse_roll, forward_roll = util.roll(mutator.mutated, new_before + 1, new_stop)
524

525
    # In the case of RNA, check if we roll over a splice site. If so, make
Vermaat's avatar
Vermaat committed
526
527
528
529
    # the roll shorter, just up to the splice site. (Effectively this always
    # means we roll over two splice sites, since they are adjacent.)
    # We only have to consider the forward roll, since RNA reference
    # sequences are always orientated in correspondence with the transcript.
Vermaat's avatar
Vermaat committed
530
    # Todo: There's probably a better way to check if we are on RNA.
Vermaat's avatar
Vermaat committed
531
    original_forward_roll = forward_roll
Vermaat's avatar
Vermaat committed
532
533
534
535
536
537
    if record.record.molType != 'g':
        try:
            splice_sites = record.record.geneList[0].transcriptList[0] \
                .mRNA.positionList
        except IndexError:
            splice_sites = []
Vermaat's avatar
Vermaat committed
538
        for acceptor, donor in util.grouper(splice_sites):
539
540
541
            # Note that acceptor and donor splice sites both point to the
            # first, respectively last, position of the exon, so they are
            # both at different sides of the boundary.
Vermaat's avatar
Vermaat committed
542
543
            if new_stop < acceptor and new_stop + forward_roll >= acceptor:
                forward_roll = acceptor - 1 - new_stop
544
                break
Vermaat's avatar
Vermaat committed
545
546
            if new_stop <= donor and new_stop + forward_roll > donor:
                forward_roll = donor - new_stop
547
548
                break

Mihai's avatar
Mihai committed
549
550
    transcript = record.current_transcript()

Vermaat's avatar
Vermaat committed
551
    if reverse_roll + forward_roll >= insertion_length:
552
        # Todo: Could there also be a IROLLBACK message in this case?
Mihai's avatar
Mihai committed
553
554
        original_before = before
        original_after = after
Vermaat's avatar
Vermaat committed
555
        after += forward_roll - 1
556
        before = after - insertion_length + 1
Mihai's avatar
Mihai committed
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
        if before == after:
            corrected_position = _get_position(before, transcript,
                                               original_reftype)
        else:
            corrected_position = _get_position(before, transcript,
                                               original_reftype), after
        position = _get_position(original_before, transcript,
                                 original_reftype, original_after)
        O.addMessage(__file__, 2, 'WINSDUP',
                     'Insertion of {} at position {} was given, however, '
                     'the HGVS notation prescribes that it should be a '
                     'duplication of {} at position {}.'.format(
                         s, position,
                         unicode(mutator.mutated[new_before + forward_roll:
                                                 new_stop + forward_roll]),
                         corrected_position))
573
        record.name(before, after, 'dup', '', '',
Vermaat's avatar
Vermaat committed
574
575
576
                    (reverse_roll + forward_roll - insertion_length, 0))
        return

577
578
579
580
    # Did we select a transcript on the reverse strand?
    reverse_strand = transcript and transcript.CM.orientation == -1

    if forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
581
582
583
584
585
        O.addMessage(__file__, 2, 'WROLLFORWARD', 'Insertion of %s at position ' \
            '%i_%i was given, however, the HGVS notation prescribes ' \
            'that on the forward strand it should be an insertion of %s ' \
            'at position %i_%i.' % (
            s, before, before + 1,
Vermaat's avatar
Vermaat committed
586
            unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]),
Vermaat's avatar
Vermaat committed
587
588
            new_before + forward_roll, new_before + forward_roll + 1))

589
    if forward_roll != original_forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
590
591
592
593
594
595
        # The original roll was decreased because it crossed a splice site.
        O.addMessage(__file__, 1, 'IROLLBACK',
            'Insertion of %s at position %i_%i was not corrected to an ' \
            'insertion of %s at position %i_%i, since they reside in ' \
            'different exons.' % (
            s, before, before + 1,
Vermaat's avatar
Vermaat committed
596
            unicode(mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll]),
Vermaat's avatar
Vermaat committed
597
598
            new_before + original_forward_roll, new_before + original_forward_roll + 1))

599
    if reverse_roll and reverse_strand:
Vermaat's avatar
Vermaat committed
600
601
602
603
604
        O.addMessage(__file__, 2, 'WROLLREVERSE', 'Insertion of %s at position ' \
            '%i_%i was given, however, the HGVS notation prescribes ' \
            'that on the reverse strand it should be an insertion of %s ' \
            'at position %i_%i.' % (
            s, before, before + 1,
Vermaat's avatar
Vermaat committed
605
            unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]),
Vermaat's avatar
Vermaat committed
606
607
608
            new_before - reverse_roll, (new_before - reverse_roll) + 1))

    record.name(before, before + 1, 'ins',
Vermaat's avatar
Vermaat committed
609
                unicode(mutator.mutated[new_before + forward_roll:new_stop + forward_roll]),
Vermaat's avatar
Vermaat committed
610
                '', (reverse_roll, forward_roll),
Vermaat's avatar
Vermaat committed
611
                unicode(mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll]))
612
613
#apply_insertion

614

Mihai's avatar
Mihai committed
615
def apply_delins(first, last, insert, mutator, record, output, original_reftype):
616
    """
617
618
619
620
621
622
623
    Do a semantic check for an delins, do the actual delins, and give
    it a name.

    @arg first: Genomic start position of the delins.
    @type first: int
    @arg last: Genomic end position of the delins.
    @type last: int
624
    @arg insert: Sequence to insert.
Vermaat's avatar
Vermaat committed
625
    @type insert: unicode
Vermaat's avatar
Vermaat committed
626
627
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
628
629
630
631
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg output: The Output object.
    @type output: Modules.Output.Output
632
    """
Vermaat's avatar
Vermaat committed
633
    delete = unicode(mutator.orig[first - 1:last])
634

Vermaat's avatar
Vermaat committed
635
    if delete == insert:
636
637
638
        output.addMessage(__file__, 2, 'WNOCHANGE',
                          'Sequence "%s" at position %i_%i is identical to ' \
                          'the variant.' % (
Vermaat's avatar
Vermaat committed
639
                              util.visualise_sequence(delete), first, last))
640
        return
641

Vermaat's avatar
Vermaat committed
642
    delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert)
643

644
    if not len(delete_trimmed):
645
646
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually an insertion.')
647
        apply_insertion(first + lcp - 1, first + lcp, insert_trimmed, mutator,
Mihai's avatar
Mihai committed
648
                        record, output, original_reftype)
649
650
        return

651
    if len(delete_trimmed) == 1 and len(insert_trimmed) == 1:
652
653
            output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                              'is actually a substitution.')
654
655
            apply_substitution(first + lcp, delete_trimmed, insert_trimmed,
                               mutator, record, output)
656
            return
657

658
    if not len(insert_trimmed):
659
660
661
662
663
664
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually a deletion.')
        apply_deletion_duplication(first + lcp, last - lcs, 'del',
                                   mutator, record, output)
        return

Vermaat's avatar
Vermaat committed
665
    if util.reverse_complement(delete_trimmed) == insert_trimmed:
666
667
668
669
670
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually an inversion.')
        apply_inversion(first + lcp, last - lcs, mutator,
                        record, output)
        return
671

672
    if len(insert) != len(insert_trimmed):
673
674
675
676
        output.addMessage(__file__, 2, 'WNOTMINIMAL',
                'Sequence "%s" at position %i_%i has the same prefix or ' \
                'suffix as the inserted sequence "%s". The HGVS notation ' \
                'prescribes that it should be "%s" at position %i_%i.' % (
Vermaat's avatar
Vermaat committed
677
                util.visualise_sequence(unicode(mutator.orig[first - 1:last])),
678
                first, last, insert, insert_trimmed, first + lcp, last - lcs))
679

680
    mutator.delins(first + lcp, last - lcs, insert_trimmed)
681

682
    record.name(first + lcp, last - lcs, 'delins', insert_trimmed, '', None)
683
684
685
#apply_delins


686
def _get_offset(location, main_genomic, sites, output):
687
688
689
690
691
692
    """
    Convert the offset coordinate in a location (from the Parser) to an
    integer.

    @arg location: A location.
    @type location: pyparsing.ParseResults
693
694
695
696
697
698
    @arg main_genomic: Genomic main position to which the offset belongs.
    @type main_genomic: int
    @arg sites: List of splice sites.
    @type sites: list
    @arg output: The Output object.
    @type output: Modules.Output.Output
699
700
701
702
703

    @return: Integer representation of the offset coordinate.
    @rtype: int
    """
    if location.Offset :
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
        if location.Offset == '?' :
            try:
                # Todo: If it removes CDS start, don't do protein translation.
                # Todo: Wrt orientation, perhaps always go to splice site
                #     locations via the crossmapper...
                # Todo: Also check if +? and -? are correctly used.
                # Todo: Exactly centering might not be so nice, since the center
                # might be closer to a neighbouring exon, making a+xxx from b-?
                # and vice versa. This might not be fixed directly by doing a
                # center +/- 1 because there might be rolling. Ideally we
                # disable rolling entirely for these positions...
                #
                # Note that the code below might be a bit confusing, especially
                # considering reverse strand transcripts. Magically, it works
                # for both orientations.
                i = sites.index(main_genomic)
                if i == 0:
                    # Before first exon (or last on the reverse strand).
                    offset = main_genomic / 2
                elif i == len(sites) - 1:
                    # After last exon (or first on the reverse strand).
                    # Todo: Get length of reference, and calculate a sensible
                    # offset.
                    #
                    # We now use that 2000 is the default downstream length,
                    # but of course this is bogus on the reverse strand and
                    # just a hack anyway.
                    offset = 1000
                elif i % 2 == 0:
                    # Acceptor site (or donor on the reverse strand).
                    offset = abs(main_genomic - sites[i - 1]) / 2 - 1
                else:
                    # Donor site (or acceptor on the reverse strand).
                    offset = abs(sites[i + 1] - main_genomic) / 2 - 1
                # Todo: We would like to use the c. position in this message.
                output.addMessage(__file__, 1, "IUNKNOWNOFFSET", "Unknown offset " \
                                  "relative to %s interpreted as middle of " \
                                  "flanking intron." % main_genomic)
            except ValueError:
                # Todo: This means we don't get an error if the main position
                # was not on an exon boundary. We should return something else
                # than 0 I guess.
                #return 0  # This is highly debatable.
                # Any non-zero value will do.
                return 1
        else:
            offset = int(location.Offset)
751
752
753
754
755
756
757
758
        if location.OffSgn == '-' :
            return -offset
        return offset

    return 0
#_get_offset


759
def _intronic_to_genomic(location, transcript):
760
    """
761
762
763
764
765
766
767
768
769
    Get genomic location from IVS location.

    @arg location: A location.
    @type location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo

    @return: Genomic location represented by given IVS location.
    @rtype: int
770
771

    @raise _InvalidIntronError: Intron does not exist.
772
773
774
775
    """
    ivs_number = int(location.IVSNumber)

    if ivs_number < 1 or ivs_number > transcript.CM.numberOfIntrons():
776
        raise _InvalidIntronError(ivs_number)
777
778
779
780
781
782
783
784
785
786
787

    if location.OffSgn == '+':
        return transcript.CM.getSpliceSite(ivs_number * 2 - 1) + \
               transcript.CM.orientation * int(location.Offset)
    else:
        return transcript.CM.getSpliceSite(ivs_number * 2) - \
               transcript.CM.orientation * int(location.Offset)
#_intronic_to_genomic


def _exonic_to_genomic(location, transcript) :
788
    """
789
790
791
792
793
794
    Get genomic range from EX location.

    @arg location: A location.
    @type location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo
795

796
797
798
799
    @return: A tuple of:
        - first: Genomic start location represented by given EX location.
        - last:  Genomic end location represented by given EX location.
    @rtype: tuple(int, int)
800

801
802
    @raise _InvalidExonError: Exon does not exist.

803
804
805
806
807
    @todo: We probably want to treat this as a-?_b+?, so take the centers of
           flanking exons.
    """
    first_exon = int(location.EXNumberStart)
    if first_exon < 1 or first_exon > transcript.CM.numberOfExons():
808
        raise _InvalidExonError(first_exon)
809
    first = transcript.CM.getSpliceSite(first_exon * 2 - 2)
810

811
812
813
    if location.EXNumberStop:
        last_exon = int(location.EXNumberStop)
        if last_exon < 1 or last_exon > transcript.CM.numberOfExons():
814
            raise _InvalidExonError(last_exon)
815
816
817
818
819
820
        last = transcript.CM.getSpliceSite(last_exon * 2 - 1)
    else:
        last = transcript.CM.getSpliceSite(first_exon * 2 - 1)

    return first, last
#_exonic_to_genomic
821
822


823
def _genomic_to_genomic(first_location, last_location):
824
    """
825
826
827
828
829
830
831
832
833
834
835
    Get genomic range from parsed genomic location.

    @arg first_location: The start location (g.) of the variant.
    @type first_location: pyparsing.ParseResults
    @arg last_location: The start location (g.) of the variant.
    @type last_location: pyparsing.ParseResults

    @return: A tuple of:
        - first: Genomic start location represented by given location.
        - last:  Genomic end location represented by given location.
    @rtype: tuple(int, int)
836
837
838

    @raise _UnknownPositionError: Unknown positions were used.
    @raise _RawVariantError: Range cannot be intepreted.
839
    """
840
841
842
    if not first_location.Main or not last_location.Main:
        # Unknown positions are denoted by the '?' character.
        raise _UnknownPositionError()
843

844
    if not first_location.Main.isdigit() or not last_location.Main.isdigit():
845
        raise _RawVariantError()
846

847
    first = int(first_location.Main)
848
    last = int(last_location.Main)
849

850
    return first, last
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
851

852

853
def _coding_to_genomic(first_location, last_location, transcript, output):
854
855
856
857
858
859
860
861
862
    """
    Get genomic range from parsed c. location.

    @arg first_location: The start location (c.) of the variant.
    @type first_location: pyparsing.ParseResults
    @arg last_location: The start location (c.) of the variant.
    @type last_location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo
863
864
    @arg output: The Output object.
    @type output: Modules.Output.Output
865
866
867
868
869

    @return: A tuple of:
        - first: Genomic start location represented by given location.
        - last:  Genomic end location represented by given location.
    @rtype: tuple(int, int)
870

871
872
873
874
875
    @raise _UnknownPositionError: Unknown positions were used.
    @raise _RawVariantError: Range cannot be interpreted.
    @raise _OffsetNotFromBoundary: An offset from a non-boundary position
                                    was used.
    @raise _OffsetSignError: Offset from exon boundary has the wrong sign.
876
    """
877
878
879
    if not first_location.Main or not last_location.Main:
        # Unknown positions are denoted by the '?' character.
        raise _UnknownPositionError()
880

881
    if not first_location.Main.isdigit() or not last_location.Main.isdigit():
882
        raise _RawVariantError()
883
884
885

    first_main = transcript.CM.main2int(first_location.MainSgn + \
                                        first_location.Main)
886
887
888
    first_main_genomic = transcript.CM.x2g(first_main, 0)
    first_offset = _get_offset(first_location, first_main_genomic,
                               transcript.CM.RNA, output)
889
890
891

    last_main = transcript.CM.main2int(last_location.MainSgn + \
                                       last_location.Main)
892
893
894
    last_main_genomic = transcript.CM.x2g(last_main, 0)
    last_offset = _get_offset(last_location, last_main_genomic,
                              transcript.CM.RNA, output)
895

896
    # These raise _RawVariantError exceptions on invalid positions.
897
898
899
900
901
    _check_intronic_position(first_main, first_offset, transcript)
    _check_intronic_position(last_main, last_offset, transcript)

    first = transcript.CM.x2g(first_main, first_offset)
    last = transcript.CM.x2g(last_main, last_offset)
902
903
904
905
906
907
908
909

    if transcript.CM.orientation == -1:
        first, last = last, first

    return first, last
#_coding_to_genomic


910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
def process_protein_variant(mutator, variant, record, output):
    """
    Process a protein variant.

    We raise _RawVariantError if there was something seriously in error
    with the raw variant (but it is still okay to process other raw
    variants). We might (don't at the moment) also raise _VariantError,
    meaning to stop processing the entire variant.

    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
    @arg variant: A parsed raw (simple, noncompound) variant.
    @type variant: pyparsing.ParseResults
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg output: The Output object.
    @type output: Modules.Output.Output

    @raise _RawVariantError: Cannot process this raw variant.
    @raise _VariantError: Cannot further process the entire variant.
    """
    variant, original_description = variant.RawVar, variant[-1]

    # List of amino acids.
    arguments = variant.Args

    if variant.MutationType == 'del':
        first = int(variant.Main)
        last = first
        mutator.deletion(first, last)
        record.name(first, last, 'del', '', '', (0, 0))

    # Todo: This is really unimplemented, the above is just a start to support
    #     protein level descriptions.


Mihai's avatar
Mihai committed
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
def _get_nm_in_nc_tip(mol_type, transcript_id):
    if mol_type == 'n':
        chromosome_ids = get_chromosome_ids(transcript_id)
        examples = ', '.join(['{}({})'.format(
            c_id, transcript_id) for c_id in chromosome_ids])
        if examples:
            return ' Tip: make use of a genomic reference sequence, ' \
                   'e.g., {}.'.format(examples)
        else:
            return ' Tip: make use of a genomic reference sequence ' \
                   'like NC_*(NM_*).'
    return ''


def _get_position(p_start, transcript, reftype, p_end=None):
    # Note that this still does not provide the original location.
    # For 'NG_012337.1(SDHD_v001):c.53-22274del' it provides 'c.-21325'
    if transcript:
        if reftype == 'c':
            if p_end:
                return 'c.{}_{} (g.{}_{})'.format(transcript.CM.g2c(p_start),
                                                  transcript.CM.g2c(p_end),
                                                  p_start, p_end)
            else:
                return 'c.{} (g.{})'.format(transcript.CM.g2c(p_start),
                                            p_start)
        elif reftype == 'n':
            return 'n.{} (g.{})'.format(transcript.CM.tuple2string(
                transcript.CM.g2x(p_start)), p_start)
    return 'g.{}'.format(p_start)


978
def process_raw_variant(mutator, variant, record, transcript, output):
979
    """
980
    Process a raw variant.
981

Vermaat's avatar
Vermaat committed
982
983
984
985
986
    We raise _RawVariantError if there was something seriously in error
    with the raw variant (but it is still okay to process other raw
    variants). We might (don't at the moment) also raise _VariantError,
    meaning to stop processing the entire variant.

Vermaat's avatar
Vermaat committed
987
988
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
989
990
991
992
993
994
995
996
    @arg variant: A parsed raw (simple, noncompound) variant.
    @type variant: pyparsing.ParseResults
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg transcript: A transcript object.
    @type transcript: Modules.GenRecord.Locus
    @arg output: The Output object.
    @type output: Modules.Output.Output
997

998
999
    @raise _RawVariantError: Cannot process this raw variant.
    @raise _VariantError: Cannot further process the entire variant.
1000
    """