variantchecker.py 89.8 KB
Newer Older
1
"""
2
3
4
The HGVS variant nomenclature checker.

Entrypoint is the check_variant() function.
5

6
7
8
9
10
11
Notes about naming positions:
* CDS -> use start/stop
* splice sites or exons -> acceptor/donor
* translation -> begin/end
* any range of bases -> first/last
* interbase position (if two numbers are used) -> before/after
12
13
"""

14
15

from operator import itemgetter, attrgetter
16

17
import Bio
18
19
20
import Bio.Seq
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
21
22
from Bio.Alphabet import DNAAlphabet
from Bio.Alphabet import ProteinAlphabet
23

24
from mutalyzer import config
25
from mutalyzer import util
26
from mutalyzer.grammar import Grammar
Vermaat's avatar
Vermaat committed
27
from mutalyzer.mutator import Mutator
Vermaat's avatar
Vermaat committed
28
from mutalyzer.mapping import Converter
29
30
31
from mutalyzer import Retriever
from mutalyzer import GenRecord
from mutalyzer import Db
32
from monoseq import pprint_sequence, AnsiFormat, HtmlFormat
33

34

35
# Exceptions used (privately) in this module.
Vermaat's avatar
Vermaat committed
36
class _VariantError(Exception): pass
37
38
class _RawVariantError(_VariantError): pass
class _UnknownPositionError(_RawVariantError): pass
39
40
41
42
class _NotDNAError(_RawVariantError): pass
class _PositionsNotConsecutiveError(_RawVariantError): pass
class _LengthMismatchError(_RawVariantError): pass
class _ReferenceMismatchError(_RawVariantError): pass
Vermaat's avatar
Vermaat committed
43
class _RangeInsertionError(_RawVariantError): pass
44
class _OffsetSignError(_RawVariantError):
45
46
47
48
    def __init__(self, main, offset, acceptor):
        self.main = main
        self.offset = offset
        self.acceptor = acceptor
49
class _OffsetNotFromBoundaryError(_RawVariantError):
50
51
    def __init__(self, main):
        self.main = main
52
53
54
55
56
57
class _InvalidExonError(_RawVariantError):
    def __init__(self, exon):
        self.exon = exon
class _InvalidIntronError(_RawVariantError):
    def __init__(self, intron):
        self.intron = intron
58
59
60
61
aa_dict = {"Ala":"A", "Gly":"G", "Val":"V", "Leu":"L", "Ile":"I",
			      "Met":"M", "Phe":"F", "Asn":"N", "Gln":"Q", "Asp":"D",
                  "Glu":"E", "His":"H", "Lys":"K", "Arg":"R", "Ser":"S",
			      "Thr":"T", "Tyr":"Y", "Trp":"W", "Cys":"C", "Pro":"P", 
62
63
			      "Sec":"U", "Pyl":"O", "TERM":"Stop", "OTHER": "X", "Asx" : "B", 
                  "Glx" : "Z", "Xle" : "J"}
Vermaat's avatar
Vermaat committed
64
65


Vermaat's avatar
Vermaat committed
66
def _is_coding_intronic(loc):
67
    """
Vermaat's avatar
Vermaat committed
68
    Check whether a location is an intronic c. position.
69

70
71
    @arg loc: A location from the Parser module.
    @type loc: pyparsing.ParseResults
72

Vermaat's avatar
Vermaat committed
73
    @return: True if the location is c. intronic, False otherwise.
74
75
76
77
78
79
80
81
82
    @rtype: boolean
    """
    if not loc:
        return False
    if not loc.PtLoc:
        return False
    if not loc.PtLoc.Offset:
        return False
    return True
Vermaat's avatar
Vermaat committed
83
#_is_coding_intronic
84

85
86

def _check_intronic_position(main, offset, transcript):
87
    """
88
89
    Check whether a c. position is really in an intron: The main coordinate
    must be a splice site and the offset coordinate must have the correct
90
    sign. Raise _RawVariantError exception if this check fails.
Vermaat's avatar
Vermaat committed
91

92
    @arg main: Main coordinate of the position.
93
    @type main: int
94
    @arg offset: Offset coordinate of the position.
95
    @type offset: int
96
97
    @arg transcript: Transcript under scrutiny.
    @type transcript: object
98
99
100
101

    @raise _OffsetNotFromBoundary: An offset from a non-boundary position
                                   was used.
    @raise _OffsetSignError: Offset from exon boundary has the wrong sign.
102
103

    @todo: Check if the offset is really in the flanking intron.
104
    """
105
106
107
108
109
110
111
112
113
114
    main_g = transcript.CM.x2g(main, 0)
    sites = transcript.CM.RNA

    if offset:
        oriented_offset = offset * transcript.CM.orientation
        try:
            i = sites.index(main_g)
            if not i % 2:
                # Splice acceptor, so sign must be -.
                if oriented_offset > 0:
115
116
117
118
                    raise _OffsetSignError(
                        transcript.CM.int2main(main),
                        transcript.CM.int2offset((main, offset)),
                        True)
119
120
121
            else:
                # Splice donor, so sign must be +.
                if oriented_offset < 0:
122
123
124
125
                    raise _OffsetSignError(
                        transcript.CM.int2main(main),
                        transcript.CM.int2offset((main, offset)),
                        False)
126
127
        except ValueError:
            # The main coordinate is not a splice site.
128
            raise _OffsetNotFromBoundaryError(transcript.CM.int2main(main))
129
#_check_intronic_position
130

131

Vermaat's avatar
Vermaat committed
132
def _check_argument(argument, reference, first, last, output):
133
    """
134
135
    Do several checks for the optional argument of a variant. Raise a
    _RawVariantError exception if the checks fail.
136

Vermaat's avatar
Vermaat committed
137
138
139
140
141
142
143
144
145
146
    @arg reference: The reference sequence.
    @type reference: string
    @arg first: Start position of the variant.
    @type first: int
    @arg last: End position of the variant.
    @type last: int
    @arg argument: The optional argument.
    @type argument: string
    @arg output: The Output object.
    @type output: mutalyzer.Output.Output
147

148
149
    @raise _LengthMismatchError: The argument is a length, but it does not
                                 match the given range length.
Vermaat's avatar
Vermaat committed
150
151
152
    @raise _NotDNAError: The argument should be DNA, but it is not.
    @raise _ReferenceMismatchError: The argument is DNA, but it does not
                                    match the given reference.
153
    """
Vermaat's avatar
Vermaat committed
154
155
    if not argument:
        # The argument is optional, if it is not present, it is correct.
156
        return
Vermaat's avatar
Vermaat committed
157
158
159
160
161

    if argument.isdigit():
        # If it is a digit (3_9del7 for example), the digit must be equal to
        # the length of the given range.
        length = int(argument)
162
        interval = last - first + 1
Vermaat's avatar
Vermaat committed
163
164
165
166
        if length != interval:
            output.addMessage(__file__, 3, 'EARGLEN',
                              'The length (%i) differed from that of the ' \
                              'range (%i).' % (length, interval))
167
            raise _LengthMismatchError()
Vermaat's avatar
Vermaat committed
168
169
170
171
172
    else:
        # If it is not a digit, it muse be DNA.
        if not util.is_dna(argument):
            output.addMessage(__file__, 4, 'ENODNA',
                              'Invalid letters in argument.')
173
            raise _NotDNAError()
Vermaat's avatar
Vermaat committed
174
175
176
177
178
179
180
181
182
        # And the DNA must match the reference sequence.
        reference_slice = str(reference[first - 1:last])
        if reference_slice != str(argument):
            # Todo: Be more informative.
            output.addMessage(__file__, 3, 'EREF',
                              '%s not found at position %s, found %s ' \
                              'instead.' % (argument,
                                            util.format_range(first, last),
                                            reference_slice))
183
            raise _ReferenceMismatchError()
Vermaat's avatar
Vermaat committed
184
#_check_argument
185

Laros's avatar
Laros committed
186

187
def _add_batch_output(O):
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
188
    """
189
    Format the results to a batch output.
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
190

191
192
    Filter the mutalyzer output and reformat it for use in the batch system
    as output object 'batchDone'.
193

194
195
    @arg O: The Output object
    @type O: Modules.Output.Output
Gerben Stouten's avatar
Gerben Stouten committed
196

197
    @todo: More documentation.
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
198
    """
Gerben Stouten's avatar
Gerben Stouten committed
199
200
201
202
203
204
205
    goi, toi = O.getOutput("geneSymbol")[-1] # Two strings [can be empty]
    tList   = []                             # Temporary List
    tDescr  = []                             # Temporary Descr

    reference = O.getOutput("reference")[-1]
    recordType = O.getOutput("recordType")[0]
    descriptions = O.getOutput("NewDescriptions")
206
        #iName, jName, mType, cDescr, pDescr, gAcc, cAcc, pAcc,
Gerben Stouten's avatar
Gerben Stouten committed
207
208
209
210
211
        #fullDescr, fullpDescr

    if len(descriptions) == 0:
        #No descriptions generated [unlikely]
        return
Gerben Stouten's avatar
Gerben Stouten committed
212
213
214
    if O.Summary()[0]:
        #There were errors during the run, return.
        return
Gerben Stouten's avatar
Gerben Stouten committed
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
    for descr in descriptions:
        if goi in descr[0] and toi in descr[1]: # Gene and Transcript
            if tDescr:
                # Already inserted a value in the tDescr
                tDescr, tList = [], descriptions
                break
            tDescr = descr

    tList = descriptions

    var = O.getOutput("variant")[-1]

    # Generate output
    outputline = ""
    if tDescr: #Filtering worked, only one Description left
        (gName, trName, mType, cDescr,
            pDescr, gAcc, cAcc, pAcc, fullD, fullpD) = tDescr

        gene = "%s_v%.3i" % (gName, int(trName))

        outputline += "%s\t%s\t%s\t" % (reference, gene, var)

        #Add genomic Description
        outputline += "%s\t" % O.getOutput("gDescription")[0]

        #Add coding Description & protein Description
        outputline += "%s\t%s\t" % (cDescr, pDescr)

        gc = cDescr and "%s:%s" % (gene, cDescr)
        gp = pDescr and "%s:%s" % (gene, pDescr)

        #Add mutation with GeneSymbols
        outputline += "%s\t%s\t" % (gc, gp)

        #Add References, should get genomic ref from parsed data
        if recordType == "LRG":
            gAcc = reference
        if recordType == "GB":
            geno = ["NC", "NG", "AC", "NT", "NW", "NZ", "NS"]
            for g in geno:
                if reference.startswith(g):
                    gAcc = reference
                    break
        outputline += "%s\t%s\t%s\t" % (gAcc or "", cAcc or "", pAcc or "")
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
259
260

    else:
Gerben Stouten's avatar
Gerben Stouten committed
261
262
263
264
265
266
        outputline += "\t"*11

    #Add list of affected transcripts "|" seperator
    if tList:
        outputline += "%s\t" % "|".join(e[-2] for e in tList)
        outputline += "%s\t" % "|".join(e[-1] for e in tList)
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
267
    else:
Gerben Stouten's avatar
Gerben Stouten committed
268
269
        outputline += "\t"*2

270
271
272
273
274
275
276
277
278
279
280
    # Add effects on restriction sites as two columns (created and deleted).
    # The value for each column is a semicolon-separated list of
    # comma-separated lists: for each raw variant, a list of restriction
    # sites.
    sites_created = []
    sites_deleted = []
    for variant in O.getOutput("restrictionSites"):
        sites_created.append(','.join(variant[0]))
        sites_deleted.append(','.join(variant[1]))
    outputline += "%s\t%s\t" % (';'.join(sites_created), ';'.join(sites_deleted))

Gerben Stouten's avatar
Gerben Stouten committed
281
282
283
284
285
    #Link naar additional info:
    #outputline+="http://localhost/mutalyzer2/redirect?mutationName=%s" %\
    #        "todovariant"

    O.addOutput("batchDone", outputline)
286
#_add_batch_output
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
287

Laros's avatar
Laros committed
288

289
def apply_substitution(position, original, substitute, mutator, record, O):
290
    """
291
292
293
294
295
296
297
298
299
    Do a semantic check for a substitution, do the actual substitution and
    give it a name.

    @arg position: Genomic location of the substitution.
    @type position: int
    @arg original: Nucleotide in the reference sequence.
    @type original: string
    @arg substitute: Nucleotide in the mutated sequence.
    @type substitute: string
Vermaat's avatar
Vermaat committed
300
301
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
302
303
304
305
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
306

307
    @raise _NotDNAError: Invalid (non-DNA) letter in input.
308
    """
Vermaat's avatar
Vermaat committed
309
    if not util.is_dna(substitute):
310
311
        O.addMessage(__file__, 4, 'ENODNA', 'Invalid letter in input')
        raise _NotDNAError()
Laros's avatar
Laros committed
312

313
    if original == substitute:
314
        O.addMessage(__file__, 2, 'WNOCHANGE',
315
316
                     'No mutation given (%c>%c) at position %i.' % \
                     (original, substitute, position))
317
        return
318

319
    mutator.substitution(position, substitute)
320
321
322
323

    record.name(position, position, 'subst', mutator.orig[position - 1],
                substitute, None)
#apply_substitution
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
324

325

326
327
def apply_deletion_duplication(first, last, type, mutator, record, O,
                               first_fuzzy=False, last_fuzzy=False):
328
    """
329
330
331
    Do a semantic check for a deletion or duplication, do the actual
    deletion/duplication and give it a name.

332
333
334
335
    @arg first: Genomic start position of the del/dup.
    @type first: int
    @arg last: Genomic end position of the del/dup.
    @type last: int
336
337
    @arg type: The variant type (del or dup).
    @type type: string
Vermaat's avatar
Vermaat committed
338
339
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
340
341
342
343
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
344
345
346
347
348
349
350

    @kwarg first_fuzzy: Denotes that the start position is fuzzy (e.g. in the
        case of an unknown offset in c. notation).
    @type first_fuzzy: bool
    @kwarg last_fuzzy: Denotes that the end position is fuzzy (e.g. in the
        case of an unknown offset in c. notation).
    @type last_fuzzy: bool
351
    """
Vermaat's avatar
Vermaat committed
352
    reverse_roll, forward_roll = util.roll(mutator.orig, first, last)
353

354
    # In the case of RNA, check if we roll over a splice site. If so, make
Vermaat's avatar
Vermaat committed
355
356
357
358
359
    # the roll shorter, just up to the splice site. (Effectively this always
    # means we roll over two splice sites, since they are adjacent.)
    # We only have to consider the forward roll, since RNA reference
    # sequences are always orientated in correspondence with the transcript.
    original_forward_roll = forward_roll
360
    if record.record.molType != 'g':
361
362
        # Todo: Do we assume .geneList[0].transcriptList[0] is the selected
        # transcript here?? Why not use record.current_transcript?
Vermaat's avatar
Vermaat committed
363
364
365
        splice_sites = record.record.geneList[0].transcriptList[0] \
                       .mRNA.positionList
        for acceptor, donor in util.grouper(splice_sites):
366
367
368
            # Note that acceptor and donor splice sites both point to the
            # first, respectively last, position of the exon, so they are
            # both at different sides of the boundary.
Vermaat's avatar
Vermaat committed
369
370
            if last < acceptor and last + forward_roll >= acceptor:
                forward_roll = acceptor - 1 - last
371
                break
Vermaat's avatar
Vermaat committed
372
373
            if last <= donor and last + forward_roll > donor:
                forward_roll = donor - last
374
375
                break

376
377
378
379
380
    # Did we select a transcript on the reverse strand?
    transcript = record.current_transcript()
    reverse_strand = transcript and transcript.CM.orientation == -1

    if forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
381
382
383
        new_first = first + forward_roll
        new_stop = last + forward_roll
        O.addMessage(__file__, 2, 'WROLLFORWARD',
384
            'Sequence "%s" at position %s was given, however, ' \
Vermaat's avatar
Vermaat committed
385
386
            'the HGVS notation prescribes that on the forward strand ' \
            'it should be "%s" at position %s.' % (
387
388
389
            util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
390
            util.format_range(first, last),
391
392
393
            util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
394
            util.format_range(new_first, new_stop)))
395

396
    if forward_roll != original_forward_roll and not reverse_strand:
397
        # The original roll was decreased because it crossed a splice site.
Vermaat's avatar
Vermaat committed
398
399
        incorrect_first = first + original_forward_roll
        incorrect_stop = last + original_forward_roll
400
401
402
        O.addMessage(__file__, 1, 'IROLLBACK',
            'Sequence "%s" at position %s was not corrected to "%s" at ' \
            'position %s, since they reside in different exons.' % (
403
404
405
            util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
406
            util.format_range(first, last),
407
408
409
            util.visualise_sequence(str(mutator.orig[incorrect_first - 1:incorrect_stop]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
410
            util.format_range(incorrect_first, incorrect_stop)))
411

412
    if reverse_roll and reverse_strand:
Vermaat's avatar
Vermaat committed
413
414
415
416
417
418
        new_first = first - reverse_roll
        new_stop = last - reverse_roll
        O.addMessage(__file__, 2, 'WROLLREVERSE',
            'Sequence "%s" at position %s was given, however, ' \
            'the HGVS notation prescribes that on the reverse strand ' \
            'it should be "%s" at position %s.' % (
419
420
421
            util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
422
            util.format_range(first, last),
423
424
425
            util.visualise_sequence(str(mutator.orig[new_first - 1:new_stop]),
                                    config.get('maxvissize'),
                                    config.get('flankclipsize')),
Vermaat's avatar
Vermaat committed
426
427
428
429
            util.format_range(new_first, new_stop)))

    # We don't go through the trouble of visualising the *corrected* variant
    # and are happy with visualising what the user gave us.
430
    if type == 'del':
431
        mutator.deletion(first, last)
Vermaat's avatar
Vermaat committed
432
    else:
433
        mutator.duplication(first, last)
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
434

435
436
437
    record.name(first, last, type, '', '', (reverse_roll, forward_roll),
                start_fuzzy=first_fuzzy,
                stop_fuzzy=last_fuzzy)
438
439
440
#apply_deletion_duplication


441
def apply_inversion(first, last, mutator, record, O):
442
    """
443
444
445
    Do a semantic check for an inversion, do the actual inversion, and give
    it a name.

446
447
448
449
    @arg first: Genomic start position of the inversion.
    @type first: int
    @arg last: Genomic end position of the inversion.
    @type last: int
Vermaat's avatar
Vermaat committed
450
451
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
452
453
454
455
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output
456
    """
Vermaat's avatar
Vermaat committed
457
    snoop = util.palinsnoop(mutator.orig[first - 1:last])
Laros's avatar
Laros committed
458

459
460
    if snoop:
        # We have a reverse-complement-palindromic prefix.
461
        if snoop == -1 :
462
463
464
465
466
            # Actually, not just a prefix, but the entire selected sequence is
            # a 'palindrome'.
            O.addMessage(__file__, 2, 'WNOCHANGE',
                'Sequence "%s" at position %i_%i is a palindrome ' \
                '(its own reverse complement).' % (
467
468
469
                util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                        config.get('maxvissize'),
                                        config.get('flankclipsize')),
470
                first, last))
471
            return
472
473
474
475
476
477
        else:
            O.addMessage(__file__, 2, 'WNOTMINIMAL',
                'Sequence "%s" at position %i_%i is a partial ' \
                'palindrome (the first %i nucleotide(s) are the reverse ' \
                'complement of the last one(s)), the HGVS notation ' \
                'prescribes that it should be "%s" at position %i_%i.' % (
478
479
480
                util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                        config.get('maxvissize'),
                                        config.get('flankclipsize')),
481
                first, last, snoop,
482
483
484
                util.visualise_sequence(
                    str(mutator.orig[first + snoop - 1: last - snoop]),
                    config.get('maxvissize'), config.get('flankclipsize')),
485
486
487
                first + snoop, last - snoop))
            first += snoop
            last -= snoop
488

489
    mutator.inversion(first, last)
490

491
    if first == last:
492
        O.addMessage(__file__, 2, 'WWRONGTYPE', 'Inversion at position ' \
493
            '%i is actually a substitution.' % first)
494
495
        record.name(first, first, 'subst', mutator.orig[first - 1],
            Bio.Seq.reverse_complement(mutator.orig[first - 1]), None)
496
    else :
497
        record.name(first, last, 'inv', '', '', None)
498
#apply_inversion
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
499

500

501
def apply_insertion(before, after, s, mutator, record, O):
502
    """
503
504
    Do a semantic check for an insertion, do the actual insertion, and give
    it a name.
505

506
507
508
509
    @arg before: Genomic position before the insertion.
    @type before: int
    @arg after: Genomic position after the insertion.
    @type after: int
510
511
    @arg s: Nucleotides to be inserted.
    @type s: string
Vermaat's avatar
Vermaat committed
512
513
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
514
515
516
517
518
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg O: The Output object.
    @type O: Modules.Output.Output

519
520
521
    @raise _NotDNAError: Invalid (non-DNA) letter in input.
    @raise _PositionsNotConsecutiveError: Positions {before} and {after} are
                                          not consecutive.
522
    """
523
    if before + 1 != after:
524
        O.addMessage(__file__, 3, 'EINSRANGE',
525
            '%i and %i are not consecutive positions.' % (before, after))
526
        raise _PositionsNotConsecutiveError()
527

Vermaat's avatar
Vermaat committed
528
    if not s or not util.is_dna(s):
529
530
        O.addMessage(__file__, 3, 'EUNKVAR', 'Although the syntax of this ' \
            'variant is correct, the effect can not be analysed.')
531
        raise _NotDNAError()
Laros's avatar
Laros committed
532

533
534
    insertion_length = len(s)

Vermaat's avatar
Vermaat committed
535
536
    # We don't go through the trouble of visualising the *corrected* variant
    # and are happy with visualising what the user gave us.
537
    mutator.insertion(before, s)
Vermaat's avatar
Vermaat committed
538

539
540
    new_before = mutator.shift(before)
    new_stop = mutator.shift(before) + insertion_length
541

Vermaat's avatar
Vermaat committed
542
    reverse_roll, forward_roll = util.roll(mutator.mutated, new_before + 1, new_stop)
543

544
    # In the case of RNA, check if we roll over a splice site. If so, make
Vermaat's avatar
Vermaat committed
545
546
547
548
    # the roll shorter, just up to the splice site. (Effectively this always
    # means we roll over two splice sites, since they are adjacent.)
    # We only have to consider the forward roll, since RNA reference
    # sequences are always orientated in correspondence with the transcript.
Vermaat's avatar
Vermaat committed
549
    # Todo: There's probably a better way to check if we are on RNA.
Vermaat's avatar
Vermaat committed
550
    original_forward_roll = forward_roll
Vermaat's avatar
Vermaat committed
551
552
553
554
555
556
    if record.record.molType != 'g':
        try:
            splice_sites = record.record.geneList[0].transcriptList[0] \
                .mRNA.positionList
        except IndexError:
            splice_sites = []
Vermaat's avatar
Vermaat committed
557
        for acceptor, donor in util.grouper(splice_sites):
558
559
560
            # Note that acceptor and donor splice sites both point to the
            # first, respectively last, position of the exon, so they are
            # both at different sides of the boundary.
Vermaat's avatar
Vermaat committed
561
562
            if new_stop < acceptor and new_stop + forward_roll >= acceptor:
                forward_roll = acceptor - 1 - new_stop
563
                break
Vermaat's avatar
Vermaat committed
564
565
            if new_stop <= donor and new_stop + forward_roll > donor:
                forward_roll = donor - new_stop
566
567
                break

Vermaat's avatar
Vermaat committed
568
    if reverse_roll + forward_roll >= insertion_length:
569
570
571
572
573
        # Todo: Could there also be a IROLLBACK message in this case?
        O.addMessage(__file__, 2, 'WINSDUP',
            'Insertion of %s at position %i_%i was given, ' \
            'however, the HGVS notation prescribes that it should be a ' \
            'duplication of %s at position %i_%i.' % (
574
            s, before, before + 1,
Vermaat's avatar
Vermaat committed
575
            mutator.mutated[new_before + forward_roll:new_stop + forward_roll],
Vermaat's avatar
Vermaat committed
576
577
578
            before + forward_roll,
            before + forward_roll + insertion_length - 1))
        after += forward_roll - 1
579
580
        before = after - insertion_length + 1
        record.name(before, after, 'dup', '', '',
Vermaat's avatar
Vermaat committed
581
582
583
                    (reverse_roll + forward_roll - insertion_length, 0))
        return

584
585
586
587
588
    # Did we select a transcript on the reverse strand?
    transcript = record.current_transcript()
    reverse_strand = transcript and transcript.CM.orientation == -1

    if forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
589
590
591
592
593
594
595
596
        O.addMessage(__file__, 2, 'WROLLFORWARD', 'Insertion of %s at position ' \
            '%i_%i was given, however, the HGVS notation prescribes ' \
            'that on the forward strand it should be an insertion of %s ' \
            'at position %i_%i.' % (
            s, before, before + 1,
            mutator.mutated[new_before + forward_roll:new_stop + forward_roll],
            new_before + forward_roll, new_before + forward_roll + 1))

597
    if forward_roll != original_forward_roll and not reverse_strand:
Vermaat's avatar
Vermaat committed
598
599
600
601
602
603
604
605
606
        # The original roll was decreased because it crossed a splice site.
        O.addMessage(__file__, 1, 'IROLLBACK',
            'Insertion of %s at position %i_%i was not corrected to an ' \
            'insertion of %s at position %i_%i, since they reside in ' \
            'different exons.' % (
            s, before, before + 1,
            mutator.mutated[new_before + original_forward_roll:new_stop + original_forward_roll],
            new_before + original_forward_roll, new_before + original_forward_roll + 1))

607
    if reverse_roll and reverse_strand:
Vermaat's avatar
Vermaat committed
608
609
610
611
612
613
614
615
616
617
618
619
        O.addMessage(__file__, 2, 'WROLLREVERSE', 'Insertion of %s at position ' \
            '%i_%i was given, however, the HGVS notation prescribes ' \
            'that on the reverse strand it should be an insertion of %s ' \
            'at position %i_%i.' % (
            s, before, before + 1,
            mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll],
            new_before - reverse_roll, (new_before - reverse_roll) + 1))

    record.name(before, before + 1, 'ins',
                mutator.mutated[new_before + forward_roll:new_stop + forward_roll],
                '', (reverse_roll, forward_roll),
                mutator.mutated[new_before - reverse_roll:new_stop - reverse_roll])
620
621
#apply_insertion

622

623
def apply_delins(first, last, insert, mutator, record, output):
624
    """
625
626
627
628
629
630
631
    Do a semantic check for an delins, do the actual delins, and give
    it a name.

    @arg first: Genomic start position of the delins.
    @type first: int
    @arg last: Genomic end position of the delins.
    @type last: int
632
633
    @arg insert: Sequence to insert.
    @type insert: string
Vermaat's avatar
Vermaat committed
634
635
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
636
637
638
639
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg output: The Output object.
    @type output: Modules.Output.Output
640
    """
641
    delete = mutator.orig[first - 1:last]
642

643
    if str(delete) == str(insert):
644
645
646
        output.addMessage(__file__, 2, 'WNOCHANGE',
                          'Sequence "%s" at position %i_%i is identical to ' \
                          'the variant.' % (
647
648
649
650
                util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                        config.get('maxvissize'),
                                        config.get('flankclipsize')),
                first, last))
651
        return
652

Vermaat's avatar
Vermaat committed
653
    delete_trimmed, insert_trimmed, lcp, lcs = util.trim_common(delete, insert)
654

655
    if not len(delete_trimmed):
656
657
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually an insertion.')
658
        apply_insertion(first + lcp - 1, first + lcp, insert_trimmed, mutator,
659
660
661
                        record, output)
        return

662
    if len(delete_trimmed) == 1 and len(insert_trimmed) == 1:
663
664
            output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                              'is actually a substitution.')
665
666
            apply_substitution(first + lcp, delete_trimmed, insert_trimmed,
                               mutator, record, output)
667
            return
668

669
    if not len(insert_trimmed):
670
671
672
673
674
675
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually a deletion.')
        apply_deletion_duplication(first + lcp, last - lcs, 'del',
                                   mutator, record, output)
        return

676
    if str(Bio.Seq.reverse_complement(delete_trimmed)) == insert_trimmed:
677
678
679
680
681
        output.addMessage(__file__, 2, 'WWRONGTYPE', 'The given DelIns ' \
                          'is actually an inversion.')
        apply_inversion(first + lcp, last - lcs, mutator,
                        record, output)
        return
682

683
    if len(insert) != len(insert_trimmed):
684
685
686
687
        output.addMessage(__file__, 2, 'WNOTMINIMAL',
                'Sequence "%s" at position %i_%i has the same prefix or ' \
                'suffix as the inserted sequence "%s". The HGVS notation ' \
                'prescribes that it should be "%s" at position %i_%i.' % (
688
689
690
                util.visualise_sequence(str(mutator.orig[first - 1:last]),
                                        config.get('maxvissize'),
                                        config.get('flankclipsize')),
691
                first, last, insert, insert_trimmed, first + lcp, last - lcs))
692

693
    mutator.delins(first + lcp, last - lcs, insert_trimmed)
694

695
    record.name(first + lcp, last - lcs, 'delins', insert_trimmed, '', None)
696
697
698
#apply_delins


699
def _get_offset(location, main_genomic, sites, output):
700
701
702
703
704
705
    """
    Convert the offset coordinate in a location (from the Parser) to an
    integer.

    @arg location: A location.
    @type location: pyparsing.ParseResults
706
707
708
709
710
711
    @arg main_genomic: Genomic main position to which the offset belongs.
    @type main_genomic: int
    @arg sites: List of splice sites.
    @type sites: list
    @arg output: The Output object.
    @type output: Modules.Output.Output
712
713
714
715
716

    @return: Integer representation of the offset coordinate.
    @rtype: int
    """
    if location.Offset :
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
        if location.Offset == '?' :
            try:
                # Todo: If it removes CDS start, don't do protein translation.
                # Todo: Wrt orientation, perhaps always go to splice site
                #     locations via the crossmapper...
                # Todo: Also check if +? and -? are correctly used.
                # Todo: Exactly centering might not be so nice, since the center
                # might be closer to a neighbouring exon, making a+xxx from b-?
                # and vice versa. This might not be fixed directly by doing a
                # center +/- 1 because there might be rolling. Ideally we
                # disable rolling entirely for these positions...
                #
                # Note that the code below might be a bit confusing, especially
                # considering reverse strand transcripts. Magically, it works
                # for both orientations.
                i = sites.index(main_genomic)
                if i == 0:
                    # Before first exon (or last on the reverse strand).
                    offset = main_genomic / 2
                elif i == len(sites) - 1:
                    # After last exon (or first on the reverse strand).
                    # Todo: Get length of reference, and calculate a sensible
                    # offset.
                    #
                    # We now use that 2000 is the default downstream length,
                    # but of course this is bogus on the reverse strand and
                    # just a hack anyway.
                    offset = 1000
                elif i % 2 == 0:
                    # Acceptor site (or donor on the reverse strand).
                    offset = abs(main_genomic - sites[i - 1]) / 2 - 1
                else:
                    # Donor site (or acceptor on the reverse strand).
                    offset = abs(sites[i + 1] - main_genomic) / 2 - 1
                # Todo: We would like to use the c. position in this message.
                output.addMessage(__file__, 1, "IUNKNOWNOFFSET", "Unknown offset " \
                                  "relative to %s interpreted as middle of " \
                                  "flanking intron." % main_genomic)
            except ValueError:
                # Todo: This means we don't get an error if the main position
                # was not on an exon boundary. We should return something else
                # than 0 I guess.
                #return 0  # This is highly debatable.
                # Any non-zero value will do.
                return 1
        else:
            offset = int(location.Offset)
764
765
766
767
768
769
770
771
        if location.OffSgn == '-' :
            return -offset
        return offset

    return 0
#_get_offset


772
def _intronic_to_genomic(location, transcript):
773
    """
774
775
776
777
778
779
780
781
782
    Get genomic location from IVS location.

    @arg location: A location.
    @type location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo

    @return: Genomic location represented by given IVS location.
    @rtype: int
783
784

    @raise _InvalidIntronError: Intron does not exist.
785
786
787
788
    """
    ivs_number = int(location.IVSNumber)

    if ivs_number < 1 or ivs_number > transcript.CM.numberOfIntrons():
789
        raise _InvalidIntronError(ivs_number)
790
791
792
793
794
795
796
797
798
799
800

    if location.OffSgn == '+':
        return transcript.CM.getSpliceSite(ivs_number * 2 - 1) + \
               transcript.CM.orientation * int(location.Offset)
    else:
        return transcript.CM.getSpliceSite(ivs_number * 2) - \
               transcript.CM.orientation * int(location.Offset)
#_intronic_to_genomic


def _exonic_to_genomic(location, transcript) :
801
    """
802
803
804
805
806
807
    Get genomic range from EX location.

    @arg location: A location.
    @type location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo
808

809
810
811
812
    @return: A tuple of:
        - first: Genomic start location represented by given EX location.
        - last:  Genomic end location represented by given EX location.
    @rtype: tuple(int, int)
813

814
815
    @raise _InvalidExonError: Exon does not exist.

816
817
818
819
820
    @todo: We probably want to treat this as a-?_b+?, so take the centers of
           flanking exons.
    """
    first_exon = int(location.EXNumberStart)
    if first_exon < 1 or first_exon > transcript.CM.numberOfExons():
821
        raise _InvalidExonError(first_exon)
822
    first = transcript.CM.getSpliceSite(first_exon * 2 - 2)
823

824
825
826
    if location.EXNumberStop:
        last_exon = int(location.EXNumberStop)
        if last_exon < 1 or last_exon > transcript.CM.numberOfExons():
827
            raise _InvalidExonError(last_exon)
828
829
830
831
832
833
        last = transcript.CM.getSpliceSite(last_exon * 2 - 1)
    else:
        last = transcript.CM.getSpliceSite(first_exon * 2 - 1)

    return first, last
#_exonic_to_genomic
834
835


836
def _genomic_to_genomic(first_location, last_location):
837
    """
838
839
840
841
842
843
844
845
846
847
848
    Get genomic range from parsed genomic location.

    @arg first_location: The start location (g.) of the variant.
    @type first_location: pyparsing.ParseResults
    @arg last_location: The start location (g.) of the variant.
    @type last_location: pyparsing.ParseResults

    @return: A tuple of:
        - first: Genomic start location represented by given location.
        - last:  Genomic end location represented by given location.
    @rtype: tuple(int, int)
849
850
851

    @raise _UnknownPositionError: Unknown positions were used.
    @raise _RawVariantError: Range cannot be intepreted.
852
    """
853
854
855
    if not first_location.Main or not last_location.Main:
        # Unknown positions are denoted by the '?' character.
        raise _UnknownPositionError()
856

857
    if not first_location.Main.isdigit() or not last_location.Main.isdigit():
858
        raise _RawVariantError()
859

860
    first = int(first_location.Main)
861
    last = int(last_location.Main)
862

863
    return first, last
Gerben Stouten's avatar
TODO!:    
Gerben Stouten committed
864

865

866
def _coding_to_genomic(first_location, last_location, transcript, output):
867
868
869
870
871
872
873
874
875
    """
    Get genomic range from parsed c. location.

    @arg first_location: The start location (c.) of the variant.
    @type first_location: pyparsing.ParseResults
    @arg last_location: The start location (c.) of the variant.
    @type last_location: pyparsing.ParseResults
    @arg transcript: todo
    @type transcript: todo
876
877
    @arg output: The Output object.
    @type output: Modules.Output.Output
878
879
880
881
882

    @return: A tuple of:
        - first: Genomic start location represented by given location.
        - last:  Genomic end location represented by given location.
    @rtype: tuple(int, int)
883

884
885
886
887
888
    @raise _UnknownPositionError: Unknown positions were used.
    @raise _RawVariantError: Range cannot be interpreted.
    @raise _OffsetNotFromBoundary: An offset from a non-boundary position
                                    was used.
    @raise _OffsetSignError: Offset from exon boundary has the wrong sign.
889
    """
890
891
892
    if not first_location.Main or not last_location.Main:
        # Unknown positions are denoted by the '?' character.
        raise _UnknownPositionError()
893

894
    if not first_location.Main.isdigit() or not last_location.Main.isdigit():
895
        raise _RawVariantError()
896
897
898

    first_main = transcript.CM.main2int(first_location.MainSgn + \
                                        first_location.Main)
899
900
901
    first_main_genomic = transcript.CM.x2g(first_main, 0)
    first_offset = _get_offset(first_location, first_main_genomic,
                               transcript.CM.RNA, output)
902
903
904

    last_main = transcript.CM.main2int(last_location.MainSgn + \
                                       last_location.Main)
905
906
907
    last_main_genomic = transcript.CM.x2g(last_main, 0)
    last_offset = _get_offset(last_location, last_main_genomic,
                              transcript.CM.RNA, output)
908

909
    # These raise _RawVariantError exceptions on invalid positions.
910
911
912
913
914
    _check_intronic_position(first_main, first_offset, transcript)
    _check_intronic_position(last_main, last_offset, transcript)

    first = transcript.CM.x2g(first_main, first_offset)
    last = transcript.CM.x2g(last_main, last_offset)
915
916
917
918
919
920
921
922

    if transcript.CM.orientation == -1:
        first, last = last, first

    return first, last
#_coding_to_genomic


923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
def process_protein_variant(mutator, variant, record, output):
    """
    Process a protein variant.

    We raise _RawVariantError if there was something seriously in error
    with the raw variant (but it is still okay to process other raw
    variants). We might (don't at the moment) also raise _VariantError,
    meaning to stop processing the entire variant.

    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
    @arg variant: A parsed raw (simple, noncompound) variant.
    @type variant: pyparsing.ParseResults
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg output: The Output object.
    @type output: Modules.Output.Output

    @raise _RawVariantError: Cannot process this raw variant.
    @raise _VariantError: Cannot further process the entire variant.
    """
    variant, original_description = variant.RawVar, variant[-1]

    # List of amino acids.
    arguments = variant.Args

    if variant.MutationType == 'del':
        first = int(variant.Main)
        last = first
        mutator.deletion(first, last)
        record.name(first, last, 'del', '', '', (0, 0))

    # Todo: This is really unimplemented, the above is just a start to support
    #     protein level descriptions.


959
def process_raw_variant(mutator, variant, record, transcript, output):
960
    """
961
    Process a raw variant.
962

Vermaat's avatar
Vermaat committed
963
964
965
966
967
    We raise _RawVariantError if there was something seriously in error
    with the raw variant (but it is still okay to process other raw
    variants). We might (don't at the moment) also raise _VariantError,
    meaning to stop processing the entire variant.

Vermaat's avatar
Vermaat committed
968
969
    @arg mutator: A Mutator instance.
    @type mutator: mutalyzer.mutator.Mutator
970
971
972
973
974
975
976
977
    @arg variant: A parsed raw (simple, noncompound) variant.
    @type variant: pyparsing.ParseResults
    @arg record: A GenRecord object.
    @type record: Modules.GenRecord.GenRecord
    @arg transcript: A transcript object.
    @type transcript: Modules.GenRecord.Locus
    @arg output: The Output object.
    @type output: Modules.Output.Output
978

979
980
    @raise _RawVariantError: Cannot process this raw variant.
    @raise _VariantError: Cannot further process the entire variant.
981
    """
982
983
    variant, original_description = variant.RawVar, variant[-1]

Vermaat's avatar
Vermaat committed
984
985
986
987
988
989
990
    # {argument} may be a number, or a subsequence of the reference.
    # {sequence} is the variant subsequence.
    argument = variant.Arg1
    sequence = variant.Arg2

    # If we are on the reverse strand, subsequences must be in reverse
    # complement.
991
    if transcript and transcript.CM.orientation == -1:
Vermaat's avatar
Vermaat committed
992
993
994
995
996
997
        sequence = Bio.Seq.reverse_complement(sequence)
        if util.is_dna(argument):
            argument = Bio.Seq.reverse_complement(argument)

    # Get genomic first and last positions for this variant. Below we handle
    # the different ways of describing these positions.
998

999
    if variant.EXLoc:
Vermaat's avatar
Vermaat committed
1000
        # EX positioning.