mapping.py 40.4 KB
Newer Older
1
"""
2
Work with the mappings of transcripts to chromosomes.
3

4
5
6
7
8
9
Instances of the {Converter} class convert between transcript and chromosomal
locations, using the 'Mapping' table.

The {Updater} class is an abstract base class, subclassed by {NCBIUpdater}.
Instances of {NCBIUpdater} can load NCBI mapping information from a file and
update the database with this information.
10
"""
11

12

Vermaat's avatar
Vermaat committed
13
14
from __future__ import unicode_literals

Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
15
from collections import defaultdict
16
17
from itertools import groupby
from operator import attrgetter, itemgetter
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
18

19
import binning
20
21
import MySQLdb

22
from mutalyzer.db import session
23
from mutalyzer.db.models import Chromosome, TranscriptMapping
24
from mutalyzer.grammar import Grammar
25
26
from mutalyzer.models import SoapMessage, Mapping, Transcript
from mutalyzer.output import Output
27
from mutalyzer import Crossmap
28
from mutalyzer import Retriever
Vermaat's avatar
Vermaat committed
29
from mutalyzer import util
30
31
32
33


class MapviewSortError(Exception):
    pass
34
35


36
37
38
39
40
41
42
43
44
45
def _construct_change(var, reverse=False):
    """
    Construct mutation description.

    @arg var: RawVar object.
    @type var: pyparsing.ParseResults
    @var reverse: Variant is on the reverse strand.
    @type reverse: bool

    @return: Description of mutation (without reference and positions).
Vermaat's avatar
Vermaat committed
46
    @rtype: unicode
47
    """
Vermaat's avatar
Vermaat committed
48
49
    # Note that the pyparsing parse tree yields `str('')` for nonexisting
    # attributes, so we wrap the optional attributes in `unicode()`.
50
51
    if reverse:
        try:
Vermaat's avatar
Vermaat committed
52
            arg1 = unicode(int(var.Arg1))
53
        except ValueError:
Vermaat's avatar
Vermaat committed
54
            arg1 = util.reverse_complement(unicode(var.Arg1))
55
        try:
Vermaat's avatar
Vermaat committed
56
            arg2 = unicode(int(var.Arg2))
57
        except ValueError:
Vermaat's avatar
Vermaat committed
58
            arg2 = util.reverse_complement(unicode(var.Arg2))
59
    else:
Vermaat's avatar
Vermaat committed
60
61
        arg1 = unicode(var.Arg1)
        arg2 = unicode(var.Arg2)
62

63
64
65
66
67
    def parse_sequence(seq):
        if not seq.Sequence:
            raise NotImplementedError('Only explicit sequences are supported '
                                      'for insertions.')
        if reverse:
Vermaat's avatar
Vermaat committed
68
            return util.reverse_complement(seq.Sequence)
69
70
        return seq.Sequence

71
72
    if var.MutationType == 'subst':
        change = '%s>%s' % (arg1, arg2)
73
74
75
76
77
78
    elif var.MutationType in ('ins', 'delins'):
        if var.SeqList:
            if reverse:
                seqs = reversed(var.SeqList)
            else:
                seqs = var.SeqList
Vermaat's avatar
Vermaat committed
79
            insertion = '[' + ';'.join(parse_sequence(seq)
80
81
82
83
                                       for seq in seqs) + ']'
        else:
            insertion = parse_sequence(var.Seq)
        change = '%s%s' % (var.MutationType, insertion)
84
85
86
87
88
89
90
    else:
        change = '%s%s' % (var.MutationType, arg1 or arg2 or '')

    return change
#_construct_change


91
class Converter(object) :
92
    """
93
    Convert between transcript and chromosomal locations.
94

95
96
97
98
    Search for an NM number in the MySQL database, if the version number
    matches, get the start and end positions in a variant. Translate these
    positions to I{g.} notation if the variant is in I{c.} notation or vice
    versa.
99

100
101
102
103
104
105
106
107
108
    - If no end position is present, the start position is assumed to be the
      end position.
    - If the version number is not found in the database, an error message is
      generated and a suggestion for an other version is given.
    - If the reference sequence is not found at all, an error is returned.
    - If no variant is present, the transcription start and end and CDS end in
      I{c.} notation is returned.
    - If the variant is not accepted by the nomenclature parser, a parse error
      will be printed.
109

110
111
    @todo: Refactor anything using {mutalyzer.models} into the {webservice}
    module.
112
    """
113
    def __init__(self, assembly, O) :
114
115
        """
        Initialise the class.
Vermaat's avatar
Vermaat committed
116

117
        @arg assembly: the genome build version of the organism (e.g. hg19 for
118
        human genome build version 19)
119
        @type assembly: string
120
        @arg O: output object
Vermaat's avatar
Vermaat committed
121
        @type O: object
122
        """
123
        self.assembly = assembly
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
124
125
126
127
128
        self.__output = O

        # Populated arguments
        self.parseTree = None
        self.crossmap = None
129
        self.mapping = None
130
131
132
    #__init__

    def _reset(self) :
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
133
        self.crossmap = None
134
        self.mapping = None
135
136
137
138
139
    #_reset

    def _parseInput(self, variant) :
        """
        Parse a variant.
Vermaat's avatar
Vermaat committed
140

141
142
        @arg variant: variant description
        @type variant: string
Vermaat's avatar
Vermaat committed
143

144
145
146
        @return: parsetree object
        @rtype: object
        """
147
148
        grammar = Grammar(self.__output)
        parseTree = grammar.parse(variant)
149
        if not parseTree :
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
150
151
152
            self.__output.addMessage(__file__, 4, "EPARSE",
                    "Could not parse the given variant")
            return None
153
        #if
154
        if not (parseTree.RefSeqAcc or parseTree.LrgAcc):
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
155
            self.__output.addMessage(__file__, 4, "EONLYGB",
156
                "Currently we only support GenBank and LRG records")
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
157
            return None
158
        #if
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
159
160
161
162
        self.parseTree = parseTree
        return parseTree
    #_parseInput

163
    def _get_mapping(self, acc, version=None, selector=None, selector_version=None) :
164
        """
165
        Get data from database.
Vermaat's avatar
Vermaat committed
166

167
        @arg acc: NM_ accession number (without version)
Vermaat's avatar
Vermaat committed
168
        @type acc: unicode
169
170
        @arg version: version number
        @type version: integer
171
        @kwarg selector: Optional gene symbol selector.
Vermaat's avatar
Vermaat committed
172
        @type selector: unicode
173
174
        @kwarg selector_version: Optional transcript version selector.
        @type selector_version: int
175
        """
176
177
178
        versions = [m.version for m in TranscriptMapping.query.filter(
                      TranscriptMapping.accession == acc,
                      TranscriptMapping.chromosome.has(assembly=self.assembly))]
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
179

180
        if not versions:
Gerben Stouten's avatar
Gerben Stouten committed
181
            self.__output.addMessage(__file__, 4, "EACCNOTINDB",
182
183
                    "The accession number %s could not be "
                    "found in our database (or is not a transcript)." % acc)
Gerben Stouten's avatar
Gerben Stouten committed
184
            self.__output.addOutput("LOVDERR",
Gerben Stouten's avatar
FIXES    
Gerben Stouten committed
185
                    "Reference sequence not found.")
186
187
188
189
            return

        if version in versions:
            mappings = TranscriptMapping.query.join(Chromosome).filter(
190
191
                TranscriptMapping.accession == acc,
                TranscriptMapping.version == version,
192
193
                Chromosome.assembly == self.assembly)
            if selector:
194
                mappings = mappings.filter(TranscriptMapping.gene == selector)
195
            if selector_version:
196
                mappings = mappings.filter(TranscriptMapping.transcript == selector_version)
197
198
199
200
201
202
203

            # Todo: The 'order by chrom asc' is a quick hack to make sure we
            #   first get a primary assembly mapping instead of some haplotype
            #   mapping for genes in the HLA cluster.
            #   A better fix is to return the entire list of mappings, and/or
            #   remove all secondary mappings for the HLA cluster.
            #   See also test_converter.test_hla_cluster and bug #58.
204
            mapping = mappings.order_by(TranscriptMapping.version.desc(),
205
206
                                             Chromosome.name.asc()).first()

207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
            if not mapping:
                self.__output.addMessage(
                    __file__, 4, 'EACCNOTINDB',
                    'The accession number %s %swith %s%scould not be '
                    'found in our database.' %
                    (acc, 'version %s ' % version if version else '',
                     'gene %s ' % selector if selector else '',
                     'transcript %s ' % selector_version if selector_version else ''))
                return

            if mapping.select_transcript:
                if mapping.reference_type == 'refseq' and selector is None:
                    self.__output.addMessage(__file__, 4, 'EINVALIDGENE',
                                             'No gene specified.')
                    return
                if mapping.reference_type in ('refseq', 'lrg') and selector_version is None:
                    self.__output.addMessage(__file__, 4, 'ENOTRANSCRIPT',
                                             'No transcript specified.')
                    return

            self.mapping = mapping
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
            return

        if not version:
            self.__output.addMessage(__file__, 4, "ENOVERSION",
                "Version number missing for %s" % acc)
        else :
            self.__output.addMessage(__file__, 4, "EACCNOTINDB",
                "The accession number %s version %s "
                "could not be found in our database (or is not a transcript)." %
                (acc, version))
        self.__output.addMessage(__file__, 2, "WDIFFFOUND",
            "We found these versions: %s" %
            (", ".join("%s.%s" % (acc, i) for i in sorted(versions))))

        #LOVD list of versions available
        self.__output.addOutput("LOVDERR",
                "Reference sequence version not found. "
                "Available: %s" %
            (", ".join("%s.%s" % (acc, i) for i in sorted(versions))))

        #LOVD Only newest
        #self.__output.addOutput("LOVDERR",
        #        "Reference sequence version not found. "
        #        "Available: %s.%s" % (acc, sorted(versions)[-1]))
    #_get_mapping
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
253
254

    def makeCrossmap(self) :
Vermaat's avatar
Vermaat committed
255
        """
256
        Build the crossmapper.
Vermaat's avatar
Vermaat committed
257

258
259
260
261
262
        @todo: ADD Error Messages

        @return: Cross ; A Crossmap object
        @rtype: object
        """
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
263
        #TODO: ADD Error Messages
264
265
        if not self.mapping:
            return None
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
266

267
268
269
270
        # Create Mutalyzer compatible exon list.
        mrna = []
        for exon in zip(self.mapping.exon_starts, self.mapping.exon_stops):
            mrna.extend(exon)
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
271

272
273
        cds = self.mapping.cds or []
        orientation = 1 if self.mapping.orientation == 'forward' else -1
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
274

275
        self.crossmap = Crossmap.Crossmap(mrna, cds, orientation)
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
276
277
278
        return self.crossmap
    #makeCrossmap

279
280
281
282
283
284
285
286
287
288
289
    @staticmethod
    def _getcoords(C, Loc, Type) :
        """
        Return main, offset and g positions given either a position in
        I{c.} or in I{g.} notation.

        @arg C: A crossmapper
        @type C: object
        @arg Loc: A location in either I{g.} or I{c.} notation
        @type Loc: object
        @arg Type: The reference type
Vermaat's avatar
Vermaat committed
290
        @type Type: unicode
291
292
293
294
295
296
        @returns: triple:
            0. Main coordinate in I{c.} notation
            1. Offset coordinate in I{c.} notation
            2. Position in I{g.} notation
        @rtype: triple (integer, integer, integer)
        """
297
        if Type in 'cn' :
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
            if Loc.IVSLoc:
                ivs_number = int(Loc.IVSLoc.IVSNumber)
                if ivs_number < 1 or ivs_number > C.numberOfIntrons():
                    # Todo: Error handling in this entire module is 'suboptimal'
                    raise Exception('Invalid intron')
                if Loc.IVSLoc.OffSgn == '+':
                    g = C.getSpliceSite(ivs_number * 2 - 1) + \
                        C.orientation * int(Loc.IVSLoc.Offset)
                else:
                    g = C.getSpliceSite(ivs_number * 2) - \
                        C.orientation * int(Loc.IVSLoc.Offset)
                main, offset = C.g2x(g)
            else:
                main = C.main2int(Loc.PtLoc.MainSgn +  Loc.PtLoc.Main)
                offset = C.offset2int(Loc.PtLoc.OffSgn +  Loc.PtLoc.Offset)
                g = C.x2g(main, offset)
                main, offset = C.g2x(g)
        else:
            g = int(Loc.PtLoc.Main)
317
            main, offset = C.g2x(g)
318

319
320
321
        return (main, offset, g)
    #_getcoords

322
323
324
325
326
327
328
    def _coreMapping(self) :
        """
        Build the Mapping ClassSerializer.

        @return: Mapping ; A ClassSerializer object
        @rtype: object
        """
329

Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
330
        Cross = self.makeCrossmap()
Vermaat's avatar
Vermaat committed
331
        if not Cross :
332
            return None
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
333

334
335
        if self.parseTree.SingleAlleleVarSet:
            mutations = [v.RawVar for v in self.parseTree.SingleAlleleVarSet]
Laros's avatar
Laros committed
336
        elif self.parseTree.RawVar:
337
            mutations = [self.parseTree.RawVar]
Laros's avatar
Laros committed
338
339
340
341
342
        else:
            self.__output.addMessage(
                __file__, 4, 'ENOVARIANT',
               'Variant description contains no mutation.')
            return None
343

344
        mappings = []
345

346
        for mutation in mutations:
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
347

348
            if not mutation.StartLoc :
349
350
                self.__output.addMessage(__file__, 4, 'EUNKNOWN',
                                         'Variant type not supported.')
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
                return None

            # Get the coordinates of the start position
            startmain, startoffset, start_g = \
                    self._getcoords(Cross, mutation.StartLoc,
                                    self.parseTree.RefType)

            # If there is an end position, calculate the coordinates.
            if mutation.EndLoc :
                endmain, endoffset, end_g = \
                    self._getcoords(Cross, mutation.EndLoc,
                                    self.parseTree.RefType)
            else :
                end_g, endmain, endoffset = start_g, startmain, startoffset

            # Assign these values to the Mapping ClassSerializer
            V = Mapping()
            V.startmain     = startmain
            V.startoffset   = startoffset
            V.endmain       = endmain
            V.endoffset     = endoffset
            V.start_g       = start_g
            V.end_g         = end_g
            V.mutationType  = mutation.MutationType

            mappings.append(V)

        return mappings
379
    #_coreMapping
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
380

Laros's avatar
Laros committed
381
    def giveInfo(self, accNo) :
382
383
384
        """
        Returns transcription start, transcription end and CDS stop, if
        available.
Vermaat's avatar
Vermaat committed
385

386
        @arg accNo: transcript (NM_) accession number (with or without version)
Vermaat's avatar
Vermaat committed
387
        @type accNo: unicode
Vermaat's avatar
Vermaat committed
388

389
390
391
392
393
        @return: transcription start, transcription end and CDS stop
        @rtype: triple
        """

        if '.' not in accNo :
Gerben Stouten's avatar
Gerben Stouten committed
394
            acc, ver = accNo, None
395
        else :
Gerben Stouten's avatar
Gerben Stouten committed
396
            acc, ver = accNo.split('.')
397
398
            ver = int(ver)
        self._get_mapping(acc, ver)
Gerben Stouten's avatar
Gerben Stouten committed
399
        CM = self.makeCrossmap()
Vermaat's avatar
Vermaat committed
400
        if CM :
401
402
            return CM.info()
    #giveInfo
Gerben Stouten's avatar
Gerben Stouten committed
403

404
    def mainTranscript(self, accNo) :
Gerben Stouten's avatar
Gerben Stouten committed
405
        """
406
        One of the entry points (called by the HTML publisher).
Gerben Stouten's avatar
Gerben Stouten committed
407

408
        @arg accNo: The full NM accession number (including version)
Vermaat's avatar
Vermaat committed
409
        @type accNo: unicode
Gerben Stouten's avatar
Gerben Stouten committed
410

411
412
413
        @return: T ; ClassSerializer object with the types trans_start,
        trans_stop and CDS_stop
        @rtype: object
Gerben Stouten's avatar
Gerben Stouten committed
414
415

        """
416

Gerben Stouten's avatar
Gerben Stouten committed
417
418
419
        # Initiate ClassSerializer object
        info = self.giveInfo(accNo)
        T = Transcript()
420
        if info :
Gerben Stouten's avatar
Gerben Stouten committed
421
422
423
424
425
            T.trans_start = info[0]
            T.trans_stop  = info[1]
            T.CDS_stop    = info[2]
        return T
    #mainTranscript
Laros's avatar
Laros committed
426

Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
427
428
    def mainMapping(self, accNo, mutation) :
        """
429
        One of the entry points (called by the HTML publisher).
Vermaat's avatar
Vermaat committed
430

431
        @arg accNo: transcript (NM_) accession number (with version?)
Vermaat's avatar
Vermaat committed
432
        @type accNo: unicode
433
        @arg mutation: the 'mutation' (e.g. c.123C>T)
Vermaat's avatar
Vermaat committed
434
        @type mutation: unicode
Vermaat's avatar
Vermaat committed
435

436
437
        @return: ClassSerializer object
        @rtype: object
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
438
439
        """
        variant = "%s:%s" % (accNo, mutation)
440
        if self._parseInput(variant) :
441
            acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc
442
443
444
445
446
            try:
                version = int(self.parseTree.Version)
            except ValueError:
                version = None
            self._get_mapping(acc, version)
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
447

448
        mappings = self._coreMapping()
449
450
451
452
453
454
455

        errors = []
        for message in self.__output.getMessages():
            soap_message = SoapMessage()
            soap_message.errorcode = message.code
            soap_message.message = message.description
            errors.append(soap_message)
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
456

457
        mapping = Mapping()
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
458

459
        mapping.messages = errors
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
460

461
462
463
464
465
466
467
468
469
470
471
472
473
        if not mappings:         # Something went wrong
            mapping.errorcode = len(errors)
            return mapping

        mapping.errorcode = 0

        if len(mappings) == 1:
            return mappings[0]

        mapping.mutationType = 'compound'

        # Now we have to do some tricks to combine the information from
        # possibly multiple variants into one Mapping object.
474
        # Todo: Maybe it is better to use self.mapping.orientation here.
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
        min_g = 9000000000
        max_g = 0
        forward = True
        for m in mappings:
            if m.start_g > m.end_g:
                forward = False
            if m.start_g < min_g:
                min_g = m.start_g
                min_main = m.startmain
                min_offset = m.startoffset
            if m.end_g < min_g:
                min_g = m.end_g
                min_main = m.endmain
                min_offset = m.endoffset
            if m.start_g > max_g:
                max_g = m.start_g
                max_main = m.startmain
                max_offset = m.startoffset
            if m.end_g > max_g:
                max_g = m.end_g
                max_main = m.endmain
                max_offset = m.endoffset

        if forward:
            mapping.startmain = min_main
            mapping.startoffset = min_offset
            mapping.endmain = max_main
            mapping.endoffset = max_offset
            mapping.start_g = min_g
            mapping.end_g = max_g
        else:
            mapping.startmain = max_main
            mapping.startoffset = max_offset
            mapping.endmain = min_main
            mapping.endoffset = min_offset
            mapping.start_g = max_g
            mapping.end_g = min_g

Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
513
514
515
516
517
        return mapping
    #main_Mapping

    def c2chrom(self, variant) :
        """
518
        Converts a complete HGVS I{c.} notation into a chromosomal notation.
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
519

520
        @arg variant: The variant in HGVS I{c.} notation
Vermaat's avatar
Vermaat committed
521
        @type variant: unicode
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
522

523
        @return: var_in_g ; The variant in HGVS I{g.} notation
Vermaat's avatar
Vermaat committed
524
        @rtype: unicode
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
525
        """
526
        if self._parseInput(variant):
527
            acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc
528
529
530
531
            try:
                version = int(self.parseTree.Version)
            except ValueError:
                version = None
532
            selector = selector_version = None
533
534
            if self.parseTree.Gene:
                selector = self.parseTree.Gene.GeneSymbol
535
536
                if self.parseTree.Gene.TransVar:
                    selector_version = int(self.parseTree.Gene.TransVar)
537
538
            elif self.parseTree.LRGTranscriptID:
                selector_version = int(self.parseTree.LRGTranscriptID)
539
            self._get_mapping(acc, version, selector, selector_version)
540
541
542

        mappings = self._coreMapping()
        if not mappings:
543
            return None
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
544

545
546
547
548
549
550
551
552
        if self.parseTree.SingleAlleleVarSet:
            variants = [v.RawVar for v in self.parseTree.SingleAlleleVarSet]
        else:
            variants = [self.parseTree.RawVar]

        # Construct the variant descriptions
        descriptions = []
        for variant, mapping in zip(variants, mappings):
553
554
555
556
557
            try:
                f_change = _construct_change(variant)
                r_change = _construct_change(variant, reverse=True)
            except NotImplementedError as e:
                self.__output.addMessage(__file__, 3, 'ENOTIMPLEMENTED',
Vermaat's avatar
Vermaat committed
558
                                         unicode(e))
559
560
                return None

561
            if self.mapping.orientation == 'forward':
562
563
564
565
566
                change = f_change
            else :
                change = r_change

            if mapping.start_g != mapping.end_g:
567
                if self.mapping.orientation == 'reverse':
568
569
570
571
572
573
574
575
576
577
578
                    last_g, first_g = mapping.start_g, mapping.end_g
                else:
                    first_g, last_g = mapping.start_g, mapping.end_g
                if last_g < first_g:
                    self.__output.addMessage(__file__, 3, 'ERANGE', 'End position '
                                             'is smaller than the begin position.')
                    return None
                descriptions.append('%s_%s%s' % (first_g, last_g, change))
            #if
            else :
                descriptions.append('%s%s' % (mapping.start_g, change))
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
579

580
581
582
583
584
        if len(descriptions) == 1:
            description = descriptions[0]
        else:
            description = '[' + ';'.join(descriptions) + ']'

585
        if self.mapping.chromosome.organelle == 'mitochondrion':
586
            return "%s:m.%s" % (self.mapping.chromosome.accession, description)
587
        else:
588
            return "%s:g.%s" % (self.mapping.chromosome.accession, description)
589
590
    #c2chrom

591
    def chromosomal_positions(self, positions, reference, version=None):
Vermaat's avatar
Vermaat committed
592
        """
593
        Convert c. positions to chromosomal positions.
594

Vermaat's avatar
Vermaat committed
595
596
597
        @arg positions: Positions in c. notation to convert.
        @type positions: list
        @arg reference: Transcript reference.
Vermaat's avatar
Vermaat committed
598
        @type reference: unicode
Vermaat's avatar
Vermaat committed
599
600
        @kwarg version: Transcript reference version. If omitted, '0' is
            assumed.
Vermaat's avatar
Vermaat committed
601
        @type version: unicode
Vermaat's avatar
Vermaat committed
602

Vermaat's avatar
Vermaat committed
603
604
        @return: Chromosome name, orientation (+ or -), and converted
            positions.
Vermaat's avatar
Vermaat committed
605
        @rtype: tuple(unicode, unicode, list)
Vermaat's avatar
Vermaat committed
606

607
        This only works for positions on transcript references in c. notation.
Vermaat's avatar
Vermaat committed
608
        """
609
610
611
612
        versions = [m.version for m in TranscriptMapping.query.filter(
                      TranscriptMapping.accession == reference,
                      TranscriptMapping.version != None,
                      TranscriptMapping.chromosome.has(assembly=self.assembly))]
613
614
615

        if version not in versions:
            return None
Vermaat's avatar
Vermaat committed
616

617
618
619
620
621
622
623
624
625
626
        self.mapping = TranscriptMapping.query \
            .join(Chromosome) \
            .filter(TranscriptMapping.accession == reference,
                    TranscriptMapping.version == version,
                    Chromosome.assembly == self.assembly) \
            .order_by(TranscriptMapping.version.desc(),
                      Chromosome.name.asc()).first()

        if not self.mapping:
            return
Vermaat's avatar
Vermaat committed
627
628
629
630
631

        mapper = self.makeCrossmap()
        if not mapper:
            return None

632
        chromosomal_positions = []
Vermaat's avatar
Vermaat committed
633
634
635
636

        for position in positions:
            main = mapper.main2int(position.MainSgn +  position.Main)
            offset = mapper.offset2int(position.OffSgn +  position.Offset)
637
            chromosomal_positions.append(mapper.x2g(main, offset))
Vermaat's avatar
Vermaat committed
638

639
640
641
        orientation = '+' if self.mapping.orientation == 'forward' else '-'

        return self.mapping.chromosome.name, orientation, chromosomal_positions
642
    #chromosomal_positions
Vermaat's avatar
Vermaat committed
643

644
645
646
    def correctChrVariant(self, variant) :
        """
        @arg variant:
Vermaat's avatar
Vermaat committed
647
        @type variant: unicode
Vermaat's avatar
Vermaat committed
648
649

        @return: variant ;
Vermaat's avatar
Vermaat committed
650
        @rtype: unicode
651
        """
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
652

653
        #Pre split check
654
        if ':' not in variant :
655
656
657
658
            self.__output.addMessage(__file__, 4, "EPARSE",
                "The variant needs a colon")
            return None

Gerben Stouten's avatar
Gerben Stouten committed
659
660
        #Remove whitespace
        variant = variant.replace(" ","")
661

662
663
        if variant.startswith('chr') and ':' in variant:
            preco, postco = variant.split(':', 1)
664
665
666
667

            chromosome = Chromosome.query.filter_by(assembly=self.assembly,
                                                    name=preco).first()
            if not chromosome:
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
668
                self.__output.addMessage(__file__, 4, "ENOTINDB",
Mihai's avatar
Mihai committed
669
670
                    "Accession number %s could not be found in our database "
                    "or is not suitable for the requested conversion." %
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
671
672
                    preco)
                return None
673
674

            variant = "%s:%s" % (chromosome.accession, postco)
675
        #if
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
676
        return variant
677
    #correctChrVariant
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
678

679
    def chrom2c(self, variant, rt, gene=None):
680
681
        """
        @arg variant: a variant description
Vermaat's avatar
Vermaat committed
682
        @type variant: unicode
683
        @arg rt: the return type
Vermaat's avatar
Vermaat committed
684
        @type rt: unicode
685
686
        @kwarg gene: Optional gene name. If given, return variant descriptions
            on all transcripts for this gene.
Vermaat's avatar
Vermaat committed
687
        @type gene: unicode
Vermaat's avatar
Vermaat committed
688
689

        @return: HGVS_notatations ;
690
691
692
693
        @rtype: dictionary or list
        """

        if not self._parseInput(variant) :
694
            return None
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
695

696
        acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
697
        version = self.parseTree.Version
698
699
700
701
702

        chromosome = Chromosome.query \
            .filter_by(assembly=self.assembly,
                       accession='%s.%s' % (acc, version)).first()
        if not chromosome :
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
703
            self.__output.addMessage(__file__, 4, "ENOTINDB",
Mihai's avatar
Mihai committed
704
705
                "Accession number %s could not be found in our database or is "
                "not suitable for the requested conversion." %
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
706
707
                acc)
            return None
Mihai's avatar
Mihai committed
708

709
        #if
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
710

711
712
713
714
715
716
717
718
719
720
721
722
723
724
        if self.parseTree.SingleAlleleVarSet:
            variants = [v.RawVar for v in self.parseTree.SingleAlleleVarSet]
        else:
            variants = [self.parseTree.RawVar]

        min_loc = 9000000000
        max_loc = 0
        for variant in variants:
            #FIXME This should be a proper conversion.
            loc = int(variant.StartLoc.PtLoc.Main)
            if variant.EndLoc :
                loc2 = int(variant.EndLoc.PtLoc.Main)
            else :
                loc2 = loc
725

726
727
728
729
730
731
732
            if loc2 < loc:
                self.__output.addMessage(__file__, 3, 'ERANGE', 'End position is '
                                         'smaller than the begin position.')
                return None

            min_loc = min(min_loc, loc)
            max_loc = max(max_loc, loc2)
733

734
        if gene:
735
            mappings = chromosome.transcript_mappings.filter_by(gene=gene)
736
        else:
737
738
739
            start = max(min_loc - 5000, 1)
            stop = min(max_loc + 5000, binning.MAX_POSITION + 1)
            bins = binning.overlapping_bins(start - 1, stop)
740
            mappings = chromosome.transcript_mappings.filter(
741
742
743
744
745
746
747
748
749
750
                TranscriptMapping.bin.in_(bins),
                TranscriptMapping.start <= stop,
                TranscriptMapping.stop >= start
            ).order_by(
                TranscriptMapping.start,
                TranscriptMapping.stop,
                TranscriptMapping.gene,
                TranscriptMapping.accession,
                TranscriptMapping.version,
                TranscriptMapping.transcript)
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
751
752

        HGVS_notatations = defaultdict(list)
753
        NM_list = []
754
        for mapping in mappings:
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
755
            self._reset()
756
757
758
            self.mapping = mapping
            core_mapping = self._coreMapping()
            if not core_mapping:
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
759
760
                #balen
                continue
761
            reference = self.mapping.reference
762
763
            geneName = self.mapping.gene
            strand = self.mapping.orientation == 'forward'
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
764

765
766
767
768
769
770
771
772
773
774
775
            # Check if n or c type
            # Note: Originally, the below check using crossmap.info() was
            #     used (commented out now), but I do not understand this
            #     logic. Also, it breaks n. notation on non-coding mtDNA
            #     transcripts, so I replaced it with a simple .CDS check.
            #info = self.crossmap.info()
            #if info[0] == 1 and info[1] == info[2] :
            #    mtype = 'n'
            #else :
            #    mtype = 'c'
            if self.crossmap.CDS:
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
776
                mtype = 'c'
777
778
            else:
                mtype = 'n'
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
779

780
            mutations = []
781
            for variant, cmap in zip(variants, core_mapping):
782
783
784
785
786
                try:
                    f_change = _construct_change(variant)
                    r_change = _construct_change(variant, reverse=True)
                except NotImplementedError as e:
                    self.__output.addMessage(__file__, 4,
Vermaat's avatar
Vermaat committed
787
                                             "ENOTIMPLEMENTEDERROR", unicode(e))
788
                    return None
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
789

790
791
                startp = self.crossmap.tuple2string((cmap.startmain, cmap.startoffset))
                endp = self.crossmap.tuple2string((cmap.endmain, cmap.endoffset))
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
792

793
794
795
796
797
                if strand :
                    change = f_change
                else :
                    change = r_change
                    startp, endp = endp, startp
798

799
                if cmap.start_g != cmap.end_g :
800
801
802
                    loca = "%s_%s" % (startp, endp)
                else :
                    loca = "%s" % startp
Vermaat's avatar
Vermaat committed
803

804
                mutations.append('%s%s' % (loca, change))
805

806
807
808
809
            if len(mutations) == 1:
                mutation = mutations[0]
            else:
                mutation = '[' + ';'.join(mutations) + ']'
Gerben Stouten's avatar
TODO:    
Gerben Stouten committed
810

811
            description = "%s:%c.%s" % (reference, mtype, mutation)
812
813
814
815
816
817
818
            HGVS_notatations[geneName].append(description)
            NM_list.append(description)
        #for
        if rt == "list" :
            return NM_list
        return HGVS_notatations
    #chrom2c
819
#Converter
820
821
822
823
824
825
826
827


def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.
    """
    connection = MySQLdb.connect(user='genome',
                                 host='genome-mysql.cse.ucsc.edu',
828
829
830
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)
831
832
833
834
835
836
837
838
839
840
841

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
842
    parameters = gene,
843
844
845
846
847
848

    cursor = connection.cursor()
    cursor.execute(query, parameters)
    result = cursor.fetchall()
    cursor.close()

849
850
851
    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()


def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    .. todo: Also report how much was added/updated.

    .. note: Currently no exon locations are supported, this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

886
887
888
    if record.molType != 'm':
        raise ValueError('Only mitochondial references are supported')

889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now, it is always present and the same
        # for our purposes.
        #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1]
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference', cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()


def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

Vermaat's avatar
Vermaat committed
928
        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958

    Raises :exc:`ValueError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split in UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these ones should of
    course not be merged.

    Our strategy is too sort by gene and chromosome and process the file
    grouped by these two fields.
959
960
961

    For transcripts without any UTR and CDS entries (seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.
962
963
964

    All positions are one-based, inclusive, and that is what we also use in
    our database.
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(c for c in chromosomes if
                                            c.name == 'chr' + record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This is
        # generalized to a list of records for each type, but we expect only
        # one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if it
        # is split over different reference contigs.
For faster browsing, not all history is shown. View entire blame