diff --git a/migrations/versions/56ddeb75114e_add_ebi_to_mappings_source_enum.py b/migrations/versions/56ddeb75114e_add_ebi_to_mappings_source_enum.py new file mode 100644 index 0000000000000000000000000000000000000000..7556cedd329cb0517ebfad71ce6a09501b978477 --- /dev/null +++ b/migrations/versions/56ddeb75114e_add_ebi_to_mappings_source_enum.py @@ -0,0 +1,64 @@ +"""Add EBI to mappings source enum + +Revision ID: 56ddeb75114e +Revises: 1ed411f9fdfa +Create Date: 2016-02-11 04:21:40.658981 + +""" + +from __future__ import unicode_literals + +# revision identifiers, used by Alembic. +revision = '56ddeb75114e' +down_revision = u'1ed411f9fdfa' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + context = op.get_context() + + if context.bind.dialect.name == 'postgresql': + # In PostgreSQL < 9.1 there was no ALTER TYPE for enums, so it would + # have been something like: + # + # ALTER TABLE foo ALTER COLUMN bar TYPE new_type USING bar::text::new_type; + # + # However, all my installations are PostgreSQL >= 9.1 and I think the + # USING syntax is PostgreSQL-specific, so let's ignore that. It would + # also come with all the hassle of moving old column values into the + # new column. + if context.bind.dialect.server_version_info >= (9, 3): + op.execute('COMMIT') + op.execute("ALTER TYPE source ADD VALUE IF NOT EXISTS 'ebi'") + return + if context.bind.dialect.server_version_info >= (9, 1): + op.execute('COMMIT') + op.execute("ALTER TYPE source ADD VALUE 'ebi'") + return + + elif context.bind.dialect.name == 'sqlite': + # SQLite doesn't support altering columns, so we have to wrap this in + # a batch operation. + with op.batch_alter_table('transcript_mappings') as batch_op: + batch_op.alter_column( + 'source', nullable=False, + type_=sa.Enum('ucsc', 'ncbi', 'ebi', 'reference', name='source') + ) + return + + elif context.bind.dialect.name == 'mysql': + # In MySQL we can simply alter the column. + op.alter_column( + 'transcript_mappings', 'source', nullable=False, + type_=sa.Enum('ucsc', 'ncbi', 'ebi', 'reference', name='source'), + existing_type=sa.Enum('ucsc', 'ncbi', 'reference', name='source') + ) + return + + raise Exception('Sorry, only PostgreSQL >= 9.1, SQLite, and MySQL are supported by this migration') + + +def downgrade(): + raise Exception('Downgrade not supported by this migration') diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index b6e397e34bd543e02f729243edfa17a493a79b07..73a99c48994c3240b13875057bef2f113b823236 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -375,11 +375,11 @@ class TranscriptMapping(db.Base): #: If `False`, variant descriptions can use just the accession number #: without gene and transcript selector (e.g., ``NM_000020:c.1del``, #: ``LRG_1:c.1del``). If `True`, gene and transcript selection is - #: necessary (e.g., ``NC_012920(TRNI_v001):c.1del``, ``LRG_1_t1:c.1del``). + #: necessary (e.g., ``NC_012920(TRNI_v001):c.1del``, ``LRG_1t1:c.1del``). select_transcript = Column(Boolean, nullable=False) #: Source of the mapping. - source = Column(Enum('ucsc', 'ncbi', 'reference', name='source'), + source = Column(Enum('ucsc', 'ncbi', 'ebi', 'reference', name='source'), nullable=False) #: The :class:`Assembly` this chromosome is in. diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py index 77a4e948d9cec2d87a6187f2ff7ec81a710de04d..ae4e071bd795c26a5ba0d5bb3d986d77df501849 100644 --- a/mutalyzer/entrypoints/admin.py +++ b/mutalyzer/entrypoints/admin.py @@ -113,6 +113,20 @@ def import_mapview(assembly_name_or_alias, mapview_file, encoding, raise UserError(unicode(e)) +def import_lrgmap(assembly_name_or_alias, lrgmap_file, encoding): + """ + Import transcript mappings from an EBI LRG transcripts map file. + """ + lrgmap_file = codecs.getreader(encoding)(lrgmap_file) + + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: + raise UserError('Not a valid assembly: %s' % assembly_name_or_alias) + + mapping.import_from_lrgmap_file(assembly, lrgmap_file) + + def import_gene(assembly_name_or_alias, gene): """ Import transcript mappings for a gene from the UCSC database. @@ -313,6 +327,22 @@ def main(): help='use only entries with this group label (example: ' 'GRCh37.p2-Primary Assembly)') + # Subparser 'assemblies import-lrgmap'. + p = s.add_parser( + 'import-lrgmap', + help='import mappings from EBI LRG transcripts map file', + parents=[assembly_parser], + description=import_lrgmap.__doc__.split('\n\n')[0]) + p.set_defaults(func=import_lrgmap) + p.add_argument( + 'lrgmap_file', metavar='FILE', type=argparse.FileType('rb'), + help='EBI LRG transcript map file (example: ' + 'list_LRGs_transcripts_GRCh37.txt)') + p.add_argument( + '--encoding', metavar='ENCODING', type=_cli_string, + default=default_encoding, + help='input file encoding (default: %s)' % default_encoding) + # Subparser 'assemblies import-gene'. p = s.add_parser( 'import-gene', help='import mappings by gene from UCSC database', diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index b9a55afefdec70ecddbb1dd98eede58420ec95f1..d71f3aeaec8e31e989413872fb3f2e83f335bd62 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -151,9 +151,9 @@ class Converter(object) : "Could not parse the given variant") return None #if - if not parseTree.RefSeqAcc: #In case of LRG for example + if not (parseTree.RefSeqAcc or parseTree.LrgAcc): self.__output.addMessage(__file__, 4, "EONLYGB", - "Currently we only support GenBank Records") + "Currently we only support GenBank and LRG records") return None #if self.parseTree = parseTree @@ -175,7 +175,6 @@ class Converter(object) : """ versions = [m.version for m in TranscriptMapping.query.filter( TranscriptMapping.accession == acc, - TranscriptMapping.version != None, TranscriptMapping.chromosome.has(assembly=self.assembly))] if not versions: @@ -207,10 +206,10 @@ class Converter(object) : if not self.mapping: self.__output.addMessage(__file__, 4, "EACCNOTINDB", - "The accession number %s version %s " + "The accession number %s %s" "with transcript %s version %s could not be found " "in our database." % - (acc, version, selector, selector_version)) + (acc, 'version %s ' % version if version else '', selector, selector_version)) return if not version: @@ -417,7 +416,7 @@ class Converter(object) : """ variant = "%s:%s" % (accNo, mutation) if self._parseInput(variant) : - acc = self.parseTree.RefSeqAcc + acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc try: version = int(self.parseTree.Version) except ValueError: @@ -503,7 +502,7 @@ class Converter(object) : @rtype: unicode """ if self._parseInput(variant): - acc = self.parseTree.RefSeqAcc + acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc try: version = int(self.parseTree.Version) except ValueError: @@ -511,6 +510,10 @@ class Converter(object) : if self.parseTree.Gene: selector = self.parseTree.Gene.GeneSymbol selector_version = int(self.parseTree.Gene.TransVar or 1) + elif self.parseTree.LRGTranscriptID: + selector = None + selector_version = int(self.parseTree.LRGTranscriptID) + pass else: selector = selector_version = None self._get_mapping(acc, version, selector, selector_version) @@ -669,7 +672,7 @@ class Converter(object) : if not self._parseInput(variant) : return None - acc = self.parseTree.RefSeqAcc + acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc version = self.parseTree.Version chromosome = Chromosome.query \ @@ -733,9 +736,14 @@ class Converter(object) : #balen continue # construct the variant description - accNo = "%s.%s" % (self.mapping.accession, self.mapping.version) + if self.mapping.reference_type == 'lrg': + accNo = self.mapping.accession + else: + accNo = "%s.%s" % (self.mapping.accession, self.mapping.version) if self.mapping.select_transcript: - if self.mapping.transcript: + if self.mapping.reference_type == 'lrg': + selector = 't%d' % self.mapping.transcript + elif self.mapping.transcript: selector = '(%s_v%.3i)' % (self.mapping.gene, self.mapping.transcript) else: selector = '(%s)' % self.mapping.gene @@ -1045,3 +1053,74 @@ def import_from_mapview_file(assembly, mapview_file, group_label): session.add(mapping) session.commit() + + +def import_from_lrgmap_file(assembly, lrgmap_file): + """ + Import transcript mappings from an EBI LRG transcripts map file. + + All positions are one-based, inclusive, and that is what we also use in + our database. + """ + columns = ['transcript', 'gene', 'chromosome', 'strand', 'start', 'stop', + 'exons', 'protein', 'cds_start', 'cds_stop'] + + chromosomes = assembly.chromosomes.all() + + def read_mappings(lrgmap_file): + for line in lrgmap_file: + if line.startswith('#'): + continue + record = dict(zip(columns, line.rstrip('\r\n').split('\t'))) + + record['start'] = int(record['start']) + record['stop'] = int(record['stop']) + try: + record['cds_start'] = int(record['cds_start']) + except ValueError: + record['cds_start'] = None + try: + record['cds_stop'] = int(record['cds_stop']) + except ValueError: + record['cds_stop'] = None + record['exons'] = [[int(pos) for pos in exon.split('-')] + for exon in record['exons'].split(',')] + + try: + yield build_mapping(record) + except ValueError: + pass + + def build_mapping(record): + # Only use records on chromosomes we know. + try: + chromosome = next(c for c in chromosomes if + c.name == 'chr' + record['chromosome']) + except StopIteration: + raise ValueError() + + accession, transcript = record['transcript'].split('t') + transcript = int(transcript) + + orientation = 'reverse' if record['strand'] == '-1' else 'forward' + + if record['cds_start']: + cds = record['cds_start'], record['cds_stop'] + else: + cds = None + + # TODO: Also take protein into account. For example, in LRG_32 (TP53) + # some transcripts occur twice (with different CDSs and different + # protein numbers). + # https://humgenprojects.lumc.nl/trac/mutalyzer/ticket/150 + return TranscriptMapping.create_or_update( + chromosome, 'lrg', accession, record['gene'], orientation, + record['start'], record['stop'], + [start for start, _ in record['exons']], + [stop for _, stop in record['exons']], + 'ebi', transcript=transcript, cds=cds, select_transcript=True) + + for mapping in read_mappings(lrgmap_file): + session.add(mapping) + + session.commit() diff --git a/mutalyzer/models.py b/mutalyzer/models.py index 7ebade7f8fa429b92845cac6f3f13a784474bb69..133fc6d84662e665bb61a217280fe80ff22313ee 100644 --- a/mutalyzer/models.py +++ b/mutalyzer/models.py @@ -310,6 +310,7 @@ class TranscriptMappingInfo(ComplexModel): """ __namespace__ = SOAP_NAMESPACE + transcript = Mandatory.Unicode name = Mandatory.Unicode version = Mandatory.Integer gene = Mandatory.Unicode diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 9cf89b9c979e558241f0f90640a7e8f9cbe5e3a4..93ca06898087fb1273e2cfc6aeb506fe84190187 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -264,11 +264,24 @@ class MutalyzerService(ServiceBase): "Finished processing getTranscripts(%s %s %s %s)" % (build, chrom, pos, versions)) - #filter out the accNo - if versions: - return ['%s.%s' % (m.accession, m.version) for m in mappings] - else: - return [m.accession for m in mappings] + transcripts = [] + for mapping in mappings: + if versions and mapping.version: + accession = '%s.%i' % (mapping.accession, mapping.version) + else: + accession = mapping.accession + if mapping.select_transcript: + if mapping.reference_type == 'lrg': + selector = 't%d' % mapping.transcript + elif mapping.transcript: + selector = '(%s_v%.3i)' % (mapping.gene, mapping.transcript) + else: + selector = '(%s)' % mapping.gene + else: + selector = '' + transcripts.append('%s%s' % (accession, selector)) + + return transcripts #getTranscripts @srpc(Mandatory.Unicode, Mandatory.Unicode, _returns=Array(Mandatory.Unicode)) @@ -296,12 +309,30 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Finished processing getTranscriptsByGene(%s %s)" % (build, name)) - return ['%s.%s' % (m.accession, m.version) for m in mappings] - #getTranscriptsByGene + transcripts = [] + for mapping in mappings: + if mapping.version: + accession = '%s.%i' % (mapping.accession, mapping.version) + else: + accession = mapping.accession + if mapping.select_transcript: + if mapping.reference_type == 'lrg': + selector = 't%d' % mapping.transcript + elif mapping.transcript: + selector = '(%s_v%.3i)' % (mapping.gene, mapping.transcript) + else: + selector = '(%s)' % mapping.gene + else: + selector = '' + transcripts.append('%s%s' % (accession, selector)) + + return transcripts + #getTranscriptsByGeneName @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, - Mandatory.Integer, Mandatory.Integer, _returns=Array(Mandatory.Unicode)) - def getTranscriptsRange(build, chrom, pos1, pos2, method) : + Mandatory.Integer, Mandatory.Integer, Boolean, + _returns=Array(Mandatory.Unicode)) + def getTranscriptsRange(build, chrom, pos1, pos2, method, versions=False): """ Get all the transcripts that overlap with a range on a chromosome. @@ -319,6 +350,8 @@ class MutalyzerService(ServiceBase): - 0 ; Return only the transcripts that completely fall in the range [pos1, pos2]. - 1 ; Return all hit transcripts. + @kwarg versions: If set to True, also include transcript versions. + @type versions: bool @return: A list of transcripts. @rtype: list @@ -381,7 +414,24 @@ class MutalyzerService(ServiceBase): "Finished processing getTranscriptsRange(%s %s %s %s %s)" % ( build, chrom, pos1, pos2, method)) - return [m.accession for m in mappings] + transcripts = [] + for mapping in mappings: + if versions and mapping.version: + accession = '%s.%i' % (mapping.accession, mapping.version) + else: + accession = mapping.accession + if mapping.select_transcript: + if mapping.reference_type == 'lrg': + selector = 't%d' % mapping.transcript + elif mapping.transcript: + selector = '(%s_v%.3i)' % (mapping.gene, mapping.transcript) + else: + selector = '(%s)' % mapping.gene + else: + selector = '' + transcripts.append('%s%s' % (accession, selector)) + + return transcripts #getTranscriptsRange @srpc(Mandatory.Unicode, Mandatory.Unicode, Mandatory.Integer, @@ -408,6 +458,7 @@ class MutalyzerService(ServiceBase): - 1 ; Return all hit transcripts. @return: Array of TranscriptMappingInfo objects with fields: + - transcript - name - version - gene @@ -476,6 +527,22 @@ class MutalyzerService(ServiceBase): for mapping in mappings: t = TranscriptMappingInfo() + + if mapping.version: + accession = '%s.%i' % (mapping.accession, mapping.version) + else: + accession = mapping.accession + if mapping.select_transcript: + if mapping.reference_type == 'lrg': + selector = 't%d' % mapping.transcript + elif mapping.transcript: + selector = '(%s_v%.3i)' % (mapping.gene, mapping.transcript) + else: + selector = '(%s)' % mapping.gene + else: + selector = '' + t.transcript = '%s%s' % (accession, selector) + t.name = mapping.accession t.version = mapping.version t.gene = mapping.gene @@ -558,7 +625,7 @@ class MutalyzerService(ServiceBase): @type LOVD_ver: string @arg build: The genome build (hg19, hg18, mm10). @type build: string - @arg accNo: The NM accession number and version. + @arg accNo: The NM accession number and version or LRG. @type accNo: string @arg variant: The variant. @type variant: string @@ -614,7 +681,7 @@ class MutalyzerService(ServiceBase): @type LOVD_ver: string @arg build: The genome build (hg19, hg18, mm10). @type build: string - @arg accNo: The NM accession number and version. + @arg accNo: The NM accession number and version or LRG. @type accNo: string @return: Complex object: @@ -729,7 +796,7 @@ class MutalyzerService(ServiceBase): @arg build: The genome build (hg19, hg18, mm10). @type build: string - @arg acc: The NM accession number (version NOT included). + @arg acc: The NM accession number (version NOT included) or LRG. @type acc: string @return: The name of a chromosome. @@ -768,7 +835,7 @@ class MutalyzerService(ServiceBase): @arg build: The genome build (hg19, hg18, mm10). @type build: string @arg variant: The variant in either I{c.} or I{g.} notation, full HGVS - notation, including NM_ or NC_ accession number. + notation, including NM_, NC_, or LRG_ accession number. @type variant: string @kwarg gene: Optional gene name. If given, return variant descriptions on all transcripts for this gene. diff --git a/mutalyzer/website/templates/position-converter.html b/mutalyzer/website/templates/position-converter.html index 0db57523fd018b27136e7e3cf98ec9217e1cbcc5..db9c6c1ae4beae009b26796ace56e21e6211b6dc 100644 --- a/mutalyzer/website/templates/position-converter.html +++ b/mutalyzer/website/templates/position-converter.html @@ -29,8 +29,16 @@ normalize it to HGVS. Use the <a href="{{ url_for('.name_checker') }}">Name Chec <label for="description">Variant description</label> <input type="text" name="description" id="description" value="{{ description }}" class="form-control form-pre" placeholder="Variant description using HGVS format"> - <p>Examples: <code class="example-input" - data-for="description">NM_003002.3:c.274G>T</code>, <code class="example-input" data-for="description">chr11:g.111959693G>T</code> and <code class="example-input" data-for="description">NC_000011.9:g.111959693G>T</code></p> + <p>Examples: + <code class="example-input" + data-for="description">NM_003002.3:c.274G>T</code>, + <code class="example-input" + data-for="description">LRG_9t1:c.274G>T</code>, + <code class="example-input" + data-for="description">chr11:g.111959693G>T</code> and + <code class="example-input" + data-for="description">NC_000011.9:g.111959693G>T</code> + </p> </div> <div class="form-group button-group"> <input type="submit" class="btn btn-primary" value="Convert variant description"> diff --git a/tests/data/hg19.lrgmap.subset.txt b/tests/data/hg19.lrgmap.subset.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f83d3dad55853dbf1e6c915e9e70f6665869f82 --- /dev/null +++ b/tests/data/hg19.lrgmap.subset.txt @@ -0,0 +1,30 @@ +# Last modified: 10-02-2016@22:00:03 +# LRG_TRANSCRIPT HGNC_SYMBOL CHROMOSOME STRAND TRANSCRIPT_START TRANSCRIPT_STOP EXONS_COORDS LRG_PROTEIN CDS_START CDS_STOP +LRG_1t1 COL1A1 17 -1 48261457 48279000 48261457-48263009,48263139-48263381,48263678-48263868,48264001-48264283,48264376-48264483,48264845-48264898,48265237-48265344,48265457-48265510,48265891-48265998,48266103-48266156,48266264-48266371,48266529-48266636,48266738-48266899,48267040-48267093,48267220-48267273,48267362-48267469,48267688-48267741,48267904-48267957,48268178-48268285,48268744-48268851,48269149-48269247,48269341-48269385,48269836-48269889,48270001-48270054,48270158-48270211,48270355-48270408,48271304-48271402,48271491-48271544,48271710-48271808,48271934-48271987,48272082-48272189,48272408-48272461,48272593-48272691,48272795-48272839,48272928-48273026,48273284-48273337,48273516-48273560,48273675-48273728,48273845-48273889,48273978-48274031,48274371-48274424,48274541-48274594,48275093-48275146,48275310-48275363,48275522-48275566,48275794-48275865,48276587-48276688,48276779-48276814,48276917-48276951,48277114-48277308,48278772-48279000 LRG_1p1 48262863 48278874 +LRG_2t1 COL1A2 7 1 94023873 94060544 94023873-94024413,94027060-94027070,94027694-94027708,94028361-94028396,94029508-94029600,94030879-94030932,94033868-94033912,94034005-94034058,94034151-94034204,94034511-94034564,94034985-94035038,94035562-94035615,94037159-94037203,94037495-94037548,94037648-94037692,94038082-94038135,94038634-94038732,94038876-94038920,94039035-94039133,94039554-94039607,94039732-94039839,94040201-94040254,94040368-94040466,94041380-94041433,94041896-94041994,94042395-94042448,94043002-94043055,94043206-94043259,94043534-94043587,94044538-94044582,94045717-94045815,94047036-94047143,94047811-94047864,94048810-94048863,94049545-94049598,94049703-94049756,94049853-94049960,94050321-94050374,94051211-94051264,94052269-94052430,94053648-94053755,94054429-94054536,94054922-94054975,94055062-94055169,94055310-94055363,94055735-94055842,94056320-94056373,94056500-94056607,94056939-94057197,94057605-94057789,94058500-94058742,94059559-94060544 LRG_2p1 94024344 94059705 +LRG_3t1 COL3A1 2 1 189839099 189877472 189839099-189839294,189849486-189849688,189849923-189849973,189850391-189850504,189851785-189851865,189852807-189852860,189853316-189853369,189854122-189854175,189854822-189854875,189855033-189855086,189855730-189855783,189856213-189856257,189856395-189856448,189856910-189856954,189857613-189857666,189858087-189858185,189858764-189858808,189858960-189859058,189859267-189859320,189859450-189859557,189859772-189859825,189860418-189860516,189860851-189860904,189861124-189861222,189861891-189861944,189862062-189862115,189862426-189862479,189862992-189863045,189863400-189863444,189864011-189864109,189864196-189864303,189864568-189864621,189866123-189866176,189866262-189866315,189867024-189867077,189867681-189867788,189868137-189868190,189868460-189868513,189868708-189868869,189868983-189869090,189870076-189870183,189870932-189870985,189871071-189871178,189871663-189871716,189872226-189872333,189872611-189872664,189872761-189872868,189873650-189873947,189874904-189875091,189875374-189875616,189876354-189877472 LRG_3p1 189839216 189876500 +LRG_4t1 CRTAP 3 1 33155450 33189265 33155450-33156040,33161836-33161985,33165900-33166071,33171431-33171559,33174047-33174192,33175674-33175757,33183887-33189265 LRG_4p1 33155570 33183940 +LRG_5t1 P3H1 1 -1 43212006 43232755 43212006-43212523,43212943-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p1 43212368 43232642 +LRG_5t2 P3H1 1 -1 43212006 43232755 43212006-43212523,43212924-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p2 43212504 43232642 +LRG_5t3 P3H1 1 -1 43212006 43232755 43212006-43213083,43213394-43213469,43213871-43213988,43215857-43216007,43217945-43218040,43218208-43218335,43220540-43220661,43220836-43220888,43221219-43221308,43223454-43223593,43224523-43224654,43224872-43225061,43227994-43228146,43232178-43232755 LRG_5p3 43212583 43232642 +LRG_6t1 ATP1A2 1 1 160085548 160113381 160085548-160085663,160090696-160090800,160090982-160091041,160093003-160093206,160093733-160093846,160094086-160094220,160094926-160095043,160097342-160097610,160098442-160098640,160098770-160098879,160099056-160099190,160099892-160100081,160100212-160100387,160104274-160104410,160104935-160105085,160105224-160105392,160105629-160105783,160106037-160106160,160106360-160106505,160106691-160106821,160109430-160109531,160109683-160109774,160111084-160113381 LRG_6p1 160085652 160111112 +LRG_160t1 PIGA X -1 15337573 15353676 15337573-15339894,15342787-15342993,15343142-15343274,15344036-15344168,15349338-15350114,15353623-15353676 LRG_160p1 15339628 15350052 +LRG_161t1 PMS2 7 -1 6012870 6048737 6012870-6013173,6017219-6017388,6018227-6018327,6022455-6022622,6026390-6027251,6029431-6029586,6031604-6031688,6035165-6035264,6036957-6037054,6038739-6038906,6042084-6042267,6043321-6043423,6043603-6043689,6045523-6045662,6048628-6048737 LRG_161p1 6013030 6048650 +LRG_162t1 PRKDC 8 -1 48685669 48872743 48685669-48686938,48689405-48689544,48690247-48690435,48691020-48691221,48691289-48691360,48691565-48691654,48694723-48694815,48694939-48695159,48696303-48696370,48697674-48697878,48701467-48701610,48701712-48701799,48706851-48707062,48710798-48710958,48711771-48711951,48713354-48713547,48715867-48716041,48719698-48719887,48730011-48730122,48731963-48732071,48733280-48733504,48734165-48734353,48736419-48736557,48739217-48739422,48740729-48740908,48743166-48743297,48744375-48744487,48746757-48746957,48748899-48749088,48749773-48749980,48751709-48751807,48752577-48752750,48761715-48761864,48761940-48762064,48765234-48765345,48766644-48766775,48767783-48767934,48769717-48769860,48771077-48771196,48771410-48771547,48772172-48772320,48773460-48773532,48774623-48774688,48774934-48775102,48775960-48776138,48777117-48777324,48790285-48790412,48792052-48792219,48793977-48794081,48794473-48794658,48798505-48798708,48800108-48800266,48801079-48801211,48801575-48801783,48802818-48803041,48805700-48805947,48809721-48809854,48811030-48811129,48812933-48813027,48815129-48815355,48817429-48817536,48824970-48825122,48826461-48826624,48827888-48827978,48830837-48830943,48839754-48839913,48840331-48840450,48841652-48841738,48842413-48842572,48843232-48843347,48845580-48845732,48846525-48846650,48847569-48847618,48848292-48848460,48848913-48849077,48852111-48852257,48855769-48855926,48856413-48856443,48856534-48856589,48866180-48866279,48866367-48866479,48866898-48867006,48868434-48868508,48869731-48869823,48869915-48869991,48872533-48872743 LRG_162p1 48686734 48872686 +LRG_163t1 RMRP 9 -1 35657748 35658015 35657748-35658015 +LRG_164t1 STIM1 11 1 3876933 4114440 3876933-3877639,3988782-3988912,4045103-4045217,4076756-4076867,4080511-4080626,4091256-4091433,4095732-4095909,4103414-4103581,4104112-4104212,4104493-4104728,4107707-4107773,4112512-4114440 LRG_164p1 3877501 4113028 +LRG_165t1 STXBP2 19 1 7701991 7712759 7701991-7702072,7703612-7703661,7703905-7703986,7704617-7704693,7705617-7705695,7705786-7705889,7706591-7706739,7706920-7707004,7707089-7707219,7707315-7707422,7707652-7707709,7707869-7707934,7708051-7708131,7709500-7709638,7710083-7710192,7711135-7711230,7712048-7712133,7712240-7712397,7712611-7712759 LRG_165p1 7702036 7712696 +LRG_166t1 TAP1 6 -1 32812986 32821748 32812986-32813562,32814845-32814981,32815290-32815452,32815696-32815869,32816429-32816617,32816767-32816895,32818097-32818294,32818721-32818926,32819886-32820016,32820165-32820279,32820816-32821748 LRG_166p1 32813356 32821593 +LRG_167t1 TAP2 6 -1 32793187 32806547 32793187-32796811,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p1 32796632 32806010 +LRG_167t2 TAP2 6 -1 32789610 32806547 32789610-32790095,32797177-32797313,32797707-32797866,32798044-32798217,32798395-32798583,32800110-32800238,32800404-32800601,32802931-32803136,32803420-32803550,32805314-32805428,32805518-32806014,32806430-32806547 LRG_167p2 32790066 32806010 +LRG_168t1 THBD 20 -1 23026270 23030301 23026270-23030301 LRG_168p1 23028414 23030141 +LRG_341t2 UNC119 17 -1 26873725 26879646 26873725-26874867,26875017-26875119,26875610-26875723,26879356-26879646 LRG_341p2 26874642 26879575 +LRG_343t1 TERT 5 -1 1253282 1295162 1253282-1253946,1254483-1254620,1255402-1255526,1258713-1258774,1260589-1260715,1264519-1264707,1266579-1266650,1268635-1268748,1271234-1271319,1272300-1272395,1278756-1278911,1279406-1279585,1280273-1280453,1282544-1282739,1293428-1294781,1294886-1295162 LRG_343p1 1253843 1295104 +LRG_344t1 KRAS 12 -1 25357723 25403865 25357723-25362845,25368371-25368494,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p1 25368375 25398318 +LRG_344t2 KRAS 12 -1 25357723 25403865 25357723-25362845,25378548-25378707,25380168-25380346,25398208-25398329,25403685-25403865 LRG_344p2 25362729 25398318 +LRG_345t1 NOP10 15 -1 34633917 34635362 34633917-34634309,34635221-34635362 LRG_345p1 34634169 34635274 +LRG_346t1 NHP2 5 -1 177576464 177580961 177576464-177576839,177577889-177577994,177580492-177580561,177580659-177580961 LRG_346p1 177576714 177580818 +LRG_347t1 TERC 3 -1 169482398 169482848 169482398-169482848 +LRG_348t1 CR2 1 1 207627645 207663240 207627645-207627821,207639871-207640257,207641869-207642060,207642145-207642247,207642495-207642577,207643040-207643447,207644085-207644261,207644342-207644432,207644768-207644844,207646117-207646524,207646890-207647066,207647146-207647230,207647586-207647668,207648169-207648561,207649579-207649764,207651230-207651415,207652602-207652625,207653323-207653398,207658809-207658917,207662487-207663240 LRG_348p1 207627764 207658899 +LRG_349t1 MASP1 3 -1 186964142 187009810 186964142-186965173,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p1 186965121 187009420 +LRG_349t2 MASP1 3 -1 186933873 187009810 186933873-186938049,186938823-186938922,186940915-186940982,186943112-186943297,186944195-186944308,186947548-186947685,186959269-186959343,186961272-186961409,186968039-186968117,186969422-186969540,186970956-186971103,186974452-186974648,186978529-186978660,186980331-186980508,187003613-187003844,187009416-187009810 LRG_349p2 186937859 187009420 diff --git a/tests/fixtures.py b/tests/fixtures.py index f96a9c3a82c62d83e166d77265107ad79681f604..28e168f12e12d62473b5984597b9d2eab76fbdea 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -593,6 +593,54 @@ def hg19_transcript_mappings(db, hg19): cds=(37035039, 37092144), select_transcript=False, version=3)) + db.session.add(TranscriptMapping( + hg19.chromosomes.filter_by(name='chr17').one(), + 'lrg', + 'LRG_1', + 'COL1A1', + 'reverse', + 48261457, + 48279000, + [48261457, 48263139, 48263678, 48264001, 48264376, 48264845, 48265237, + 48265457, 48265891, 48266103, 48266264, 48266529, 48266738, 48267040, + 48267220, 48267362, 48267688, 48267904, 48268178, 48268744, 48269149, + 48269341, 48269836, 48270001, 48270158, 48270355, 48271304, 48271491, + 48271710, 48271934, 48272082, 48272408, 48272593, 48272795, 48272928, + 48273284, 48273516, 48273675, 48273845, 48273978, 48274371, 48274541, + 48275093, 48275310, 48275522, 48275794, 48276587, 48276779, 48276917, + 48277114, 48278772], + [48263009, 48263381, 48263868, 48264283, 48264483, 48264898, 48265344, + 48265510, 48265998, 48266156, 48266371, 48266636, 48266899, 48267093, + 48267273, 48267469, 48267741, 48267957, 48268285, 48268851, 48269247, + 48269385, 48269889, 48270054, 48270211, 48270408, 48271402, 48271544, + 48271808, 48271987, 48272189, 48272461, 48272691, 48272839, 48273026, + 48273337, 48273560, 48273728, 48273889, 48274031, 48274424, 48274594, + 48275146, 48275363, 48275566, 48275865, 48276688, 48276814, 48276951, + 48277308, 48279000], + 'ebi', + transcript=1, + cds=(48262863, 48278874), + select_transcript=True)) + db.session.add(TranscriptMapping( + hg19.chromosomes.filter_by(name='chr1').one(), + 'lrg', + 'LRG_348', + 'CR2', + 'forward', + 207627645, + 207663240, + [207627645, 207639871, 207641872, 207642145, 207642495, 207643040, + 207644085, 207644342, 207644768, 207646117, 207646890, 207647146, + 207647586, 207648169, 207649579, 207651230, 207652602, 207653323, + 207658809, 207662487], + [207627821, 207640257, 207642060, 207642244, 207642577, 207643447, + 207644261, 207644432, 207644844, 207646524, 207647066, 207647230, + 207647668, 207648561, 207649764, 207651415, 207652625, 207653398, + 207658917, 207663240], + 'ebi', + transcript=1, + cds=(207627764, 207658899), + select_transcript=True)) db.session.commit() diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 30b4c7efd8bebd9f0fa650a73b6ac88c763efed6..aaebbc8d82c145c69886042f5b28415e7b1b2dfa 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -14,6 +14,39 @@ from mutalyzer.db.models import TranscriptMapping from mutalyzer import mapping +# Some example positional coding/chromosomal mappings we use in the tests. +LRG_1_T1_POSITIONS = [ + ('-150', 48279024), + ('-126', 48279000), + ('-1', 48278875), + ('1', 48278874), + ('103', 48278772), + ('103+5', 48278767), + ('104-5', 48277313), + ('104', 48277308), + ('870', 48273878), + ('4248', 48263139), + ('4249', 48263009), + ('4395', 48262863), + ('*1', 48262862), + ('*1406', 48261457), + ('*1407', 48261456), + ('*1417', 48261446)] +LRG_348_T1_POSITIONS = [ + ('-150', 207627614), + ('-119', 207627645), + ('-1', 207627763), + ('1', 207627764), + ('58', 207627821), + ('58+5', 207627826), + ('59-5', 207639866), + ('59', 207639871), + ('3279', 207658899), + ('*1', 207658900), + ('*772', 207663240), + ('*780', 207663248)] + + pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings') @@ -328,6 +361,44 @@ def test_ins_seq_seq_c2chrom_reverse(converter): assert genomic == 'NC_000011.9:g.111957482_111957483ins[GAT;AAA]' +@pytest.mark.parametrize('coding,chromosomal', LRG_1_T1_POSITIONS) +def test_lrg_1t1_c2chrom(converter, coding, chromosomal): + """ + Conversion from LRG reference on reverse strand. + """ + chromosomal_descr = converter.c2chrom('LRG_1t1:c.%sdel' % coding) + assert chromosomal_descr == 'NC_000017.10:g.%ddel' % chromosomal + + +@pytest.mark.parametrize('coding,chromosomal', LRG_1_T1_POSITIONS) +def test_lrg_1t1_chrom2c(converter, coding, chromosomal): + """ + Conversion to LRG reference on reverse strand. + """ + coding_descr = converter.chrom2c( + 'NC_000017.10:g.%ddel' % chromosomal, 'list') + assert 'LRG_1t1:c.%sdel' % coding in coding_descr + + +@pytest.mark.parametrize('coding,chromosomal', LRG_348_T1_POSITIONS) +def test_lrg_348t1_c2chrom(converter, coding, chromosomal): + """ + Conversion from LRG reference on forward strand. + """ + chromosomal_descr = converter.c2chrom('LRG_348t1:c.%sdel' % coding) + assert chromosomal_descr == 'NC_000001.10:g.%ddel' % chromosomal + + +@pytest.mark.parametrize('coding,chromosomal', LRG_348_T1_POSITIONS) +def test_lrg_348t1_chrom2c(converter, coding, chromosomal): + """ + Conversion to LRG reference on forward strand. + """ + coding_descr = converter.chrom2c( + 'NC_000001.10:g.%ddel' % chromosomal, 'list') + assert 'LRG_348t1:c.%sdel' % coding in coding_descr + + def test_import_mapview(hg19): original_count = TranscriptMapping.query.count() @@ -377,3 +448,73 @@ def test_import_mapview(hg19): assert new.orientation == 'forward' assert new.reference_type == 'refseq' assert new.source == 'ncbi' + + +def test_import_lrgmap(hg19): + original_count = TranscriptMapping.query.count() + + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', + 'hg19.lrgmap.subset.txt') + lrgmap = codecs.open(path, encoding='utf-8') + lrgmap_count = sum(1 for line in lrgmap if not line.startswith('#')) + lrgmap.seek(0) + + mapping.import_from_lrgmap_file(hg19, lrgmap) + + # Two transcripts were already in, the rest is new: + # - LRG_1 + # - LRG_348 + assert TranscriptMapping.query.count() == original_count + lrgmap_count - 2 + + # No changes here. + unchanged = TranscriptMapping.query.filter_by(accession='LRG_1').one() + assert unchanged.start == 48261457 + assert unchanged.stop == 48279000 + assert unchanged.exon_starts == [ + 48261457, 48263139, 48263678, 48264001, 48264376, 48264845, 48265237, + 48265457, 48265891, 48266103, 48266264, 48266529, 48266738, 48267040, + 48267220, 48267362, 48267688, 48267904, 48268178, 48268744, 48269149, + 48269341, 48269836, 48270001, 48270158, 48270355, 48271304, 48271491, + 48271710, 48271934, 48272082, 48272408, 48272593, 48272795, 48272928, + 48273284, 48273516, 48273675, 48273845, 48273978, 48274371, 48274541, + 48275093, 48275310, 48275522, 48275794, 48276587, 48276779, 48276917, + 48277114, 48278772] + assert unchanged.exon_stops == [ + 48263009, 48263381, 48263868, 48264283, 48264483, 48264898, 48265344, + 48265510, 48265998, 48266156, 48266371, 48266636, 48266899, 48267093, + 48267273, 48267469, 48267741, 48267957, 48268285, 48268851, 48269247, + 48269385, 48269889, 48270054, 48270211, 48270408, 48271402, 48271544, + 48271808, 48271987, 48272189, 48272461, 48272691, 48272839, 48273026, + 48273337, 48273560, 48273728, 48273889, 48274031, 48274424, 48274594, + 48275146, 48275363, 48275566, 48275865, 48276688, 48276814, 48276951, + 48277308, 48279000] + assert unchanged.cds == (48262863, 48278874) + + # We made some artificial changes to the lrgmap file here. + updated = TranscriptMapping.query.filter_by(accession='LRG_348').one() + assert updated.start == 207627645 + assert updated.stop == 207663240 + assert updated.exon_starts == [ + 207627645, 207639871, 207641869, 207642145, 207642495, 207643040, + 207644085, 207644342, 207644768, 207646117, 207646890, 207647146, + 207647586, 207648169, 207649579, 207651230, 207652602, 207653323, + 207658809, 207662487] + assert updated.exon_stops == [ + 207627821, 207640257, 207642060, 207642247, 207642577, 207643447, + 207644261, 207644432, 207644844, 207646524, 207647066, 207647230, + 207647668, 207648561, 207649764, 207651415, 207652625, 207653398, + 207658917, 207663240] + assert updated.cds == (207627764, 207658899) + + # This is a new entry. + new = TranscriptMapping.query.filter_by(accession='LRG_163').one() + assert new.version is None + assert new.start == 35657748 + assert new.stop == 35658015 + assert new.exon_starts == [35657748] + assert new.exon_stops == [35658015] + assert new.cds is None + assert new.gene == 'RMRP' + assert new.orientation == 'reverse' + assert new.reference_type == 'lrg' + assert new.source == 'ebi' diff --git a/tests/test_services_json.py b/tests/test_services_json.py index e47e306e30fe8fef28ecc84f13493efa03e4536e..76bcb8c4dc8294d3ec5fe20542beb0bfff93e745 100644 --- a/tests/test_services_json.py +++ b/tests/test_services_json.py @@ -220,6 +220,7 @@ def test_get_transcripts_mapping(api): r = api('getTranscriptsMapping', 'hg19', 'chr11', 111955524, 111966518) assert r == [{'cds_start': 111957492, 'cds_stop': 111956019, + 'transcript': 'NM_012459.2', 'name': 'NM_012459', 'stop': 111955524, 'start': 111957522, @@ -228,6 +229,7 @@ def test_get_transcripts_mapping(api): 'orientation': '-'}, {'cds_start': None, 'cds_stop': None, + 'transcript': 'NR_028383.1', 'name': 'NR_028383', 'stop': 111955524, 'start': 111957522, @@ -236,6 +238,7 @@ def test_get_transcripts_mapping(api): 'orientation': '-'}, {'cds_start': 111957632, 'cds_stop': 111965694, + 'transcript': 'NM_003002.2', 'name': 'NM_003002', 'stop': 111966518, 'start': 111957571, diff --git a/tests/test_services_soap.py b/tests/test_services_soap.py index 37a5799fba0a02f0a26aeeabbe906e4936250b3c..83792b7f54d3571b362f9cb310ffc2deafead2c8 100644 --- a/tests/test_services_soap.py +++ b/tests/test_services_soap.py @@ -166,6 +166,52 @@ def test_numberconversion_gtoc_required_gene(api): assert 'XM_001715131.2:c.*19483A>G' in r.string +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscripts_lrg(api): + """ + Running getTranscripts should give us overlapping transcripts. + list of transcripts including LRG transcripts. + """ + r = api('getTranscripts', build='hg19', chrom='chr1', + pos=207646118) + assert type(r.string) == list + assert 'LRG_348t1' in r.string + + +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscripts_mtdna(api): + """ + Running getTranscripts should give us overlapping transcripts. + list of transcripts, also on chrM. + """ + r = api('getTranscripts', build='hg19', chrom='chrM', + pos=10765) + assert type(r.string) == list + assert 'NC_012920(ND4_v001)' in r.string + + +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscriptsrange_lrg(api): + """ + Running getTranscriptsRange should give us overlapping transcripts. + list of transcripts including LRG transcripts. + """ + r = api('getTranscriptsRange', 'hg19', 'chr1', 207646118, 207646118, 1) + assert type(r.string) == list + assert 'LRG_348t1' in r.string + + +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscriptsrange_mtdna(api): + """ + Running getTranscripts should give us overlapping transcripts. + list of transcripts, also on chrM. + """ + r = api('getTranscriptsRange', 'hg19', 'chrM', 10765, 10765, 1) + assert type(r.string) == list + assert 'NC_012920(ND4_v001)' in r.string + + @pytest.mark.usefixtures('hg19_transcript_mappings') def test_gettranscriptsbygenename_valid(api): """ @@ -180,6 +226,28 @@ def test_gettranscriptsbygenename_valid(api): assert t in r.string +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscriptsbygenename_valid_lrg(api): + """ + Running getTranscriptsByGeneName with valid gene name should give a + list of transcripts including LRG transcripts. + """ + r = api('getTranscriptsByGeneName', build='hg19', name='CR2') + assert type(r.string) == list + assert 'LRG_348t1' in r.string + + +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscriptsbygenename_valid_mtdna(api): + """ + Running getTranscriptsByGeneName with valid gene name should give a + list of transcripts also on chrM. + """ + r = api('getTranscriptsByGeneName', build='hg19', name='ND4') + assert type(r.string) == list + assert 'NC_012920.1(ND4_v001)' in r.string + + @pytest.mark.usefixtures('hg19_transcript_mappings') def test_gettranscriptsbygenename_invalid(api): """ @@ -190,6 +258,22 @@ def test_gettranscriptsbygenename_invalid(api): assert not r +@pytest.mark.usefixtures('hg19_transcript_mappings') +def test_gettranscriptsmapping_mtdna(api): + """ + Running getTranscriptsMapping on mtDNA should give a list of transcripts + including their complete identification in the transcript attribute. + """ + r = api('getTranscriptsMapping', 'hg19', 'chrM', 10765, 10765, 1) + assert type(r.TranscriptMappingInfo) == list + assert len(r.TranscriptMappingInfo) == 1 + info = r.TranscriptMappingInfo[0] + assert 'NC_012920' == info.name + assert 1 == info.version + assert 'ND4' == info.gene + assert 'NC_012920.1(ND4_v001)' == info.transcript + + @with_references('AF230870.1') def test_gettranscriptsandinfo_valid(api): """ @@ -736,6 +820,7 @@ def test_get_transcripts_mapping(api): for t_real, t_expected in zip(r.TranscriptMappingInfo, [{'cds_start': 111957492, 'cds_stop': 111956019, + 'transcript': 'NM_012459.2', 'name': 'NM_012459', 'stop': 111955524, 'start': 111957522, @@ -744,6 +829,7 @@ def test_get_transcripts_mapping(api): 'orientation': '-'}, {'cds_start': None, 'cds_stop': None, + 'transcript': 'NR_028383.1', 'name': 'NR_028383', 'stop': 111955524, 'start': 111957522, @@ -752,6 +838,7 @@ def test_get_transcripts_mapping(api): 'orientation': '-'}, {'cds_start': 111957632, 'cds_stop': 111965694, + 'transcript': 'NM_003002.2', 'name': 'NM_003002', 'stop': 111966518, 'start': 111957571,