diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index a38ab2aaf358334b4b9a3f9da9a55802c3d433e4..12f94486a0e351f37b5ddbd14b99bca5789c5960 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -7,8 +7,9 @@ from datetime import datetime import sqlite3 import uuid -from sqlalchemy import (event, Boolean, Column, DateTime, Enum, ForeignKey, - Index, Integer, String, Text, TypeDecorator) +from sqlalchemy import event, or_ +from sqlalchemy import (Boolean, Column, DateTime, Enum, ForeignKey, Index, + Integer, String, Text, TypeDecorator) from sqlalchemy.engine import Engine from sqlalchemy.orm import backref, relationship @@ -274,6 +275,16 @@ class Assembly(db.Base): return '<Assembly %r taxonomy=%r>' % (self.name, self.taxonomy_common_name) + @classmethod + def by_name_or_alias(cls, name_or_alias): + """ + Returns an :class:`Assembly` instance for the given `name_or_alias` if + it exists, or raises :exc:`sqlalchemy.orm.exc.NoResultFound` + otherwise. + """ + return cls.query.filter(or_(cls.name == name_or_alias, + cls.alias == name_or_alias)).one() + class Chromosome(db.Base): """ diff --git a/mutalyzer/entrypoints/admin.py b/mutalyzer/entrypoints/admin.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ecca89c9bc4d254b337d628952ad10f27df820 --- /dev/null +++ b/mutalyzer/entrypoints/admin.py @@ -0,0 +1,158 @@ +""" +Command line interface to Mutalyzer administrative tools. +""" + + +# Todo: Manage genome assemblies. + + +import argparse + +from sqlalchemy.orm.exc import NoResultFound + +from ..db.models import Assembly +from .. import mapping +from .. import output +from .. import sync + + +class UserError(Exception): + pass + + +def import_mapview(assembly_name_or_alias, mapview_file, group_label): + """ + Import transcript mappings from an NCBI mapview file. + """ + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: + raise UserError('Not a valid assembly: %s' % assembly_name_or_alias) + + try: + mapping.import_from_mapview_file(assembly, mapview_file, group_label) + except mapping.MapviewSortError as e: + raise UserError(str(e)) + + +def import_gene(assembly_name_or_alias, gene): + """ + Import transcript mappings for a gene from the UCSC database. + """ + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: + raise UserError('Not a valid assembly: %s' % assembly_name_or_alias) + + mapping.import_from_ucsc_by_gene(assembly, gene) + + +def import_reference(assembly_name_or_alias, reference): + """ + Import transcript mappings from a genomic reference. Note that this is + currently hard-coded to only work with mtDNA transcripts. + """ + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: + raise UserError('Not a valid assembly: %s' % assembly_name_or_alias) + + mapping.import_from_reference(assembly, reference) + + +def sync_cache(wsdl_url, url_template, history=7): + """ + Synchronize the database cache with another Mutalyzer instance. + + This program is intended to be run daily from cron. Example: + + 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' -H 7 + 55 5 * * * mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' -H 7 + """ + cache_sync = sync.CacheSync(output.Output(__file__)) + cache_sync.sync_with_remote(wsdl_url, url_template, history) + + +def main(): + """ + Command-line interface to Mutalyzer administrative tools. + """ + assembly_parser = argparse.ArgumentParser(add_help=False) + assembly_parser.add_argument( + '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias', + default='hg19', help='assembly to import to (default: hg19)') + + parser = argparse.ArgumentParser( + description='Mutalyzer administrative tools.') + subparsers = parser.add_subparsers( + title='subcommands', dest='subcommand', help='subcommand help') + + p = subparsers.add_parser( + 'import-mapview', help='import mappings from NCBI mapview file', + parents=[assembly_parser], + description=import_mapview.__doc__.split('\n\n')[0], + epilog='Note: We require that FILE is sorted on the `feature_id` ' + '(#11) and `chromosome` (#2) columns. This can be done with a ' + '`sort -k 11,11 -k 2,2` command.') + p.set_defaults(func=import_mapview) + p.add_argument( + 'mapview_file', metavar='FILE', type=argparse.FileType('r'), + help='file from NCBI mapview (example: seq_gene.md), see note below') + p.add_argument( + 'group_label', metavar='GROUP_LABEL', + help='use only entries with this group label (example: ' + 'GRCh37.p2-Primary Assembly)') + + p = subparsers.add_parser( + 'import-gene', help='import mappings by gene from UCSC database', + parents=[assembly_parser], + description=import_gene.__doc__.split('\n\n')[0], + epilog='Intended use is whenever transcript mappings for a specific ' + 'gene are required that are not available from our usual source ' + ' (i.e., NCBI mapview).') + p.set_defaults(func=import_gene) + p.add_argument( + 'gene', metavar='GENE_SYMBOL', + help='gene to import all transcript mappings for from the UCSC ' + 'database (example: TTN)') + + p = subparsers.add_parser( + 'import-reference', help='import mappings from reference', + parents=[assembly_parser], + description=import_reference.__doc__.split('\n\n')[0], + epilog='Intended use is whenever transcript mappings from a specific ' + 'genomic reference are required that are not available from our ' + 'usual source (i.e., NCBI mapview).') + p.set_defaults(func=import_reference) + p.add_argument( + 'reference', metavar='ACCESSION', + help='genomic reference to import all genes from (example: ' + 'NC_012920.1)') + + p = subparsers.add_parser( + 'sync-cache', help='synchronize cache with remote Mutalyzer', + description=sync_cache.__doc__.split('\n\n')[0], + epilog='Intended use is to run daily from cron.') + p.add_argument( + 'wsdl_url', metavar='WSDL_URL', + help='location of the remote WSDL description') + p.add_argument( + 'url_template', metavar='URL_TEMPLATE', + help='URL for remote downloads, in which the filename is to be ' + 'substituted for {file}') + p.add_argument( + '-H', '--history', metavar='DAYS', dest='history', type=int, + default=7, help='number of days to go back in the remote cache ' + '(default: 7)') + + args = parser.parse_args() + + try: + args.func(**{k: v for k, v in vars(args).items() + if k not in ('func', 'subcommand')}) + except UserError as e: + parser.error(str(e)) + + +if __name__ == '__main__': + main() diff --git a/mutalyzer/entrypoints/cache_sync.py b/mutalyzer/entrypoints/cache_sync.py deleted file mode 100644 index fc2c6bee46035ddffdd5a4b07f7020fdd331364a..0000000000000000000000000000000000000000 --- a/mutalyzer/entrypoints/cache_sync.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Synchronize the database cache with other Mutalyzer instances. - -This program is intended to be run daily from cron. Example: - - 25 5 * * * mutalyzer-cache-sync 'http://dom1/?wsdl' 'http://dom1/{file}' -H 7 - 55 5 * * * mutalyzer-cache-sync 'http://dom2/?wsdl' 'http://dom2/{file}' -H 7 -""" - - -import argparse - -from .. import output -from .. import sync - - -def sync_cache(remote_wsdl, url_template, history=7): - """ - Synchronize the database cache with other Mutalyzer instances. - """ - output = output.Output(__file__) - - cache_sync = sync.CacheSync(output) - cache_sync.sync_with_remote(remote_wsdl, url_template, history) - - -def main(): - """ - Command-line interface to the cache synchronizer. - """ - parser = argparse.ArgumentParser( - description='Mutalyzer cache synchronizer.') - parser.add_argument( - 'wsdl', metavar='WSDL', - help='location of the remote WSDL description') - parser.add_argument( - 'url_template', metavar='URL_TEMPLATE', - help='URL for remote downloads, in which the filename is to be ' - 'substituted for {{file}}') - parser.add_argument( - '-H', '--history', metavar='DAYS', dest='history', type=int, - default=7, help='number of days to go back in the remote cache ' - '(default: 7)') - - args = parser.parse_args() - sync_cache(args.wsdl, args.url_template, history=args.history) - - -if __name__ == '__main__': - main() diff --git a/mutalyzer/entrypoints/mapping_import.py b/mutalyzer/entrypoints/mapping_import.py deleted file mode 100644 index 1d72523e6b6e547ac19281d6fdc272f0589916d3..0000000000000000000000000000000000000000 --- a/mutalyzer/entrypoints/mapping_import.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -Update the database with mapping information for the given gene or genomic -reference. -""" - - -import argparse -from operator import attrgetter -import sys - -from sqlalchemy import or_ - -from ..db import session -from ..db.models import Assembly, TranscriptMapping -from .. import output -from .. import Retriever - - -def import_gene(assembly, gene): - """ - Update the database with information from the UCSC. - - .. todo: This is not really tested. - """ - o = output.Output(__file__) - o.addMessage(__file__, -1, 'INFO', - 'Starting UCSC mapping data update (gene: %s)' % gene) - - connection = MySQLdb.connect(user='genome', - host='genome-mysql.cse.ucsc.edu', - db=assembly.alias) - - query = """ - SELECT DISTINCT - acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, - exonEnds, name2 AS geneName, chrom, strand, protAcc - FROM gbStatus, refGene, refLink - WHERE type = "mRNA" - AND refGene.name = acc - AND acc = mrnaAcc - AND name2 = %s - """ - parameters = gene - - cursor = connection.cursor() - cursor.execute(query, parameters) - result = cursor.fetchall() - cursor.close() - - for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, - geneName, chrom, strand, protAcc) in result: - orientation = 'reverse' if strand == '-' else 'forward' - exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i] - exon_stops = [int(i) for i in exonEnds.split(',') if i] - if cdsStart and cdsEnd: - cds = cdsStart + 1, cdsEnd - else: - cds = None - mapping = TranscriptMapping.create_or_update( - chrom, 'refseq', acc, gene.name, orientation, txStart + 1, txEnd, - exon_starts, exon_stops, 'ucsc', cds=cds, version=int(version)) - session.add(mapping) - - session.commit() - - o.addMessage(__file__, -1, 'INFO', - 'UCSC mapping data update end (gene: %s)' % gene) - - -def import_reference(assembly, reference): - """ - Update the database with information from the given reference. - - .. todo: Also report how much was added/updated. - - .. note: Currently no exon locations are supported, this has only been - tested on mtDNA. - """ - o = output.Output(__file__) - o.addMessage(__file__, -1, 'INFO', - 'Starting reference mapping data update (reference: %s)' % reference) - - chromosome = assembly.chromosomes.filter_by(name='chrM').one() - - retriever = Retriever.GenBankRetriever(o) - record = retriever.loadrecord(reference) - - select_transcript = len(record.geneList) > 1 - - for gene in record.geneList: - # We support exactly one transcript per gene. - try: - transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0] - except IndexError: - continue - - # We use gene.location for now, it is always present and the same - # for our purposes. - #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1] - start, stop = gene.location - - orientation = 'reverse' if gene.orientation == -1 else 'forward' - - try: - cds = transcript.CDS.location - except AttributeError: - cds = None - - mapping = TranscriptMapping.create_or_update( - chromosome, 'refseq', record.source_accession, gene.name, - orientation, start, stop, [start], [stop], 'reference', cds=cds, - select_transcript=select_transcript, - version=int(record.source_version)) - session.add(mapping) - - session.commit() - - o.addMessage(__file__, -1, 'INFO', - 'Reference mapping data update end (reference: %s)' % reference) - - -def main(): - """ - Command-line interface to the mapping importer. - """ - database_parser = argparse.ArgumentParser(add_help=False) - database_parser.add_argument( - '-a', '--assembly', metavar='ASSEMBLY', dest='assembly_name_or_alias', - default='hg19', help='assembly to import to (default: hg19)') - - parser = argparse.ArgumentParser( - description='Mutalyzer mapping importer.', - epilog='This program is intended to be run manually whenever ' - 'transcript mappings for specific genes are required that are not ' - 'yet in our database (i.e., they are not yet available from the ' - 'NCBI, or they are mtDNA genes). It will not overwrite ' - 'transcript/version entries that are already in our database.', - parents=[database_parser]) - - subparsers = parser.add_subparsers( - title='subcommands', dest='subcommand', help='subcommand help') - - p = subparsers.add_parser( - 'gene', help='import gene', parents=[database_parser], - description='Import gene mapping from the UCSC.') - p.add_argument( - 'gene', metavar='GENE_SYMBOL', - help='gene to import all transcript mappings for from the UCSC ' - 'database (example: TTN)') - - p = subparsers.add_parser( - 'reference', help='import reference', parents=[database_parser], - description='Import genomic reference file') - p.add_argument( - 'reference', metavar='FILE', - help='genomic reference to import all genes from (example: ' - 'NC_012920.1)') - - args = parser.parse_args() - - assembly = Assembly.query \ - .filter(or_(Assembly.name == args.assembly_name_or_alias, - Assembly.alias == args.assembly_name_or_alias)).first() - if not assembly: - sys.stderr.write('Invalid assembly: %s\n' % (assembly_name_or_alias,)) - sys.exit(1) - - if args.subcommand == 'gene': - import_gene(assembly, args.gene) - if args.subcommand == 'reference': - import_reference(assembly, args.reference) - - -if __name__ == '__main__': - main() diff --git a/mutalyzer/entrypoints/mapping_update.py b/mutalyzer/entrypoints/mapping_update.py deleted file mode 100644 index e5610ea8cb6cf92c09181ff5005b545cc9e22fdc..0000000000000000000000000000000000000000 --- a/mutalyzer/entrypoints/mapping_update.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -Update the database with mapping information from the NCBI. - -This program is intended to be run daily from cron. Example: - - 25 6 * * * mutalyzer-mapping-update hg19 /tmp/seq_gene.md reference -""" - - -# Todo: Merge this script with mapping_import, the difference between the two -# makes no sense. -# Todo: Check if the input file is sorted as required and abort if not. - - -import argparse -from collections import defaultdict -from itertools import groupby -from operator import itemgetter -import sys - -from sqlalchemy import or_ - -from ..db import session -from ..db.models import Assembly, Chromosome, TranscriptMapping - - -COLUMNS = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig', - 'ctg_start', 'ctg_stop', 'ctg_orientation', 'feature_name', - 'feature_id', 'feature_type', 'group_label', 'transcript', - 'evidence_code'] - - -def update_mapping(mappings_path, group_label, assembly_name_or_alias): - """ - Update the database with information from a NCBI `seq_gene.md` file. - - We require that this file is first sorted on the `feature_id` column - (#11), which always contains the gene identifier, and then on the - `chromosome` column (#2). - - sort -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md - - Load NCBI mapping information from {mapping_file} into the database. - - The NCBI mapping file consists of entries, one per line, in order of - their location in the genome (more specifically by start location). - Every entry has a 'group_label' column, denoting the assembly it is - from. We only use entries where this value is `group_label`. - - There are four types of entries (for our purposes): - - Gene: Name, identifier, and location of a gene. - - Transcript: Name, gene id, and location of a transcript. - - UTR: Location and transcript of a non-coding exon (or part of it). - - CDS: Location and transcript of a coding exon (or part of it). - - A bit troublesome for us is that exons are split in UTR exons and CDS - exons, with exons overlapping the UTR/CDS border defined as two - separate entries (one of type UTR and one of type CDS). - - Another minor annoyance is that some transcripts (~ 15) are split over - two contigs (NT_*). In that case, they are defined by two entries in - the file, where we should merge them by taking the start position of - the first and the stop position of the second. - - To complicate this annoyance, some genes (e.g. in the PAR) are mapped - on both the X and Y chromosomes, but stored in the file just like the - transcripts split over two contigs. However, these ones should of - course not be merged. - - Our strategy is too sort by gene and chromosome and process the file - grouped by these two fields. - """ - assembly = Assembly.query \ - .filter(or_(Assembly.name == assembly_name_or_alias, - Assembly.alias == assembly_name_or_alias)).first() - if not assembly: - sys.stderr.write('Invalid assembly: %s\n' % (assembly_name_or_alias,)) - sys.exit(1) - - chromosomes = assembly.chromosomes.all() - - def read_records(mappings_file): - for line in mappings_file: - if line.startswith('#'): - continue - record = dict(zip(COLUMNS, line.rstrip().split('\t'))) - - # Only use records from the given assembly. - if record['group_label'] != group_label: - continue - - # Only use records on chromosomes we know. - try: - record['chromosome'] = next(c for c in chromosomes if - c.name == 'chr' + record['chromosome']) - except StopIteration: - continue - - record['start'] = int(record['start']) - record['stop'] = int(record['stop']) - - yield record - - def build_mappings(records): - # We structure the records per transcript and per record type. This is - # generalized to a list of records for each type, but we expect only - # one GENE record (with `-` as transcript value). - # Note that there can be more than one RNA record per transcript if it - # is split over different reference contigs. - by_transcript = defaultdict(lambda: defaultdict(list)) - for r in records: - by_transcript[r['transcript']][r['feature_type']].append(r) - - gene = by_transcript['-']['GENE'][0]['feature_name'] - - for transcript, by_type in by_transcript.items(): - if transcript == '-': - continue - accession, version = transcript.split('.') - version = int(version) - chromosome = by_type['RNA'][0]['chromosome'] - orientation = 'reverse' if by_type['RNA'][0]['orientation'] == '-' else 'forward' - start = min(t['start'] for t in by_type['RNA']) - stop = max(t['stop'] for t in by_type['RNA']) - - exon_starts = [] - exon_stops = [] - cds_positions = [] - for exon in sorted(by_type['UTR'] + by_type['CDS'], - key=itemgetter('start')): - if exon_stops and exon_stops[-1] > exon['start'] - 1: - # This exon starts before the end of the previous exon. We - # have no idea what to do in this case, so we ignore it. - # The number of transcripts affected is very small (e.g., - # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly). - continue - if exon['feature_type'] == 'CDS': - cds_positions.extend([exon['start'], exon['stop']]) - if exon_stops and exon_stops[-1] == exon['start'] - 1: - # This exon must be merged with the previous one because - # it is split over two entries (a CDS part and a UTR part - # or split over different reference contigs). - exon_stops[-1] = exon['stop'] - else: - exon_starts.append(exon['start']) - exon_stops.append(exon['stop']) - - if cds_positions: - cds = min(cds_positions), max(cds_positions) - else: - cds = None - - yield TranscriptMapping.create_or_update( - chromosome, 'refseq', accession, gene, orientation, start, - stop, exon_starts, exon_stops, 'ncbi', cds=cds, - version=version) - - with open(mappings_path) as mappings_file: - for _, records in groupby(read_records(mappings_file), - itemgetter('feature_id', 'chromosome')): - for mapping in build_mappings(records): - session.add(mapping) - - session.commit() - - -def main(): - """ - Command-line interface to the mapping updater. - """ - parser = argparse.ArgumentParser( - description='Mutalyzer mapping updater.') - - parser.add_argument( - 'mappings_path', metavar='FILE', - help='Path to the NCBI mapping information (example: seq_gene.md)') - parser.add_argument( - 'group_label', metavar='GROUP_LABEL', - help='use only entries with this group label') - parser.add_argument( - 'assembly', metavar='ASSEMBLY', - help='import mappings into this assembly (example: hg19)') - - args = parser.parse_args() - update_mapping(args.mappings_path, args.group_label, args.assembly) - - -if __name__ == '__main__': - main() diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index cec62b63ec9e7a7b39066d3959795485aa81a60a..2a4e534e059fc2ef030cb0d1072b243ec7bd9c23 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -11,16 +11,23 @@ update the database with this information. from collections import defaultdict -from operator import attrgetter +from itertools import groupby +from operator import attrgetter, itemgetter from Bio.Seq import reverse_complement import MySQLdb +from mutalyzer.db import session from mutalyzer.db.models import Chromosome, TranscriptMapping from mutalyzer.grammar import Grammar +from mutalyzer.models import SoapMessage, Mapping, Transcript +from mutalyzer.output import Output from mutalyzer import Crossmap from mutalyzer import Retriever -from mutalyzer.models import SoapMessage, Mapping, Transcript + + +class MapviewSortError(Exception): + pass def _construct_change(var, reverse=False): @@ -749,3 +756,234 @@ class Converter(object) : return HGVS_notatations #chrom2c #Converter + + +def import_from_ucsc_by_gene(assembly, gene): + """ + Import transcript mappings for a gene from the UCSC. + + .. todo: Also report how much was added/updated. + """ + connection = MySQLdb.connect(user='genome', + host='genome-mysql.cse.ucsc.edu', + db=assembly.alias) + + query = """ + SELECT DISTINCT + acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, + exonEnds, name2 AS geneName, chrom, strand, protAcc + FROM gbStatus, refGene, refLink + WHERE type = "mRNA" + AND refGene.name = acc + AND acc = mrnaAcc + AND name2 = %s + """ + parameters = gene + + cursor = connection.cursor() + cursor.execute(query, parameters) + result = cursor.fetchall() + cursor.close() + + for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, + geneName, chrom, strand, protAcc) in result: + chromosome = assembly.chromosomes.filter_by(name=chrom).one() + orientation = 'reverse' if strand == '-' else 'forward' + exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i] + exon_stops = [int(i) for i in exonEnds.split(',') if i] + if cdsStart and cdsEnd: + cds = cdsStart + 1, cdsEnd + else: + cds = None + mapping = TranscriptMapping.create_or_update( + chromosome, 'refseq', acc, geneName, orientation, txStart + 1, + txEnd, exon_starts, exon_stops, 'ucsc', cds=cds, + version=int(version)) + session.add(mapping) + + session.commit() + + +def import_from_reference(assembly, reference): + """ + Import transcript mappings from a genomic reference. + + .. todo: Also report how much was added/updated. + + .. note: Currently no exon locations are supported, this has only been + tested on mtDNA. + """ + chromosome = assembly.chromosomes.filter_by(name='chrM').one() + + output = Output(__file__) + retriever = Retriever.GenBankRetriever(output) + record = retriever.loadrecord(reference) + + select_transcript = len(record.geneList) > 1 + + for gene in record.geneList: + # We support exactly one transcript per gene. + try: + transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0] + except IndexError: + continue + + # We use gene.location for now, it is always present and the same + # for our purposes. + #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1] + start, stop = gene.location + + orientation = 'reverse' if gene.orientation == -1 else 'forward' + + try: + cds = transcript.CDS.location + except AttributeError: + cds = None + + mapping = TranscriptMapping.create_or_update( + chromosome, 'refseq', record.source_accession, gene.name, + orientation, start, stop, [start], [stop], 'reference', cds=cds, + select_transcript=select_transcript, + version=int(record.source_version)) + session.add(mapping) + + session.commit() + + +def import_from_mapview_file(assembly, mapview_file, group_label): + """ + Import transcript mappings from an NCBI mapview file. + + We require that this file is first sorted on the `feature_id` column + (#11), which always contains the gene identifier, and then on the + `chromosome` column (#2). + + sort -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md + + Raises :exc:`ValueError` if `mapview_file` is not sorted this way. + + The NCBI mapping file consists of entries, one per line, in order of + their location in the genome (more specifically by start location). + Every entry has a 'group_label' column, denoting the assembly it is + from. We only use entries where this value is `group_label`. + + There are four types of entries (for our purposes): + - Gene: Name, identifier, and location of a gene. + - Transcript: Name, gene id, and location of a transcript. + - UTR: Location and transcript of a non-coding exon (or part of it). + - CDS: Location and transcript of a coding exon (or part of it). + + A bit troublesome for us is that exons are split in UTR exons and CDS + exons, with exons overlapping the UTR/CDS border defined as two + separate entries (one of type UTR and one of type CDS). + + Another minor annoyance is that some transcripts (~ 15) are split over + two contigs (NT_*). In that case, they are defined by two entries in + the file, where we should merge them by taking the start position of + the first and the stop position of the second. + + To complicate this annoyance, some genes (e.g. in the PAR) are mapped + on both the X and Y chromosomes, but stored in the file just like the + transcripts split over two contigs. However, these ones should of + course not be merged. + + Our strategy is too sort by gene and chromosome and process the file + grouped by these two fields. + """ + columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', + 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', + 'feature_name', 'feature_id', 'feature_type', 'group_label', + 'transcript', 'evidence_code'] + + chromosomes = assembly.chromosomes.all() + + def read_records(mapview_file): + for line in mapview_file: + if line.startswith('#'): + continue + record = dict(zip(columns, line.rstrip().split('\t'))) + + # Only use records from the given assembly. + if record['group_label'] != group_label: + continue + + # Only use records on chromosomes we know. + try: + record['chromosome'] = next(c for c in chromosomes if + c.name == 'chr' + record['chromosome']) + except StopIteration: + continue + + record['start'] = int(record['start']) + record['stop'] = int(record['stop']) + + yield record + + def build_mappings(records): + # We structure the records per transcript and per record type. This is + # generalized to a list of records for each type, but we expect only + # one GENE record (with `-` as transcript value). + # Note that there can be more than one RNA record per transcript if it + # is split over different reference contigs. + by_transcript = defaultdict(lambda: defaultdict(list)) + for r in records: + by_transcript[r['transcript']][r['feature_type']].append(r) + + gene = by_transcript['-']['GENE'][0]['feature_name'] + + for transcript, by_type in by_transcript.items(): + if transcript == '-': + continue + accession, version = transcript.split('.') + version = int(version) + chromosome = by_type['RNA'][0]['chromosome'] + orientation = 'reverse' if by_type['RNA'][0]['orientation'] == '-' else 'forward' + start = min(t['start'] for t in by_type['RNA']) + stop = max(t['stop'] for t in by_type['RNA']) + + exon_starts = [] + exon_stops = [] + cds_positions = [] + for exon in sorted(by_type['UTR'] + by_type['CDS'], + key=itemgetter('start')): + if exon_stops and exon_stops[-1] > exon['start'] - 1: + # This exon starts before the end of the previous exon. We + # have no idea what to do in this case, so we ignore it. + # The number of transcripts affected is very small (e.g., + # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly). + continue + if exon['feature_type'] == 'CDS': + cds_positions.extend([exon['start'], exon['stop']]) + if exon_stops and exon_stops[-1] == exon['start'] - 1: + # This exon must be merged with the previous one because + # it is split over two entries (a CDS part and a UTR part + # or split over different reference contigs). + exon_stops[-1] = exon['stop'] + else: + exon_starts.append(exon['start']) + exon_stops.append(exon['stop']) + + if cds_positions: + cds = min(cds_positions), max(cds_positions) + else: + cds = None + + yield TranscriptMapping.create_or_update( + chromosome, 'refseq', accession, gene, orientation, start, + stop, exon_starts, exon_stops, 'ncbi', cds=cds, + version=version) + + processed_keys = set() + + for key, records in groupby(read_records(mapview_file), + itemgetter('feature_id', 'chromosome')): + if key in processed_keys: + raise MapviewSortError('Mapview file must be sorted by feature_id ' + 'and chromosome (try `sort -k 11,11 -k ' + '2,2`)') + processed_keys.add(key) + + for mapping in build_mappings(records): + session.add(mapping) + + session.commit() diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index 6a287117a800b9c041693f06a646013b7a82e3e5..0699ffd83f0ac1719be541bfede7380b1769176c 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -19,7 +19,7 @@ import socket from cStringIO import StringIO import tempfile from operator import itemgetter, attrgetter -from sqlalchemy import and_, or_ +from sqlalchemy.orm.exc import NoResultFound import mutalyzer from mutalyzer.config import settings @@ -184,17 +184,17 @@ class MutalyzerService(ServiceBase): "Received request getTranscripts(%s %s %s %s)" % (build, chrom, pos, versions)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) - chromosome = Chromosome.query.filter_by(assembly=assembly, - name=chrom).first() - if not chromosome: + try: + chromosome = assembly.chromosomes.filter_by(name=chrom).one() + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) raise Fault("EARG", "The chrom argument (%s) was not a valid " \ "chromosome name." % chrom) @@ -224,9 +224,9 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request getTranscriptsByGene(%s %s)" % (build, name)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -270,17 +270,17 @@ class MutalyzerService(ServiceBase): "Received request getTranscriptsRange(%s %s %s %s %s)" % (build, chrom, pos1, pos2, method)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) - chromosome = Chromosome.query.filter_by(assembly=assembly, - name=chrom).first() - if not chromosome: + try: + chromosome = assembly.chromosomes.filter_by(name=chrom).one() + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) raise Fault("EARG", "The chrom argument (%s) was not a valid " \ "chromosome name." % chrom) @@ -337,17 +337,17 @@ class MutalyzerService(ServiceBase): 'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2, method)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) - chromosome = Chromosome.query.filter_by(assembly=assembly, - name=chrom).first() - if not chromosome: + try: + chromosome = assembly.chromosomes.filter_by(name=chrom).one() + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) raise Fault("EARG", "The chrom argument (%s) was not a valid " \ "chromosome name." % chrom) @@ -404,9 +404,9 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request getGeneName(%s %s)" % (build, accno)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -472,9 +472,9 @@ class MutalyzerService(ServiceBase): "Reveived request mappingInfo(%s %s %s %s)" % (LOVD_ver, build, accNo, variant)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -518,9 +518,9 @@ class MutalyzerService(ServiceBase): "Received request transcriptInfo(%s %s %s)" % (LOVD_ver, build, accNo)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: O.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -552,17 +552,17 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request chromAccession(%s %s)" % (build, name)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) - chromosome = Chromosome.query.filter_by(assembly=assembly, - name=name).first() - if not chromosome: + try: + chromosome = assembly.chromosomes.filter_by(name=name).one() + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % name) raise Fault("EARG", "The name argument (%s) was not a valid " \ "chromosome name." % name) @@ -590,17 +590,17 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request chromName(%s %s)" % (build, accNo)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) - chromosome = Chromosome.query.filter_by(assembly=assembly, - accession=accNo).first() - if not chromosome: + try: + chromosome = assembly.chromosomes.filter_by(accession=accNo).one() + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % accNo) raise Fault("EARG", "The accNo argument (%s) was not a valid " \ "chromosome accession." % accNo) @@ -629,9 +629,9 @@ class MutalyzerService(ServiceBase): L.addMessage(__file__, -1, "INFO", "Received request getchromName(%s %s)" % (build, acc)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: L.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ @@ -672,9 +672,9 @@ class MutalyzerService(ServiceBase): stats.increment_counter('position-converter/webservice') - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: O.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index 9d85548dba88c350145e943dede74eefb10892e5..0574995d92052ed95cd1fc280e1c36be82878384 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -16,7 +16,7 @@ from flask import (abort, current_app, jsonify, make_response, redirect, import jinja2 from lxml import etree from spyne.server.http import HttpBase -from sqlalchemy import and_, or_ +from sqlalchemy.orm.exc import NoResultFound import mutalyzer from mutalyzer import (describe, File, Retriever, Scheduler, stats, util, @@ -396,10 +396,9 @@ def position_converter(): chromosomal_description = None transcript_descriptions = None - assembly = Assembly.query.filter( - or_(Assembly.name == assembly_name_or_alias, - Assembly.alias == assembly_name_or_alias)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: output.addMessage(__file__, 3, 'ENOASSEMBLY', 'Not a valid assembly.') else: @@ -617,10 +616,9 @@ def reference_loader_submit(): assembly_name_or_alias = request.form.get('assembly_name_or_alias', settings.DEFAULT_ASSEMBLY) - assembly = Assembly.query.filter( - or_(Assembly.name == assembly_name_or_alias, - Assembly.alias == assembly_name_or_alias)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: raise InputException('Invalid assembly') if not chromosome_name.startswith('chr'): @@ -770,9 +768,9 @@ def batch_jobs_submit(): errors.append('Please select a local file for upload.') if job_type == 'position-converter': - if not Assembly.query.filter( - or_(Assembly.name == assembly_name_or_alias, - Assembly.alias == assembly_name_or_alias)).first(): + try: + Assembly.by_name_or_alias(assembly_name_or_alias) + except NoResultFound: errors.append('Not a valid assembly.') argument = assembly_name_or_alias else: @@ -996,9 +994,9 @@ def lovd_variant_info(): % (accession, description, lovd_version, build, request.remote_addr)) - assembly = Assembly.query.filter(or_(Assembly.name == build, - Assembly.alias == build)).first() - if not assembly: + try: + assembly = Assembly.by_name_or_alias(build) + except NoResultFound: response = make_response('invalid build') response.headers['Content-Type'] = 'text/plain' return response diff --git a/setup.py b/setup.py index 5d132cfade70ce50e404c12d2ad4579dc6ef4922..8e4ce82a1e4896cb6ad50255b247af617be74094 100644 --- a/setup.py +++ b/setup.py @@ -50,10 +50,8 @@ setup( include_package_data=True, entry_points = {'console_scripts': [ 'mutalyzer = mutalyzer.entrypoints.mutalyzer:main', + 'mutalyzer-admin = mutalyzer.entrypoints.admin:main', 'mutalyzer-batch-processor = mutalyzer.entrypoints.batch_processor:main', - 'mutalyzer-cache-sync = mutalyzer.entrypoints.cache_sync:main', - 'mutalyzer-mapping-import = mutalyzer.entrypoints.mapping_import:main', - 'mutalyzer-mapping-update = mutalyzer.entrypoints.mapping_update:main', 'mutalyzer-service-json = mutalyzer.entrypoints.service_json:main', 'mutalyzer-service-soap = mutalyzer.entrypoints.service_soap:main', 'mutalyzer-website = mutalyzer.entrypoints.website:main']},