diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index b0b9c59477fd5463f89b7820db4a6b59d67eda9c..37b5f7749957771a95fc7fce4d0e1bbba891d7f4 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -420,8 +420,10 @@ class GenBankRetriever(Retriever): as filename. :arg unicode accno: The accession number of the chromosome. - :arg int start: Start position of the slice. - :arg int stop: End position of the slice. + :arg int start: Start position of the slice (one-based, inclusive, in + reference orientation). + :arg int stop: End position of the slice (one-based, inclusive, in + reference orientation). :arg int orientation: Orientation of the slice: - 1 ; Forward. - 2 ; Reverse complement. @@ -429,11 +431,18 @@ class GenBankRetriever(Retriever): :returns unicode: An UD number. """ # Not a valid slice. - if start >= stop: + if start > stop: + self._output.addMessage(__file__, 4, 'ERETR', + 'Could not retrieve slice for start ' + 'position greater than stop position.') return None # The slice can not be too big. - if stop - start > settings.MAX_FILE_SIZE: + if stop - start + 1 > settings.MAX_FILE_SIZE: + self._output.addMessage(__file__, 4, 'ERETR', + 'Could not retrieve slice (request ' + 'exceeds maximum of %d bases)' % + settings.MAX_FILE_SIZE) return None slice_orientation = ['forward', 'reverse'][orientation - 1] @@ -452,6 +461,8 @@ class GenBankRetriever(Retriever): # It's not present, so download it. try: + # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and + # in reference orientation. handle = Entrez.efetch( db='nuccore', rettype='gb', retmode='text', id=accno, seq_start=start, seq_stop=stop, strand=orientation) @@ -583,6 +594,7 @@ class GenBankRetriever(Retriever): gene)) return None + # Positions are zero-based, inclusive and in gene orientation. chr_acc_ver = unicode(document['GenomicInfo'][0]['ChrAccVer']) chr_start = int(document['GenomicInfo'][0]['ChrStart']) chr_stop = int(document['GenomicInfo'][0]['ChrStop']) @@ -606,6 +618,26 @@ class GenBankRetriever(Retriever): __file__, 4, 'ENOGENE', 'Gene {} not found.'.format(gene)) return None + # If we take `start` and `stop` to be the one-based, inclusive, in + # reference orientation coordinates of the gene, the code below yields + # the following call to retrieveslice: + # + # Forward: + # slice_start = start - upstream + # slice_stop = stop + downstream + 1 + # + # Reverse: + # slice_start = start - downstream - 1 + # slice_stop = stop + upstream + # + # Since retrieveslice expects one-based, inclusive, in reference + # orientation coordinates, this code effectively slices one downstream + # base extra. Yes, that's a bug. + # + # If we correct this, we should be careful not to invalidate the + # entire existing cache of sliced reference files by generating new + # slices with a one base difference. + # Figure out the orientation of the gene. orientation = 1 if chr_start > chr_stop: # Swap start and stop. diff --git a/mutalyzer/db/models.py b/mutalyzer/db/models.py index 309fbc554bb84b9f82610d3a795647ce3c4f9cea..87a87c88d31faa169a5a2d45df9e6bc8d52970a4 100644 --- a/mutalyzer/db/models.py +++ b/mutalyzer/db/models.py @@ -172,11 +172,11 @@ class Reference(db.Base): slice_accession = Column(String(20)) #: The start position on the accession number from which we took a slice, - #: if available. + #: if available (one-based, inclusive, in reference orientation). slice_start = Column(Integer) #: The stop position on the accession number from which we took a slice, - #: if available. + #: if available (one-based, inclusive, in reference orientation). slice_stop = Column(Integer) #: The orientation on the accession number from which we took a slice, if @@ -381,22 +381,28 @@ class TranscriptMapping(db.Base): orientation = Column(Enum('forward', 'reverse', name='orientation'), nullable=False) - #: The start position of the transcript on the chromosome. + #: The start position of the transcript on the chromosome (one-based, + #: inclusive, in chromosomal orientation). start = Column(Integer, nullable=False) - #: The stop position of the transcript on the chromosome. + #: The stop position of the transcript on the chromosome (one-based, + #: inclusive, in chromosomal orientation). stop = Column(Integer, nullable=False) - #: The CDS start position of the transcript on the chromosome. + #: The CDS start position of the transcript on the chromosome (one-based, + #: inclusive, in chromosomal orientation). cds_start = Column(Integer) - #: The CDS stop position of the transcript on the chromosome. + #: The CDS stop position of the transcript on the chromosome (one-based, + #: inclusive). cds_stop = Column(Integer) - #: The exon start positions of the transcript on the chromosome. + #: The exon start positions of the transcript on the chromosome + #: (one-based, inclusive, in chromosomal orientation). exon_starts = Column(Positions, nullable=False) - #: The exon start positions of the transcript on the chromosome. + #: The exon start positions of the transcript on the chromosome + #: (one-based, inclusive, in chromosomal orientation). exon_stops = Column(Positions, nullable=False) #: If `False`, variant descriptions can use just the accession number diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index 47fe4a310b9d5edf896e9ea080b051aa10f778e6..df2fb1dfc630bd02e6a132e94aeb29e6d7871531 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -816,6 +816,9 @@ def import_from_ucsc_by_gene(assembly, gene): result = cursor.fetchall() cursor.close() + # All ranges in the UCSC tables are zero-based and open-ended. We convert + # this to one-based, inclusive for our database. + for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, geneName, chrom, strand, protAcc) in result: chromosome = assembly.chromosomes.filter_by(name=chrom).one() @@ -923,6 +926,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label): For transcripts without any UTR and CDS entries (seems to happen for predicted genes), we generate one exon spanning the entire transcript. + + All positions are one-based, inclusive, and that is what we also use in + our database. """ columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', diff --git a/mutalyzer/services/rpc.py b/mutalyzer/services/rpc.py index fde784055806dbce4c3f26016e25f0306c46a4ef..5326f2ff499cfa9a0c2303ed86e7d04642086648 100644 --- a/mutalyzer/services/rpc.py +++ b/mutalyzer/services/rpc.py @@ -198,7 +198,7 @@ class MutalyzerService(ServiceBase): @type build: string @arg chrom: A chromosome encoded as "chr1", ..., "chrY". @type chrom: string - @arg pos: A position on the chromosome. + @arg pos: A position on the chromosome (one-based). @type pos: int @kwarg versions: If set to True, also include transcript versions. @type versions: bool @@ -276,6 +276,8 @@ class MutalyzerService(ServiceBase): """ Get all the transcripts that overlap with a range on a chromosome. + The range should be provided as one-based, inclusive positions. + @arg build: The genome build (hg19, hg18, mm10). @type build: string @arg chrom: A chromosome encoded as "chr1", ..., "chrY". @@ -298,6 +300,13 @@ class MutalyzerService(ServiceBase): "Received request getTranscriptsRange(%s %s %s %s %s)" % (build, chrom, pos1, pos2, method)) + if pos1 > pos2: + L.addMessage(__file__, 4, 'EARG', + 'Invalid range: %d-%d' % (pos1, pos2)) + raise Fault('EARG', + 'Invalid range (%d-%d) with start position greater ' + 'than stop position.' % (pos1, pos2)) + try: assembly = Assembly.by_name_or_alias(build) except NoResultFound: @@ -337,6 +346,8 @@ class MutalyzerService(ServiceBase): Get all the transcripts and their info that overlap with a range on a chromosome. + The range should be provided as one-based, inclusive positions. + @arg build: The genome build (hg19, hg18, mm10). @type build: string @arg chrom: A chromosome encoded as "chr1", ..., "chrY". @@ -359,16 +370,25 @@ class MutalyzerService(ServiceBase): - stop - cds_start - cds_stop + All returned ranges are one-based, inclusive, and in gene + orientation. """ output = Output(__file__) output.addMessage(__file__, -1, 'INFO', 'Received request ' \ 'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2, method)) + if pos1 > pos2: + output.addMessage(__file__, 4, 'EARG', + 'Invalid range: %d-%d' % (pos1, pos2)) + raise Fault('EARG', + 'Invalid range (%d-%d) with start position greater ' + 'than stop position.' % (pos1, pos2)) + try: assembly = Assembly.by_name_or_alias(build) except NoResultFound: - L.addMessage(__file__, 4, "EARG", "EARG %s" % build) + output.addMessage(__file__, 4, "EARG", "EARG %s" % build) raise Fault("EARG", "The build argument (%s) was not a valid " \ "build name." % build) @@ -376,7 +396,7 @@ class MutalyzerService(ServiceBase): try: chromosome = assembly.chromosomes.filter_by(name=chrom).one() except NoResultFound: - L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) + output.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) raise Fault("EARG", "The chrom argument (%s) was not a valid " \ "chromosome name." % chrom) @@ -1123,8 +1143,23 @@ class MutalyzerService(ServiceBase): def sliceChromosomeByGene(geneSymbol, organism, upStream, downStream) : """ - Todo: documentation, error handling, argument checking, tests. + Retrieve part of the reference genome for a (HGNC) gene symbol. + + @arg geneSymbol: Gene symbol. + @type geneSymbol: string + @arg organism: Organism name without spaces. + @type organism: string + @arg upStream: Number of 5' flanking bases to include. + @type upStream: integer + @arg downStream: Number of 3' flanking bases to include. + @type upStream: integer + + This uses the NCBI Entrez search engine and is therefore based on the + current Entrez assembly for the given organism. + + @return: UD accession number for created slice. """ + # Todo: error handling, argument checking, tests. O = Output(__file__) retriever = Retriever.GenBankRetriever(O) @@ -1148,12 +1183,21 @@ class MutalyzerService(ServiceBase): Mandatory.Integer, _returns=Mandatory.Unicode) def sliceChromosome(chromAccNo, start, end, orientation) : """ - Todo: documentation, error handling, argument checking, tests. - + Retrieve a range of a chromosome by accession number. + + @arg chromAccNo: Chromosome or contig by accession number. + @type chromAccNo: string + @arg start: Start position (one-based, inclusive, in reference + orientation). + @type start: integer + @arg stop: Stop position (one-based, inclusive, in reference + orientation). + @type start: integer @arg orientation: Orientation of the slice. 1 for forward, 2 for - reverse complement. + reverse complement. @type orientation: integer """ + # Todo: error handling, argument checking, tests. O = Output(__file__) retriever = Retriever.GenBankRetriever(O) @@ -1351,10 +1395,12 @@ class MutalyzerService(ServiceBase): @return: Object with the following fields: - gene: Gene symbol. - - start: Gene start position. If multiple transcripts for the gene - are known, this contains the lowest start position. - - stop: Gene stop position. If multiple transcripts for the gene are - known, this contains the highest stop position. + - start: Gene start position (one-based, inclusive, in chromosomal + orientation). If multiple transcripts for the gene are known, + this contains the lowest start position. + - stop: Gene stop position (one-based, inclusive, in chromosomal + orientation). If multiple transcripts for the gene are known, + this contains the highest stop position. - orientation: Gene orientation, either 'forward' or 'reverse'. - chromosome_name: Gene chromosome by name (e.g., 'chrX'). - chromosome_accession: Gene chromosome by accession (e.g., diff --git a/mutalyzer/website/templates/reference-loader.html b/mutalyzer/website/templates/reference-loader.html index d4b4c755adb244875559fc9c38591811f79d4d74..411e034c0d4c81ace76900ba157dfcfaa9df2b2a 100644 --- a/mutalyzer/website/templates/reference-loader.html +++ b/mutalyzer/website/templates/reference-loader.html @@ -13,6 +13,10 @@ sequence when no appropriate RefSeq, GenBank or LRG file is available. Please select one of the options below to upload or retrieve your reference sequence (maximum size is {{ max_file_size }} megabytes). </p> +<p> +<strong>Note:</strong> All ranges are assumed to be one-based, inclusive, +and in reference orientation. +</p> <form name="invoer" enctype="multipart/form-data" action="{{ url_for('.reference_loader') }}" method="post" id="invoer"> <div class="row"> diff --git a/mutalyzer/website/views.py b/mutalyzer/website/views.py index db179140b656a754a65a36fc55ba574c4d8d209f..0b8ce1b14b7ad80fc97254eebcc8bf12eb659adc 100644 --- a/mutalyzer/website/views.py +++ b/mutalyzer/website/views.py @@ -558,8 +558,10 @@ def reference_loader_submit(): Retrieve a range of a chromosome by accession number. - `accession`: Chromosome Accession Number. - - `accession_start`: Start position. - - `accession_stop`: Stop position. + - `accession_start`: Start position (one-based, inclusive, in reference + orientation). + - `accession_stop`: Stop position (one-based, inclusive, in reference + orientation). - `accession_orientation`: Orientation. `method=slice_chromosome_method` @@ -567,8 +569,10 @@ def reference_loader_submit(): - `assembly_name_or_alias`: Genome assembly by name or by alias. - `chromosome`: Chromosome name. - - `chromosome_start`: Start position. - - `chromosome_stop`: Stop position. + - `chromosome_start`: Start position (one-based, inclusive, in reference + orientation). + - `chromosome_stop`: Stop position (one-based, inclusive, in reference + orientation). - `chromosome_orientation`: Orientation. """ method = request.form.get('method')