Skip to content
Snippets Groups Projects
Commit 91306c15 authored by Vermaat's avatar Vermaat
Browse files

Document scheme used for all positions and ranges

parent 37457325
No related branches found
No related tags found
No related merge requests found
......@@ -420,8 +420,10 @@ class GenBankRetriever(Retriever):
as filename.
:arg unicode accno: The accession number of the chromosome.
:arg int start: Start position of the slice.
:arg int stop: End position of the slice.
:arg int start: Start position of the slice (one-based, inclusive, in
reference orientation).
:arg int stop: End position of the slice (one-based, inclusive, in
reference orientation).
:arg int orientation: Orientation of the slice:
- 1 ; Forward.
- 2 ; Reverse complement.
......@@ -429,11 +431,18 @@ class GenBankRetriever(Retriever):
:returns unicode: An UD number.
"""
# Not a valid slice.
if start >= stop:
if start > stop:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve slice for start '
'position greater than stop position.')
return None
# The slice can not be too big.
if stop - start > settings.MAX_FILE_SIZE:
if stop - start + 1 > settings.MAX_FILE_SIZE:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve slice (request '
'exceeds maximum of %d bases)' %
settings.MAX_FILE_SIZE)
return None
slice_orientation = ['forward', 'reverse'][orientation - 1]
......@@ -452,6 +461,8 @@ class GenBankRetriever(Retriever):
# It's not present, so download it.
try:
# EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
# in reference orientation.
handle = Entrez.efetch(
db='nuccore', rettype='gb', retmode='text', id=accno,
seq_start=start, seq_stop=stop, strand=orientation)
......@@ -583,6 +594,7 @@ class GenBankRetriever(Retriever):
gene))
return None
# Positions are zero-based, inclusive and in gene orientation.
chr_acc_ver = unicode(document['GenomicInfo'][0]['ChrAccVer'])
chr_start = int(document['GenomicInfo'][0]['ChrStart'])
chr_stop = int(document['GenomicInfo'][0]['ChrStop'])
......@@ -606,6 +618,26 @@ class GenBankRetriever(Retriever):
__file__, 4, 'ENOGENE', 'Gene {} not found.'.format(gene))
return None
# If we take `start` and `stop` to be the one-based, inclusive, in
# reference orientation coordinates of the gene, the code below yields
# the following call to retrieveslice:
#
# Forward:
# slice_start = start - upstream
# slice_stop = stop + downstream + 1
#
# Reverse:
# slice_start = start - downstream - 1
# slice_stop = stop + upstream
#
# Since retrieveslice expects one-based, inclusive, in reference
# orientation coordinates, this code effectively slices one downstream
# base extra. Yes, that's a bug.
#
# If we correct this, we should be careful not to invalidate the
# entire existing cache of sliced reference files by generating new
# slices with a one base difference.
# Figure out the orientation of the gene.
orientation = 1
if chr_start > chr_stop: # Swap start and stop.
......
......@@ -172,11 +172,11 @@ class Reference(db.Base):
slice_accession = Column(String(20))
#: The start position on the accession number from which we took a slice,
#: if available.
#: if available (one-based, inclusive, in reference orientation).
slice_start = Column(Integer)
#: The stop position on the accession number from which we took a slice,
#: if available.
#: if available (one-based, inclusive, in reference orientation).
slice_stop = Column(Integer)
#: The orientation on the accession number from which we took a slice, if
......@@ -381,22 +381,28 @@ class TranscriptMapping(db.Base):
orientation = Column(Enum('forward', 'reverse', name='orientation'),
nullable=False)
#: The start position of the transcript on the chromosome.
#: The start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
start = Column(Integer, nullable=False)
#: The stop position of the transcript on the chromosome.
#: The stop position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
stop = Column(Integer, nullable=False)
#: The CDS start position of the transcript on the chromosome.
#: The CDS start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
cds_start = Column(Integer)
#: The CDS stop position of the transcript on the chromosome.
#: The CDS stop position of the transcript on the chromosome (one-based,
#: inclusive).
cds_stop = Column(Integer)
#: The exon start positions of the transcript on the chromosome.
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_starts = Column(Positions, nullable=False)
#: The exon start positions of the transcript on the chromosome.
#: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_stops = Column(Positions, nullable=False)
#: If `False`, variant descriptions can use just the accession number
......
......@@ -816,6 +816,9 @@ def import_from_ucsc_by_gene(assembly, gene):
result = cursor.fetchall()
cursor.close()
# All ranges in the UCSC tables are zero-based and open-ended. We convert
# this to one-based, inclusive for our database.
for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
geneName, chrom, strand, protAcc) in result:
chromosome = assembly.chromosomes.filter_by(name=chrom).one()
......@@ -923,6 +926,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
For transcripts without any UTR and CDS entries (seems to happen for
predicted genes), we generate one exon spanning the entire transcript.
All positions are one-based, inclusive, and that is what we also use in
our database.
"""
columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
......
......@@ -198,7 +198,7 @@ class MutalyzerService(ServiceBase):
@type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY".
@type chrom: string
@arg pos: A position on the chromosome.
@arg pos: A position on the chromosome (one-based).
@type pos: int
@kwarg versions: If set to True, also include transcript versions.
@type versions: bool
......@@ -276,6 +276,8 @@ class MutalyzerService(ServiceBase):
"""
Get all the transcripts that overlap with a range on a chromosome.
The range should be provided as one-based, inclusive positions.
@arg build: The genome build (hg19, hg18, mm10).
@type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY".
......@@ -298,6 +300,13 @@ class MutalyzerService(ServiceBase):
"Received request getTranscriptsRange(%s %s %s %s %s)" % (build,
chrom, pos1, pos2, method))
if pos1 > pos2:
L.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
try:
assembly = Assembly.by_name_or_alias(build)
except NoResultFound:
......@@ -337,6 +346,8 @@ class MutalyzerService(ServiceBase):
Get all the transcripts and their info that overlap with a range on a
chromosome.
The range should be provided as one-based, inclusive positions.
@arg build: The genome build (hg19, hg18, mm10).
@type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY".
......@@ -359,16 +370,25 @@ class MutalyzerService(ServiceBase):
- stop
- cds_start
- cds_stop
All returned ranges are one-based, inclusive, and in gene
orientation.
"""
output = Output(__file__)
output.addMessage(__file__, -1, 'INFO', 'Received request ' \
'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2,
method))
if pos1 > pos2:
output.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
try:
assembly = Assembly.by_name_or_alias(build)
except NoResultFound:
L.addMessage(__file__, 4, "EARG", "EARG %s" % build)
output.addMessage(__file__, 4, "EARG", "EARG %s" % build)
raise Fault("EARG",
"The build argument (%s) was not a valid " \
"build name." % build)
......@@ -376,7 +396,7 @@ class MutalyzerService(ServiceBase):
try:
chromosome = assembly.chromosomes.filter_by(name=chrom).one()
except NoResultFound:
L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom)
output.addMessage(__file__, 4, "EARG", "EARG %s" % chrom)
raise Fault("EARG", "The chrom argument (%s) was not a valid " \
"chromosome name." % chrom)
......@@ -1123,8 +1143,23 @@ class MutalyzerService(ServiceBase):
def sliceChromosomeByGene(geneSymbol, organism, upStream,
downStream) :
"""
Todo: documentation, error handling, argument checking, tests.
Retrieve part of the reference genome for a (HGNC) gene symbol.
@arg geneSymbol: Gene symbol.
@type geneSymbol: string
@arg organism: Organism name without spaces.
@type organism: string
@arg upStream: Number of 5' flanking bases to include.
@type upStream: integer
@arg downStream: Number of 3' flanking bases to include.
@type upStream: integer
This uses the NCBI Entrez search engine and is therefore based on the
current Entrez assembly for the given organism.
@return: UD accession number for created slice.
"""
# Todo: error handling, argument checking, tests.
O = Output(__file__)
retriever = Retriever.GenBankRetriever(O)
......@@ -1148,12 +1183,21 @@ class MutalyzerService(ServiceBase):
Mandatory.Integer, _returns=Mandatory.Unicode)
def sliceChromosome(chromAccNo, start, end, orientation) :
"""
Todo: documentation, error handling, argument checking, tests.
Retrieve a range of a chromosome by accession number.
@arg chromAccNo: Chromosome or contig by accession number.
@type chromAccNo: string
@arg start: Start position (one-based, inclusive, in reference
orientation).
@type start: integer
@arg stop: Stop position (one-based, inclusive, in reference
orientation).
@type start: integer
@arg orientation: Orientation of the slice. 1 for forward, 2 for
reverse complement.
reverse complement.
@type orientation: integer
"""
# Todo: error handling, argument checking, tests.
O = Output(__file__)
retriever = Retriever.GenBankRetriever(O)
......@@ -1351,10 +1395,12 @@ class MutalyzerService(ServiceBase):
@return: Object with the following fields:
- gene: Gene symbol.
- start: Gene start position. If multiple transcripts for the gene
are known, this contains the lowest start position.
- stop: Gene stop position. If multiple transcripts for the gene are
known, this contains the highest stop position.
- start: Gene start position (one-based, inclusive, in chromosomal
orientation). If multiple transcripts for the gene are known,
this contains the lowest start position.
- stop: Gene stop position (one-based, inclusive, in chromosomal
orientation). If multiple transcripts for the gene are known,
this contains the highest stop position.
- orientation: Gene orientation, either 'forward' or 'reverse'.
- chromosome_name: Gene chromosome by name (e.g., 'chrX').
- chromosome_accession: Gene chromosome by accession (e.g.,
......
......@@ -13,6 +13,10 @@ sequence when no appropriate RefSeq, GenBank or LRG file is available.
Please select one of the options below to upload or retrieve your reference
sequence (maximum size is {{ max_file_size }} megabytes).
</p>
<p>
<strong>Note:</strong> All ranges are assumed to be one-based, inclusive,
and in reference orientation.
</p>
<form name="invoer" enctype="multipart/form-data" action="{{ url_for('.reference_loader') }}" method="post" id="invoer">
<div class="row">
......
......@@ -558,8 +558,10 @@ def reference_loader_submit():
Retrieve a range of a chromosome by accession number.
- `accession`: Chromosome Accession Number.
- `accession_start`: Start position.
- `accession_stop`: Stop position.
- `accession_start`: Start position (one-based, inclusive, in reference
orientation).
- `accession_stop`: Stop position (one-based, inclusive, in reference
orientation).
- `accession_orientation`: Orientation.
`method=slice_chromosome_method`
......@@ -567,8 +569,10 @@ def reference_loader_submit():
- `assembly_name_or_alias`: Genome assembly by name or by alias.
- `chromosome`: Chromosome name.
- `chromosome_start`: Start position.
- `chromosome_stop`: Stop position.
- `chromosome_start`: Start position (one-based, inclusive, in reference
orientation).
- `chromosome_stop`: Stop position (one-based, inclusive, in reference
orientation).
- `chromosome_orientation`: Orientation.
"""
method = request.form.get('method')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment