Skip to content
Snippets Groups Projects
Commit 91306c15 authored by Vermaat's avatar Vermaat
Browse files

Document scheme used for all positions and ranges

parent 37457325
No related branches found
No related tags found
No related merge requests found
...@@ -420,8 +420,10 @@ class GenBankRetriever(Retriever): ...@@ -420,8 +420,10 @@ class GenBankRetriever(Retriever):
as filename. as filename.
:arg unicode accno: The accession number of the chromosome. :arg unicode accno: The accession number of the chromosome.
:arg int start: Start position of the slice. :arg int start: Start position of the slice (one-based, inclusive, in
:arg int stop: End position of the slice. reference orientation).
:arg int stop: End position of the slice (one-based, inclusive, in
reference orientation).
:arg int orientation: Orientation of the slice: :arg int orientation: Orientation of the slice:
- 1 ; Forward. - 1 ; Forward.
- 2 ; Reverse complement. - 2 ; Reverse complement.
...@@ -429,11 +431,18 @@ class GenBankRetriever(Retriever): ...@@ -429,11 +431,18 @@ class GenBankRetriever(Retriever):
:returns unicode: An UD number. :returns unicode: An UD number.
""" """
# Not a valid slice. # Not a valid slice.
if start >= stop: if start > stop:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve slice for start '
'position greater than stop position.')
return None return None
# The slice can not be too big. # The slice can not be too big.
if stop - start > settings.MAX_FILE_SIZE: if stop - start + 1 > settings.MAX_FILE_SIZE:
self._output.addMessage(__file__, 4, 'ERETR',
'Could not retrieve slice (request '
'exceeds maximum of %d bases)' %
settings.MAX_FILE_SIZE)
return None return None
slice_orientation = ['forward', 'reverse'][orientation - 1] slice_orientation = ['forward', 'reverse'][orientation - 1]
...@@ -452,6 +461,8 @@ class GenBankRetriever(Retriever): ...@@ -452,6 +461,8 @@ class GenBankRetriever(Retriever):
# It's not present, so download it. # It's not present, so download it.
try: try:
# EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
# in reference orientation.
handle = Entrez.efetch( handle = Entrez.efetch(
db='nuccore', rettype='gb', retmode='text', id=accno, db='nuccore', rettype='gb', retmode='text', id=accno,
seq_start=start, seq_stop=stop, strand=orientation) seq_start=start, seq_stop=stop, strand=orientation)
...@@ -583,6 +594,7 @@ class GenBankRetriever(Retriever): ...@@ -583,6 +594,7 @@ class GenBankRetriever(Retriever):
gene)) gene))
return None return None
# Positions are zero-based, inclusive and in gene orientation.
chr_acc_ver = unicode(document['GenomicInfo'][0]['ChrAccVer']) chr_acc_ver = unicode(document['GenomicInfo'][0]['ChrAccVer'])
chr_start = int(document['GenomicInfo'][0]['ChrStart']) chr_start = int(document['GenomicInfo'][0]['ChrStart'])
chr_stop = int(document['GenomicInfo'][0]['ChrStop']) chr_stop = int(document['GenomicInfo'][0]['ChrStop'])
...@@ -606,6 +618,26 @@ class GenBankRetriever(Retriever): ...@@ -606,6 +618,26 @@ class GenBankRetriever(Retriever):
__file__, 4, 'ENOGENE', 'Gene {} not found.'.format(gene)) __file__, 4, 'ENOGENE', 'Gene {} not found.'.format(gene))
return None return None
# If we take `start` and `stop` to be the one-based, inclusive, in
# reference orientation coordinates of the gene, the code below yields
# the following call to retrieveslice:
#
# Forward:
# slice_start = start - upstream
# slice_stop = stop + downstream + 1
#
# Reverse:
# slice_start = start - downstream - 1
# slice_stop = stop + upstream
#
# Since retrieveslice expects one-based, inclusive, in reference
# orientation coordinates, this code effectively slices one downstream
# base extra. Yes, that's a bug.
#
# If we correct this, we should be careful not to invalidate the
# entire existing cache of sliced reference files by generating new
# slices with a one base difference.
# Figure out the orientation of the gene. # Figure out the orientation of the gene.
orientation = 1 orientation = 1
if chr_start > chr_stop: # Swap start and stop. if chr_start > chr_stop: # Swap start and stop.
......
...@@ -172,11 +172,11 @@ class Reference(db.Base): ...@@ -172,11 +172,11 @@ class Reference(db.Base):
slice_accession = Column(String(20)) slice_accession = Column(String(20))
#: The start position on the accession number from which we took a slice, #: The start position on the accession number from which we took a slice,
#: if available. #: if available (one-based, inclusive, in reference orientation).
slice_start = Column(Integer) slice_start = Column(Integer)
#: The stop position on the accession number from which we took a slice, #: The stop position on the accession number from which we took a slice,
#: if available. #: if available (one-based, inclusive, in reference orientation).
slice_stop = Column(Integer) slice_stop = Column(Integer)
#: The orientation on the accession number from which we took a slice, if #: The orientation on the accession number from which we took a slice, if
...@@ -381,22 +381,28 @@ class TranscriptMapping(db.Base): ...@@ -381,22 +381,28 @@ class TranscriptMapping(db.Base):
orientation = Column(Enum('forward', 'reverse', name='orientation'), orientation = Column(Enum('forward', 'reverse', name='orientation'),
nullable=False) nullable=False)
#: The start position of the transcript on the chromosome. #: The start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
start = Column(Integer, nullable=False) start = Column(Integer, nullable=False)
#: The stop position of the transcript on the chromosome. #: The stop position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
stop = Column(Integer, nullable=False) stop = Column(Integer, nullable=False)
#: The CDS start position of the transcript on the chromosome. #: The CDS start position of the transcript on the chromosome (one-based,
#: inclusive, in chromosomal orientation).
cds_start = Column(Integer) cds_start = Column(Integer)
#: The CDS stop position of the transcript on the chromosome. #: The CDS stop position of the transcript on the chromosome (one-based,
#: inclusive).
cds_stop = Column(Integer) cds_stop = Column(Integer)
#: The exon start positions of the transcript on the chromosome. #: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_starts = Column(Positions, nullable=False) exon_starts = Column(Positions, nullable=False)
#: The exon start positions of the transcript on the chromosome. #: The exon start positions of the transcript on the chromosome
#: (one-based, inclusive, in chromosomal orientation).
exon_stops = Column(Positions, nullable=False) exon_stops = Column(Positions, nullable=False)
#: If `False`, variant descriptions can use just the accession number #: If `False`, variant descriptions can use just the accession number
......
...@@ -816,6 +816,9 @@ def import_from_ucsc_by_gene(assembly, gene): ...@@ -816,6 +816,9 @@ def import_from_ucsc_by_gene(assembly, gene):
result = cursor.fetchall() result = cursor.fetchall()
cursor.close() cursor.close()
# All ranges in the UCSC tables are zero-based and open-ended. We convert
# this to one-based, inclusive for our database.
for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds, for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
geneName, chrom, strand, protAcc) in result: geneName, chrom, strand, protAcc) in result:
chromosome = assembly.chromosomes.filter_by(name=chrom).one() chromosome = assembly.chromosomes.filter_by(name=chrom).one()
...@@ -923,6 +926,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label): ...@@ -923,6 +926,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
For transcripts without any UTR and CDS entries (seems to happen for For transcripts without any UTR and CDS entries (seems to happen for
predicted genes), we generate one exon spanning the entire transcript. predicted genes), we generate one exon spanning the entire transcript.
All positions are one-based, inclusive, and that is what we also use in
our database.
""" """
columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
......
...@@ -198,7 +198,7 @@ class MutalyzerService(ServiceBase): ...@@ -198,7 +198,7 @@ class MutalyzerService(ServiceBase):
@type build: string @type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY". @arg chrom: A chromosome encoded as "chr1", ..., "chrY".
@type chrom: string @type chrom: string
@arg pos: A position on the chromosome. @arg pos: A position on the chromosome (one-based).
@type pos: int @type pos: int
@kwarg versions: If set to True, also include transcript versions. @kwarg versions: If set to True, also include transcript versions.
@type versions: bool @type versions: bool
...@@ -276,6 +276,8 @@ class MutalyzerService(ServiceBase): ...@@ -276,6 +276,8 @@ class MutalyzerService(ServiceBase):
""" """
Get all the transcripts that overlap with a range on a chromosome. Get all the transcripts that overlap with a range on a chromosome.
The range should be provided as one-based, inclusive positions.
@arg build: The genome build (hg19, hg18, mm10). @arg build: The genome build (hg19, hg18, mm10).
@type build: string @type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY". @arg chrom: A chromosome encoded as "chr1", ..., "chrY".
...@@ -298,6 +300,13 @@ class MutalyzerService(ServiceBase): ...@@ -298,6 +300,13 @@ class MutalyzerService(ServiceBase):
"Received request getTranscriptsRange(%s %s %s %s %s)" % (build, "Received request getTranscriptsRange(%s %s %s %s %s)" % (build,
chrom, pos1, pos2, method)) chrom, pos1, pos2, method))
if pos1 > pos2:
L.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
try: try:
assembly = Assembly.by_name_or_alias(build) assembly = Assembly.by_name_or_alias(build)
except NoResultFound: except NoResultFound:
...@@ -337,6 +346,8 @@ class MutalyzerService(ServiceBase): ...@@ -337,6 +346,8 @@ class MutalyzerService(ServiceBase):
Get all the transcripts and their info that overlap with a range on a Get all the transcripts and their info that overlap with a range on a
chromosome. chromosome.
The range should be provided as one-based, inclusive positions.
@arg build: The genome build (hg19, hg18, mm10). @arg build: The genome build (hg19, hg18, mm10).
@type build: string @type build: string
@arg chrom: A chromosome encoded as "chr1", ..., "chrY". @arg chrom: A chromosome encoded as "chr1", ..., "chrY".
...@@ -359,16 +370,25 @@ class MutalyzerService(ServiceBase): ...@@ -359,16 +370,25 @@ class MutalyzerService(ServiceBase):
- stop - stop
- cds_start - cds_start
- cds_stop - cds_stop
All returned ranges are one-based, inclusive, and in gene
orientation.
""" """
output = Output(__file__) output = Output(__file__)
output.addMessage(__file__, -1, 'INFO', 'Received request ' \ output.addMessage(__file__, -1, 'INFO', 'Received request ' \
'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2, 'getTranscriptsMapping(%s %s %s %s %s)' % (build, chrom, pos1, pos2,
method)) method))
if pos1 > pos2:
output.addMessage(__file__, 4, 'EARG',
'Invalid range: %d-%d' % (pos1, pos2))
raise Fault('EARG',
'Invalid range (%d-%d) with start position greater '
'than stop position.' % (pos1, pos2))
try: try:
assembly = Assembly.by_name_or_alias(build) assembly = Assembly.by_name_or_alias(build)
except NoResultFound: except NoResultFound:
L.addMessage(__file__, 4, "EARG", "EARG %s" % build) output.addMessage(__file__, 4, "EARG", "EARG %s" % build)
raise Fault("EARG", raise Fault("EARG",
"The build argument (%s) was not a valid " \ "The build argument (%s) was not a valid " \
"build name." % build) "build name." % build)
...@@ -376,7 +396,7 @@ class MutalyzerService(ServiceBase): ...@@ -376,7 +396,7 @@ class MutalyzerService(ServiceBase):
try: try:
chromosome = assembly.chromosomes.filter_by(name=chrom).one() chromosome = assembly.chromosomes.filter_by(name=chrom).one()
except NoResultFound: except NoResultFound:
L.addMessage(__file__, 4, "EARG", "EARG %s" % chrom) output.addMessage(__file__, 4, "EARG", "EARG %s" % chrom)
raise Fault("EARG", "The chrom argument (%s) was not a valid " \ raise Fault("EARG", "The chrom argument (%s) was not a valid " \
"chromosome name." % chrom) "chromosome name." % chrom)
...@@ -1123,8 +1143,23 @@ class MutalyzerService(ServiceBase): ...@@ -1123,8 +1143,23 @@ class MutalyzerService(ServiceBase):
def sliceChromosomeByGene(geneSymbol, organism, upStream, def sliceChromosomeByGene(geneSymbol, organism, upStream,
downStream) : downStream) :
""" """
Todo: documentation, error handling, argument checking, tests. Retrieve part of the reference genome for a (HGNC) gene symbol.
@arg geneSymbol: Gene symbol.
@type geneSymbol: string
@arg organism: Organism name without spaces.
@type organism: string
@arg upStream: Number of 5' flanking bases to include.
@type upStream: integer
@arg downStream: Number of 3' flanking bases to include.
@type upStream: integer
This uses the NCBI Entrez search engine and is therefore based on the
current Entrez assembly for the given organism.
@return: UD accession number for created slice.
""" """
# Todo: error handling, argument checking, tests.
O = Output(__file__) O = Output(__file__)
retriever = Retriever.GenBankRetriever(O) retriever = Retriever.GenBankRetriever(O)
...@@ -1148,12 +1183,21 @@ class MutalyzerService(ServiceBase): ...@@ -1148,12 +1183,21 @@ class MutalyzerService(ServiceBase):
Mandatory.Integer, _returns=Mandatory.Unicode) Mandatory.Integer, _returns=Mandatory.Unicode)
def sliceChromosome(chromAccNo, start, end, orientation) : def sliceChromosome(chromAccNo, start, end, orientation) :
""" """
Todo: documentation, error handling, argument checking, tests. Retrieve a range of a chromosome by accession number.
@arg chromAccNo: Chromosome or contig by accession number.
@type chromAccNo: string
@arg start: Start position (one-based, inclusive, in reference
orientation).
@type start: integer
@arg stop: Stop position (one-based, inclusive, in reference
orientation).
@type start: integer
@arg orientation: Orientation of the slice. 1 for forward, 2 for @arg orientation: Orientation of the slice. 1 for forward, 2 for
reverse complement. reverse complement.
@type orientation: integer @type orientation: integer
""" """
# Todo: error handling, argument checking, tests.
O = Output(__file__) O = Output(__file__)
retriever = Retriever.GenBankRetriever(O) retriever = Retriever.GenBankRetriever(O)
...@@ -1351,10 +1395,12 @@ class MutalyzerService(ServiceBase): ...@@ -1351,10 +1395,12 @@ class MutalyzerService(ServiceBase):
@return: Object with the following fields: @return: Object with the following fields:
- gene: Gene symbol. - gene: Gene symbol.
- start: Gene start position. If multiple transcripts for the gene - start: Gene start position (one-based, inclusive, in chromosomal
are known, this contains the lowest start position. orientation). If multiple transcripts for the gene are known,
- stop: Gene stop position. If multiple transcripts for the gene are this contains the lowest start position.
known, this contains the highest stop position. - stop: Gene stop position (one-based, inclusive, in chromosomal
orientation). If multiple transcripts for the gene are known,
this contains the highest stop position.
- orientation: Gene orientation, either 'forward' or 'reverse'. - orientation: Gene orientation, either 'forward' or 'reverse'.
- chromosome_name: Gene chromosome by name (e.g., 'chrX'). - chromosome_name: Gene chromosome by name (e.g., 'chrX').
- chromosome_accession: Gene chromosome by accession (e.g., - chromosome_accession: Gene chromosome by accession (e.g.,
......
...@@ -13,6 +13,10 @@ sequence when no appropriate RefSeq, GenBank or LRG file is available. ...@@ -13,6 +13,10 @@ sequence when no appropriate RefSeq, GenBank or LRG file is available.
Please select one of the options below to upload or retrieve your reference Please select one of the options below to upload or retrieve your reference
sequence (maximum size is {{ max_file_size }} megabytes). sequence (maximum size is {{ max_file_size }} megabytes).
</p> </p>
<p>
<strong>Note:</strong> All ranges are assumed to be one-based, inclusive,
and in reference orientation.
</p>
<form name="invoer" enctype="multipart/form-data" action="{{ url_for('.reference_loader') }}" method="post" id="invoer"> <form name="invoer" enctype="multipart/form-data" action="{{ url_for('.reference_loader') }}" method="post" id="invoer">
<div class="row"> <div class="row">
......
...@@ -558,8 +558,10 @@ def reference_loader_submit(): ...@@ -558,8 +558,10 @@ def reference_loader_submit():
Retrieve a range of a chromosome by accession number. Retrieve a range of a chromosome by accession number.
- `accession`: Chromosome Accession Number. - `accession`: Chromosome Accession Number.
- `accession_start`: Start position. - `accession_start`: Start position (one-based, inclusive, in reference
- `accession_stop`: Stop position. orientation).
- `accession_stop`: Stop position (one-based, inclusive, in reference
orientation).
- `accession_orientation`: Orientation. - `accession_orientation`: Orientation.
`method=slice_chromosome_method` `method=slice_chromosome_method`
...@@ -567,8 +569,10 @@ def reference_loader_submit(): ...@@ -567,8 +569,10 @@ def reference_loader_submit():
- `assembly_name_or_alias`: Genome assembly by name or by alias. - `assembly_name_or_alias`: Genome assembly by name or by alias.
- `chromosome`: Chromosome name. - `chromosome`: Chromosome name.
- `chromosome_start`: Start position. - `chromosome_start`: Start position (one-based, inclusive, in reference
- `chromosome_stop`: Stop position. orientation).
- `chromosome_stop`: Stop position (one-based, inclusive, in reference
orientation).
- `chromosome_orientation`: Orientation. - `chromosome_orientation`: Orientation.
""" """
method = request.form.get('method') method = request.form.get('method')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment