From e1c4bb414e7cd4d8c4261ef7e07e43b355ce3c92 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Thu, 24 Sep 2015 11:59:53 +0200 Subject: [PATCH] Fix off-by-one in slicing chromosome by gene name The original code would always add one extra downstream base to the slice (for genes on the forward and reverse strand). Correcting this implies that cached slices will not be used by new queries requesting the exact same gene and number of upstream and downstream bases, effectively invalidating them. However, most existing cache entries affected by this bug will be on GRCh37/hg19, whereas current slices by gene name will yield GRCh38/hg38 slices. The only cache issue I see is trying to use existing cache entries created by slicing by gene name, by re-slicing by explicit coordinates instead of gene name. To get the old cache entry one would have to add an extra downstream base. --- mutalyzer/Retriever.py | 50 +++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/mutalyzer/Retriever.py b/mutalyzer/Retriever.py index 37b5f774..85780b59 100644 --- a/mutalyzer/Retriever.py +++ b/mutalyzer/Retriever.py @@ -505,7 +505,7 @@ class GenBankRetriever(Retriever): if self.write(raw_data, reference.accession, 0): return reference.accession - def retrievegene(self, gene, organism, upstream, downstream): + def retrievegene(self, gene, organism, upstream=0, downstream=0): """ Query the NCBI for the chromosomal location of a gene and make a slice if the gene can be found. @@ -598,6 +598,23 @@ class GenBankRetriever(Retriever): chr_acc_ver = unicode(document['GenomicInfo'][0]['ChrAccVer']) chr_start = int(document['GenomicInfo'][0]['ChrStart']) chr_stop = int(document['GenomicInfo'][0]['ChrStop']) + + # Convert to one-based, inclusive, reference orientation. We + # also add the flanking regions. + chr_start += 1 + chr_stop += 1 + if chr_start <= chr_stop: + # Gene on forward strand. + orientation = 1 + chr_start -= upstream + chr_stop += downstream + else: + # Gene on reverse strand. + orientation = 2 + chr_start, chr_stop = chr_stop, chr_start + chr_start -= downstream + chr_stop += upstream + break # Collect official symbols that has this gene as alias in case we @@ -618,37 +635,6 @@ class GenBankRetriever(Retriever): __file__, 4, 'ENOGENE', 'Gene {} not found.'.format(gene)) return None - # If we take `start` and `stop` to be the one-based, inclusive, in - # reference orientation coordinates of the gene, the code below yields - # the following call to retrieveslice: - # - # Forward: - # slice_start = start - upstream - # slice_stop = stop + downstream + 1 - # - # Reverse: - # slice_start = start - downstream - 1 - # slice_stop = stop + upstream - # - # Since retrieveslice expects one-based, inclusive, in reference - # orientation coordinates, this code effectively slices one downstream - # base extra. Yes, that's a bug. - # - # If we correct this, we should be careful not to invalidate the - # entire existing cache of sliced reference files by generating new - # slices with a one base difference. - - # Figure out the orientation of the gene. - orientation = 1 - if chr_start > chr_stop: # Swap start and stop. - orientation = 2 - temp = chr_start - chr_start = chr_stop - downstream # Also take care of the flanking - chr_stop = temp + upstream + 1 # sequences. - else: - chr_start -= upstream - 1 - chr_stop += downstream + 2 - # And retrieve the slice. return self.retrieveslice( chr_acc_ver, chr_start, chr_stop, orientation) -- GitLab