Skip to content
Snippets Groups Projects
Commit 1376c524 authored by Vermaat's avatar Vermaat
Browse files

Fix CDS start and stop in mapping database

During import of the NCBI transcript mappings, CDS start or stop positions
were not picked up for some transcripts (where these are on exon boundaries).

Bug reported by S Venkata Suresh Kumar.


git-svn-id: https://humgenprojects.lumc.nl/svn/mutalyzer/trunk@490 eb6bd6ab-9ccd-42b9-aceb-e2899b4a52f1
parent 160b3a06
No related branches found
No related tags found
No related merge requests found
...@@ -2,4 +2,4 @@ ...@@ -2,4 +2,4 @@
#25 3 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg18 /tmp/seq_gene.md reference #25 3 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.36.3/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg18 /tmp/seq_gene.md reference
#25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.2/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' #25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.2/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly'
##25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p2-Primary Assembly' ##25 4 * * 7 www-data wget "ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz" -O - | zcat > /tmp/seq_gene.md; <MUTALYZER_BIN_MAPPING_UPDATE> hg19 /tmp/seq_gene.md 'GRCh37.p5-Primary Assembly'
...@@ -644,7 +644,7 @@ class Mapping(Db) : ...@@ -644,7 +644,7 @@ class Mapping(Db) :
T.orientation as orientation, T.orientation as orientation,
MIN(T.start) as start, MIN(T.start) as start,
MAX(T.stop) as stop, MAX(T.stop) as stop,
MAX(E.cds_start) as cds_start, REPLACE(MIN(COALESCE(E.cds_start, 1000000000)), 1000000000, NULL) as cds_start,
MAX(E.cds_stop) as cds_stop, MAX(E.cds_stop) as cds_stop,
GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts, GROUP_CONCAT(DISTINCT E.start ORDER BY E.start ASC) as exon_starts,
GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops, GROUP_CONCAT(DISTINCT E.stop ORDER BY E.stop ASC) as exon_stops,
......
...@@ -833,6 +833,8 @@ class NCBIUpdater(Updater): ...@@ -833,6 +833,8 @@ class NCBIUpdater(Updater):
entry['start'] = int(entry['start']) entry['start'] = int(entry['start'])
entry['stop'] = int(entry['stop']) entry['stop'] = int(entry['stop'])
entry['protein'] = entry['feature_name'] if cds else None entry['protein'] = entry['feature_name'] if cds else None
entry['cds_start'] = entry['start'] if cds else None
entry['cds_stop'] = entry['stop'] if cds else None
entry['cds'] = cds entry['cds'] = cds
self._import_exon_backlog(entry['start'] - 1) self._import_exon_backlog(entry['start'] - 1)
...@@ -841,12 +843,9 @@ class NCBIUpdater(Updater): ...@@ -841,12 +843,9 @@ class NCBIUpdater(Updater):
previous = self.exon_backlog[entry['transcript']] previous = self.exon_backlog[entry['transcript']]
if previous['cds'] != entry['cds'] \ if previous['cds'] != entry['cds'] \
and previous['stop'] == entry['start'] - 1: and previous['stop'] == entry['start'] - 1:
if entry['cds']: if previous['cds']:
entry['cds_start'] = entry['start'] entry['cds_start'] = previous['cds_start']
else: entry['cds_stop'] = previous['cds_stop']
entry['cds_stop'] = previous['stop']
if 'cds_start' in previous:
entry['cds_start'] = previous['cds_start']
entry['protein'] = previous['protein'] entry['protein'] = previous['protein']
entry['start'] = previous['start'] entry['start'] = previous['start']
except KeyError: except KeyError:
...@@ -870,9 +869,7 @@ class NCBIUpdater(Updater): ...@@ -870,9 +869,7 @@ class NCBIUpdater(Updater):
del exon['cds'] del exon['cds']
self.db.ncbi_import_exon( self.db.ncbi_import_exon(
exon['transcript'], exon['chromosome'], exon['start'], exon['stop'], exon['transcript'], exon['chromosome'], exon['start'], exon['stop'],
exon['cds_start'] if 'cds_start' in exon else None, exon['cds_start'], exon['cds_stop'], exon['protein'] or None)
exon['cds_stop'] if 'cds_stop' in exon else None,
exon['protein'] or None)
#_import_exon_backlog #_import_exon_backlog
def _aggregate_mapping(self): def _aggregate_mapping(self):
......
...@@ -60,3 +60,19 @@ class TestConverter(): ...@@ -60,3 +60,19 @@ class TestConverter():
coding = converter.chrom2c('NC_000022.10:g.51016285_51017117del123456789', 'list') coding = converter.chrom2c('NC_000022.10:g.51016285_51017117del123456789', 'list')
assert 'NM_001145134.1:c.-138-u21_60del123456789' in coding assert 'NM_001145134.1:c.-138-u21_60del123456789' in coding
assert 'NR_021492.1:c.1-u5170_1-u4338del123456789' in coding assert 'NR_021492.1:c.1-u5170_1-u4338del123456789' in coding
def test_S_Venkata_Suresh_Kumar(self):
"""
Test for correct mapping information on genes where CDS start or stop
is exactly on the border of an exon.
Bug reported February 24 by S Venkata Suresh Kumartest.
"""
converter = self._converter('hg19')
coding = converter.chrom2c('NC_000001.10:g.115259837_115259837delT', 'list')
assert 'NM_001007553.1:c.3863delA' not in coding
assert 'NM_001007553.2:c.3863delA' not in coding
assert 'NM_001007553.1:c.*953delA' in coding
assert 'NM_001130523.1:c.*953delA' in coding
assert 'NM_001007553.2:c.*953delA' in coding
assert 'NM_001130523.2:c.*953delA' in coding
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment