Skip to content
Snippets Groups Projects
Commit 4e47d2c3 authored by Vermaat's avatar Vermaat
Browse files

Merge pull request #124 from mutalyzer/mapping-import

Speedup NCBI mapview file import
parents aa6b1b71 0149af27
No related branches found
No related tags found
No related merge requests found
...@@ -422,37 +422,30 @@ class TranscriptMapping(db.Base): ...@@ -422,37 +422,30 @@ class TranscriptMapping(db.Base):
source, transcript=1, cds=None, source, transcript=1, cds=None,
select_transcript=False, version=None): select_transcript=False, version=None):
""" """
Returns an existing :class:`TranscriptMapping` instance with the given Returns an :class:`TranscriptMapping` instance with the given values.
values for `chromosome`, `accession`, `version`, `gene`, and If a row with a duplicate key already exists, it is deleted first.
`transcript` if it exists, or a new instance otherwise.
.. note:: Unfortunately, SQLAlchemy does not have `ON DUPLICATE KEY .. note:: Unfortunately, SQLAlchemy does not have `ON DUPLICATE KEY
UPDATE` functionality, which would performance-wise be the best UPDATE` functionality, which would performance-wise be the best
way to update transcript mappings. This class method implements an way to update transcript mappings. Even if it had, PostgreSQL does
alternative albeit at the cost of querying the table for an not. This class method implements an alternative albeit at the
existing entry on each update. cost of querying the table for an existing entry on each update.
From PostgreSQL 9.5 we do have ``INSERT ... ON CONFLICT DO UPDATE``
which we might want to use in the future.
http://stackoverflow.com/questions/17267417/how-do-i-do-an-upsert-merge-insert-on-duplicate-update-in-postgresql
""" """
instance = cls.query.filter_by( # Actually we should do a `lock transcript_mappings in exclusive mode;`
# to prevent concurrent reads to miss this entry.
cls.query.filter_by(
chromosome=chromosome, accession=accession, version=version, chromosome=chromosome, accession=accession, version=version,
gene=gene, transcript=transcript).first() gene=gene, transcript=transcript
if instance is None: ).delete()
instance = cls(chromosome, reference_type, accession, gene, return cls(chromosome, reference_type, accession, gene, orientation,
orientation, start, stop, exon_starts, exon_stops, start, stop, exon_starts, exon_stops, source,
source, transcript=transcript, cds=cds, transcript=transcript, cds=cds,
select_transcript=select_transcript, select_transcript=select_transcript, version=version)
version=version)
else:
instance.reference_type = reference_type
instance.orientation = orientation
instance.start = start
instance.stop = stop
instance.bin = binning.assign_bin(start - 1, stop)
instance.exon_starts = exon_starts
instance.exon_stops = exon_stops
instance.source = source
instance.cds = cds
instance.select_transcript = select_transcript
return instance
@property @property
def coding(self): def coding(self):
......
This diff is collapsed.
...@@ -5,9 +5,13 @@ Tests for the mutalyzer.mapping module. ...@@ -5,9 +5,13 @@ Tests for the mutalyzer.mapping module.
from __future__ import unicode_literals from __future__ import unicode_literals
import codecs
import os
import pytest import pytest
from mutalyzer.mapping import Converter from mutalyzer.db.models import TranscriptMapping
from mutalyzer import mapping
pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings') pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings')
...@@ -15,7 +19,7 @@ pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings') ...@@ -15,7 +19,7 @@ pytestmark = pytest.mark.usefixtures('hg19_transcript_mappings')
@pytest.fixture @pytest.fixture
def converter(output, hg19): def converter(output, hg19):
return Converter(hg19, output) return mapping.Converter(hg19, output)
def test_converter(converter): def test_converter(converter):
...@@ -322,3 +326,54 @@ def test_ins_seq_seq_c2chrom_reverse(converter): ...@@ -322,3 +326,54 @@ def test_ins_seq_seq_c2chrom_reverse(converter):
""" """
genomic = converter.c2chrom('NM_012459.2:c.10_11ins[TTT;ATC]') genomic = converter.c2chrom('NM_012459.2:c.10_11ins[TTT;ATC]')
assert genomic == 'NC_000011.9:g.111957482_111957483ins[GAT;AAA]' assert genomic == 'NC_000011.9:g.111957482_111957483ins[GAT;AAA]'
def test_import_mapview(hg19):
original_count = TranscriptMapping.query.count()
group_label = 'GRCh37.p13-Primary Assembly'
path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data',
'hg19.chr11.111771755-112247252.seq_gene.sorted.md')
mapview = codecs.open(path, encoding='utf-8')
mapview_count = sum(1 for line in mapview
if line.split('\t')[12] == group_label
and line.split('\t')[11] == 'RNA')
mapview.seek(0)
mapping.import_from_mapview_file(hg19, mapview, group_label)
# Two transcripts were already in, the rest is new:
# - NR_028383.1
# - NM_012459.2
assert TranscriptMapping.query.count() == original_count + mapview_count - 2
# No changes here.
unchanged = TranscriptMapping.query.filter_by(accession='NM_012459').one()
assert unchanged.start == 111955524
assert unchanged.stop == 111957522
assert unchanged.exon_starts == [111955524, 111957364]
assert unchanged.exon_stops == [111956186, 111957522]
assert unchanged.cds == (111956019, 111957492)
# We made some artificial changes to the mapview file here.
updated = TranscriptMapping.query.filter_by(accession='NR_028383').one()
assert updated.start == 111955524
assert updated.stop == 111957525
assert updated.exon_starts == [111955524, 111956700, 111957364]
assert updated.exon_stops == [111956180, 111957034, 111957525]
# This is a new entry.
new = TranscriptMapping.query.filter_by(accession='NM_000317').one()
assert new.version == 2
assert new.start == 112097088
assert new.stop == 112104696
assert new.exon_starts == [112097088, 112099317, 112100931, 112101349,
112103886, 112104155]
assert new.exon_stops == [112097249, 112099396, 112100953, 112101405,
112103956, 112104696]
assert new.cds == (112097167, 112104278)
assert new.gene == 'PTS'
assert new.orientation == 'forward'
assert new.reference_type == 'refseq'
assert new.source == 'ncbi'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment