From 5e0d444a7331971ea89ec6daa2e7030b22e6a115 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Mon, 20 Jul 2015 16:55:22 +0200 Subject: [PATCH] Fix transcript mappings containing no exons For transcripts without any UTR and CDS entries in the NCBI Mapview file (seems to happen for predicted genes), we generate one exon spanning the entire transcript. --- ...086dd_fix_zero_exon_transcript_mappings.py | 36 +++++++++++++++++++ mutalyzer/mapping.py | 9 +++++ 2 files changed, 45 insertions(+) create mode 100644 migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py diff --git a/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py new file mode 100644 index 00000000..03cc4813 --- /dev/null +++ b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py @@ -0,0 +1,36 @@ +"""Fix zero-exon transcript mappings + +Revision ID: 4bafcc5086dd +Revises: 2e062969eb54 +Create Date: 2015-07-20 16:16:01.602964 + +""" + +from __future__ import unicode_literals + +# revision identifiers, used by Alembic. +revision = '4bafcc5086dd' +down_revision = u'2e062969eb54' + +from alembic import op +from sqlalchemy import sql +import sqlalchemy as sa + + +def upgrade(): + transcript_mappings = sql.table('transcript_mappings', + sql.column('start', sa.Integer()), + sql.column('stop', sa.Integer()), + sql.column('exon_starts', sa.Text()), + sql.column('exon_stops', sa.Text())) + # https://alembic.readthedocs.org/en/latest/ops.html#alembic.operations.Operations.execute + op.execute(transcript_mappings + .update() + .where(transcript_mappings.c.exon_starts == op.inline_literal('')) + .values({'exon_starts': transcript_mappings.c.start, + 'exon_stops': transcript_mappings.c.stop})) + + +def downgrade(): + # We cannot reliably downgrade this migration. + pass diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py index ba4d1110..47fe4a31 100644 --- a/mutalyzer/mapping.py +++ b/mutalyzer/mapping.py @@ -920,6 +920,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label): Our strategy is too sort by gene and chromosome and process the file grouped by these two fields. + + For transcripts without any UTR and CDS entries (seems to happen for + predicted genes), we generate one exon spanning the entire transcript. """ columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation', @@ -999,6 +1002,12 @@ def import_from_mapview_file(assembly, mapview_file, group_label): else: cds = None + # If no exons are annotated, we create one spanning the entire + # transcript. + if not exon_starts: + exon_starts = [start] + exon_stops = [stop] + yield TranscriptMapping.create_or_update( chromosome, 'refseq', accession, gene, orientation, start, stop, exon_starts, exon_stops, 'ncbi', cds=cds, -- GitLab