diff --git a/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..03cc481324099d02a38c3f8cb26250d7543de203
--- /dev/null
+++ b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
@@ -0,0 +1,44 @@
+"""Fix zero-exon transcript mappings
+
+Revision ID: 4bafcc5086dd
+Revises: 2e062969eb54
+Create Date: 2015-07-20 16:16:01.602964
+
+"""
+
+from __future__ import unicode_literals
+
+# revision identifiers, used by Alembic.
+revision = '4bafcc5086dd'
+down_revision = u'2e062969eb54'
+
+from alembic import op
+from sqlalchemy import sql
+import sqlalchemy as sa
+
+
+def upgrade():
+    """Give transcript mappings without exons a single exon.
+
+    Every row whose ``exon_starts`` is the empty string gets its
+    ``exon_starts`` and ``exon_stops`` set from its own ``start`` and
+    ``stop``, i.e. one exon spanning the entire transcript.
+    """
+    # Ad-hoc table stub listing only the columns this data migration uses.
+    transcript_mappings = sql.table('transcript_mappings',
+                                    sql.column('start', sa.Integer()),
+                                    sql.column('stop', sa.Integer()),
+                                    sql.column('exon_starts', sa.Text()),
+                                    sql.column('exon_stops', sa.Text()))
+    # https://alembic.readthedocs.org/en/latest/ops.html#alembic.operations.Operations.execute
+    op.execute(transcript_mappings
+               .update()
+               .where(transcript_mappings.c.exon_starts == op.inline_literal(''))
+               .values({'exon_starts': transcript_mappings.c.start,
+                        'exon_stops': transcript_mappings.c.stop}))
+
+
+def downgrade():
+    """Do nothing; the upgrade is not reversed."""
+    # We cannot reliably downgrade this migration.
+    pass
diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py
index ba4d11102919c0b384511ca24e61494991fa0c43..47fe4a310b9d5edf896e9ea080b051aa10f778e6 100644
--- a/mutalyzer/mapping.py
+++ b/mutalyzer/mapping.py
@@ -920,6 +920,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
 
     Our strategy is too sort by gene and chromosome and process the file
     grouped by these two fields.
+
+    For transcripts without any UTR and CDS entries (seems to happen for
+    predicted genes), we generate one exon spanning the entire transcript.
     """
     columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
                'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
@@ -999,6 +1002,12 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
         else:
             cds = None
 
+        # If no exons are annotated, we create one spanning the entire
+        # transcript.
+        if not exon_starts:
+            exon_starts = [start]
+            exon_stops = [stop]
+
         yield TranscriptMapping.create_or_update(
             chromosome, 'refseq', accession, gene, orientation, start, stop,
             exon_starts, exon_stops, 'ncbi', cds=cds,