Merge pull request #63 from mutalyzer/mapping-zero-exons

Fix transcript mappings containing no exons

Merge pull request #63 from mutalyzer/mapping-zero-exons
e921f13a · Vermaat · 3dfb33f9 · 5e0d444a · e921f13a · e921f13a
Commit e921f13a authored 9 years ago by Vermaat
--- a/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
+++ b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
+"""Fix zero-exon transcript mappings
+Revision ID: 4bafcc5086dd
+Revises: 2e062969eb54
+Create Date: 2015-07-20 16:16:01.602964
+"""
+from __future__ import unicode_literals
+# revision identifiers, used by Alembic.
+revision = '4bafcc5086dd'
+down_revision = u'2e062969eb54'
+from alembic import op
+from sqlalchemy import sql
+import sqlalchemy as sa
+def upgrade():
+    transcript_mappings = sql.table('transcript_mappings',
+                                    sql.column('start', sa.Integer()),
+                                    sql.column('stop', sa.Integer()),
+                                    sql.column('exon_starts', sa.Text()),
+                                    sql.column('exon_stops', sa.Text()))
+    # https://alembic.readthedocs.org/en/latest/ops.html#alembic.operations.Operations.execute
+    op.execute(transcript_mappings
+               .update()
+               .where(transcript_mappings.c.exon_starts == op.inline_literal(''))
+               .values({'exon_starts': transcript_mappings.c.start,
+                        'exon_stops': transcript_mappings.c.stop}))
+def downgrade():
+    # We cannot reliably downgrade this migration.
+    pass
--- a/mutalyzer/mapping.py
+++ b/mutalyzer/mapping.py
@@ -920,6 +920,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
    Our strategy is too sort by gene and chromosome and process the file
    grouped by these two fields.
+    For transcripts without any UTR and CDS entries (seems to happen for
+    predicted genes), we generate one exon spanning the entire transcript.
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
@@ -999,6 +1002,12 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
            else:
                cds = None
+            # If no exons are annotated, we create one spanning the entire
+            # transcript.
+            if not exon_starts:
+                exon_starts = [start]
+                exon_stops = [stop]
            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene, orientation, start,
                stop, exon_starts, exon_stops, 'ncbi', cds=cds,