From 5e0d444a7331971ea89ec6daa2e7030b22e6a115 Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Mon, 20 Jul 2015 16:55:22 +0200
Subject: [PATCH] Fix transcript mappings containing no exons

For transcripts without any UTR and CDS entries in the NCBI Mapview
file (seems to happen for  predicted genes), we generate one exon
spanning the entire transcript.
---
 ...086dd_fix_zero_exon_transcript_mappings.py | 36 +++++++++++++++++++
 mutalyzer/mapping.py                          |  9 +++++
 2 files changed, 45 insertions(+)
 create mode 100644 migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py

diff --git a/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
new file mode 100644
index 00000000..03cc4813
--- /dev/null
+++ b/migrations/versions/4bafcc5086dd_fix_zero_exon_transcript_mappings.py
@@ -0,0 +1,36 @@
+"""Fix zero-exon transcript mappings
+
+Revision ID: 4bafcc5086dd
+Revises: 2e062969eb54
+Create Date: 2015-07-20 16:16:01.602964
+
+"""
+
+from __future__ import unicode_literals
+
+# revision identifiers, used by Alembic.
+revision = '4bafcc5086dd'
+down_revision = u'2e062969eb54'
+
+from alembic import op
+from sqlalchemy import sql
+import sqlalchemy as sa
+
+
+def upgrade():
+    transcript_mappings = sql.table('transcript_mappings',
+                                    sql.column('start', sa.Integer()),
+                                    sql.column('stop', sa.Integer()),
+                                    sql.column('exon_starts', sa.Text()),
+                                    sql.column('exon_stops', sa.Text()))
+    # https://alembic.readthedocs.org/en/latest/ops.html#alembic.operations.Operations.execute
+    op.execute(transcript_mappings
+               .update()
+               .where(transcript_mappings.c.exon_starts == op.inline_literal(''))
+               .values({'exon_starts': transcript_mappings.c.start,
+                        'exon_stops': transcript_mappings.c.stop}))
+
+
+def downgrade():
+    # We cannot reliably downgrade this migration.
+    pass
diff --git a/mutalyzer/mapping.py b/mutalyzer/mapping.py
index ba4d1110..47fe4a31 100644
--- a/mutalyzer/mapping.py
+++ b/mutalyzer/mapping.py
@@ -920,6 +920,9 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
 
     Our strategy is too sort by gene and chromosome and process the file
     grouped by these two fields.
+
+    For transcripts without any UTR and CDS entries (seems to happen for
+    predicted genes), we generate one exon spanning the entire transcript.
     """
     columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
                'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
@@ -999,6 +1002,12 @@ def import_from_mapview_file(assembly, mapview_file, group_label):
             else:
                 cds = None
 
+            # If no exons are annotated, we create one spanning the entire
+            # transcript.
+            if not exon_starts:
+                exon_starts = [start]
+                exon_stops = [stop]
+
             yield TranscriptMapping.create_or_update(
                 chromosome, 'refseq', accession, gene, orientation, start,
                 stop, exon_starts, exon_stops, 'ncbi', cds=cds,
-- 
GitLab