From d18b5395cabb35fcf6e318264840f239c7112ddc Mon Sep 17 00:00:00 2001
From: Martijn Vermaat <martijn@vermaat.name>
Date: Sat, 25 Jul 2015 19:17:11 +0200
Subject: [PATCH] Parse genbank file without VERSION field

Partial fix for https://humgenprojects.lumc.nl/trac/mutalyzer/ticket/188
---
 mutalyzer/parsers/genbank.py      |   8 ++++++--
 tests/data/UD_143772172095.gb.bz2 | Bin 0 -> 1637 bytes
 tests/data/references.yml         |   4 ++++
 tests/test_parsers_genbank.py     |  14 ++++++++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 tests/data/UD_143772172095.gb.bz2

diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py
index b85dd531..266def8f 100644
--- a/mutalyzer/parsers/genbank.py
+++ b/mutalyzer/parsers/genbank.py
@@ -441,8 +441,12 @@ class GBparser():
         #     the genbank file) are from the original NC reference. We try to
         #     set the .id field to the working value in the caller.
         record.source_id = biorecord.id
-        record.source_accession, record.source_version = biorecord.id.split('.')[:2]
-        record.source_gi = biorecord.annotations['gi']
+        try:
+            record.source_accession, record.source_version = biorecord.id.split('.')[:2]
+        except ValueError:
+            record.source_accession = biorecord.id
+            record.source_version = '1'
+        record.source_gi = biorecord.annotations.get('gi')
         record.organism = biorecord.annotations['organism']
 
         # Todo: This will change once we support protein references
diff --git a/tests/data/UD_143772172095.gb.bz2 b/tests/data/UD_143772172095.gb.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..30f97e92f1f2b746cb17d66fb9b9c69f83dc1c0d
GIT binary patch
literal 1637
zcmV-r2AcUoT4*^jL0KkKSp*1~j{pJ#|A2jPU;t2O|N1}g|DeC`|FB>OUssIjD3%6*
z6ajVI3^g$m5YS~Z0R}0cWYL8EQ_z|WO&dv|ig^t*H8Po^n-xDr02%>-13)y>KnEEz
zXv8rzz(By7V2l$JO*8;u2B+kbDA7~enrQMshK!5=6Gjlgm?4K5GHApxG{8W>nqZ6*
z6HPP#VFad-rlF^((=;PYfEXYEGyntC1yj4NERLBu24UWqFa@YeQh<@6Okl{9j>(FB
zVT2tsvgVv&?%T)L(F-^R@ll?X#1x>V4ZpSp3i%F_Q#EB#cpx0%afwrybiz32jeC<O
zzJ_!m8KNGeHR$WePgBxM;1w}SG@Ulgn^uq4#g{iSOooO!Y)Tz6Z+(MSaY}I*R+bn{
z3b2_oEXbrVw|sftKgv$ev-9li?T=~=MJTnAEo6hxjf_B4hQ5*YzzJJJ9ZEXA++JI@
zw;-NAe%Wd3gLhV)3n1ddUm=y9NwaLGBQ^GQpd==f4TMZSlr|U`wVTtQm>40=p?bs&
zL7vA)+;kaqO^*;8fQV?qY#4;oQHZVmQDU68I)*rhPa3^MM4_aVNu3RALLj|*Pn)!e
zfoji&EIcsq?&j`V{@tDPbUBajW0SSh&L}5N#{O)96+As&b`BAjR01Lqvag%(QCjrj
zHof~{k=Wa$mdcbjw+tZ&Eu+&XryO5+D4pecx|By5D(t{)$*spH)G;YC6Qn}HS#^_D
zMkyWAoY`CMg}AxrU6MyxR3#RgwU&YrTd75bsO5eS8}V*<IiYemFDr&SM>+N5Kw7eq
zfh6cUY>}KJlN2i0k(+9<GKJHYZAaGFp^dz2*POP(w{Dr`yRpToR_F#67MSA5&ssTx
zU?Vqyu6!Bdgf@tZI!!D}X>;Ki3!XGasa}Pg>bR)ODC4l!lu2ncQr^vi*V|pOlDta7
zsja^qYoRME?aRHL^v;Ov-b&SE6kIL7ii<TE*`%(-m8)FZ_IIwK5vIY;X&R>36Jlcl
zGn@35WqU!dwyY-bEf&+?`t~IIe%i9nsgZ~Uv6c^%|2Z7r+3!3o5%)-($;r;Fv9btJ
zd=!Pl2IQd*vo2Wb#ye?YQYdVW4mQAc(<V%oi5rA%ms)fB<;Z9^Nx7Mp5)#74I&XN*
zu#A9+JREJqThwx}Aqr!Vm_sxKChf2Zm6jQnpuyho&@32FU*m^2gBpJz3by8-&(WSW
zQ8B6Z)+KFi4J6q0thZ&h?p*Be>$whjoRcM1Ib^UtPA@!1f^d1)U46e_6ysBqj>~il
zGcIGIRnh}!Sgx9Mt`Wvdt;}sOh}~mHEOIrDo0Q38aE*?f%*hf(u4JPzw=aFW#qzg}
zvq1}>L>qToAQYBd(ylZBZW@&hU_m1sm5>TBGC5sTRl;G8HLSAGfreMi@o~N$0?2UX
zld8?+$PuM?3^OszB7-@>nYEir>De+|GTJ$!;bA~i0CNjwWgM1)1d+K#%963mUq7;R
z@%595|D%_UXox3<7M!NN%Gd`H)K*(y4;htQ)`DjNi|WW^_<sKVb^D%2yHmfx{lGGm
zWt63(3vZ!ihD24%3Ly{_N6?`QLeiKp$0ESAsUmA;7jlEP;5rCHSGw7EV=dOrVPsKA
z!Z5W0$5>3smRyDvfpU$NrIf+TU^uu-hFoDTIH?Mx!{Rj>8Yd&FBnBEDEaNU~GUUT6
z5E!ch<gO_e6Du`sOtR7;VA~eN&~^SE9lY;|FpnybW$IkZb4|(=aVllWb=}%;)Xq8b
zcAed6JlvjVi$EN#bKmDU%{4h@6)L7L(KC%1ZEbAU!5m{`n$1M)BNz#&+Et>_MEIO-
zEVm+!*~yHhB~{0wK?$Tc^I@X($IX!d<p6KNS`U2ND6B5gcBjDxT$NrMcoh}&0B0tG
zBZ0t9R!J-^!Ud=y1VoN@5*G`s&;h}?j7+!OvUl;@;W-mf=>_o$Mq6dqFpft~LJC3R
z1TBG=DZ7|<>6}&(6g_Tu&C60HM^?sGZp^}6I69+kp~qKtPdCqBNt2$UbcHo2l8rP7
zYKWAU9V3My5OVfWm|eCdB~4pW%P5<Ji>#-u)nxJXldHSMZu;ZUl$Tv9u90oF`Api_
jW=5awJp7T|W1jU=m6R2x4-;65|BJaIoG3^F1Wd<(O)UY~

literal 0
HcmV?d00001

diff --git a/tests/data/references.yml b/tests/data/references.yml
index 17afd29a..6ab753a8 100644
--- a/tests/data/references.yml
+++ b/tests/data/references.yml
@@ -144,6 +144,10 @@ MARK1:
     - null
   - - XM_005273136
     - null
+ADAC:
+  accession: UD_143772172095
+  checksum: 0b7f7991c1fb50bdfd04d3b0e405ecf3
+  filename: UD_143772172095.gb.bz2
 NG_008939.1:
   checksum: 114a03e16ad2f63531d796c2fb0d7039
   filename: NG_008939.1.gb.bz2
diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py
index 2248318e..f997e89c 100644
--- a/tests/test_parsers_genbank.py
+++ b/tests/test_parsers_genbank.py
@@ -48,3 +48,17 @@ def test_only_complete_genes_included(settings, references, parser):
     filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession)
     record = parser.create_record(filename)
     assert [g.name for g in record.geneList] == ['A1BG']
+
+@with_references('ADAC')
+def test_no_version(settings, references, parser):
+    """
+    Regression test: a GenBank file without a VERSION field makes BioPython
+    report record.id as the bare accession, which used to crash our parser.
+    Passes if create_record() does not raise; the record is not inspected.
+
+    File contributed by Gerard Schaafsma (original source unknown).
+    """
+    accession = references[0].accession
+    genbank_filename = os.path.join(settings.CACHE_DIR,
+                                    '%s.gb.bz2' % accession)
+    parser.create_record(genbank_filename)
-- 
GitLab