From d18b5395cabb35fcf6e318264840f239c7112ddc Mon Sep 17 00:00:00 2001 From: Martijn Vermaat <martijn@vermaat.name> Date: Sat, 25 Jul 2015 19:17:11 +0200 Subject: [PATCH] Parse genbank file without VERSION field Partial fix for https://humgenprojects.lumc.nl/trac/mutalyzer/ticket/188 --- mutalyzer/parsers/genbank.py | 8 ++++++-- tests/data/UD_143772172095.gb.bz2 | Bin 0 -> 1637 bytes tests/data/references.yml | 4 ++++ tests/test_parsers_genbank.py | 14 ++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 tests/data/UD_143772172095.gb.bz2 diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index b85dd531..266def8f 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -441,8 +441,12 @@ class GBparser(): # the genbank file) are from the original NC reference. We try to # set the .id field to the working value in the caller. record.source_id = biorecord.id - record.source_accession, record.source_version = biorecord.id.split('.')[:2] - record.source_gi = biorecord.annotations['gi'] + try: + record.source_accession, record.source_version = biorecord.id.split('.')[:2] + except ValueError: + record.source_accession = biorecord.id + record.source_version = '1' + record.source_gi = biorecord.annotations.get('gi') record.organism = biorecord.annotations['organism'] # Todo: This will change once we support protein references diff --git a/tests/data/UD_143772172095.gb.bz2 b/tests/data/UD_143772172095.gb.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..30f97e92f1f2b746cb17d66fb9b9c69f83dc1c0d GIT binary patch literal 1637 zcmV-r2AcUoT4*^jL0KkKSp*1~j{pJ#|A2jPU;t2O|N1}g|DeC`|FB>OUssIjD3%6* z6ajVI3^g$m5YS~Z0R}0cWYL8EQ_z|WO&dv|ig^t*H8Po^n-xDr02%>-13)y>KnEEz zXv8rzz(By7V2l$JO*8;u2B+kbDA7~enrQMshK!5=6Gjlgm?4K5GHApxG{8W>nqZ6* z6HPP#VFad-rlF^((=;PYfEXYEGyntC1yj4NERLBu24UWqFa@YeQh<@6Okl{9j>(FB zVT2tsvgVv&?%T)L(F-^R@ll?X#1x>V4ZpSp3i%F_Q#EB#cpx0%afwrybiz32jeC<O zzJ_!m8KNGeHR$WePgBxM;1w}SG@Ulgn^uq4#g{iSOooO!Y)Tz6Z+(MSaY}I*R+bn{ z3b2_oEXbrVw|sftKgv$ev-9li?T=~=MJTnAEo6hxjf_B4hQ5*YzzJJJ9ZEXA++JI@ zw;-NAe%Wd3gLhV)3n1ddUm=y9NwaLGBQ^GQpd==f4TMZSlr|U`wVTtQm>40=p?bs& zL7vA)+;kaqO^*;8fQV?qY#4;oQHZVmQDU68I)*rhPa3^MM4_aVNu3RALLj|*Pn)!e zfoji&EIcsq?&j`V{@tDPbUBajW0SSh&L}5N#{O)96+As&b`BAjR01Lqvag%(QCjrj zHof~{k=Wa$mdcbjw+tZ&Eu+&XryO5+D4pecx|By5D(t{)$*spH)G;YC6Qn}HS#^_D zMkyWAoY`CMg}AxrU6MyxR3#RgwU&YrTd75bsO5eS8}V*<IiYemFDr&SM>+N5Kw7eq zfh6cUY>}KJlN2i0k(+9<GKJHYZAaGFp^dz2*POP(w{Dr`yRpToR_F#67MSA5&ssTx zU?Vqyu6!Bdgf@tZI!!D}X>;Ki3!XGasa}Pg>bR)ODC4l!lu2ncQr^vi*V|pOlDta7 zsja^qYoRME?aRHL^v;Ov-b&SE6kIL7ii<TE*`%(-m8)FZ_IIwK5vIY;X&R>36Jlcl zGn@35WqU!dwyY-bEf&+?`t~IIe%i9nsgZ~Uv6c^%|2Z7r+3!3o5%)-($;r;Fv9btJ zd=!Pl2IQd*vo2Wb#ye?YQYdVW4mQAc(<V%oi5rA%ms)fB<;Z9^Nx7Mp5)#74I&XN* zu#A9+JREJqThwx}Aqr!Vm_sxKChf2Zm6jQnpuyho&@32FU*m^2gBpJz3by8-&(WSW zQ8B6Z)+KFi4J6q0thZ&h?p*Be>$whjoRcM1Ib^UtPA@!1f^d1)U46e_6ysBqj>~il zGcIGIRnh}!Sgx9Mt`Wvdt;}sOh}~mHEOIrDo0Q38aE*?f%*hf(u4JPzw=aFW#qzg} zvq1}>L>qToAQYBd(ylZBZW@&hU_m1sm5>TBGC5sTRl;G8HLSAGfreMi@o~N$0?2UX zld8?+$PuM?3^OszB7-@>nYEir>De+|GTJ$!;bA~i0CNjwWgM1)1d+K#%963mUq7;R z@%595|D%_UXox3<7M!NN%Gd`H)K*(y4;htQ)`DjNi|WW^_<sKVb^D%2yHmfx{lGGm zWt63(3vZ!ihD24%3Ly{_N6?`QLeiKp$0ESAsUmA;7jlEP;5rCHSGw7EV=dOrVPsKA z!Z5W0$5>3smRyDvfpU$NrIf+TU^uu-hFoDTIH?Mx!{Rj>8Yd&FBnBEDEaNU~GUUT6 z5E!ch<gO_e6Du`sOtR7;VA~eN&~^SE9lY;|FpnybW$IkZb4|(=aVllWb=}%;)Xq8b zcAed6JlvjVi$EN#bKmDU%{4h@6)L7L(KC%1ZEbAU!5m{`n$1M)BNz#&+Et>_MEIO- zEVm+!*~yHhB~{0wK?$Tc^I@X($IX!d<p6KNS`U2ND6B5gcBjDxT$NrMcoh}&0B0tG zBZ0t9R!J-^!Ud=y1VoN@5*G`s&;h}?j7+!OvUl;@;W-mf=>_o$Mq6dqFpft~LJC3R z1TBG=DZ7|<>6}&(6g_Tu&C60HM^?sGZp^}6I69+kp~qKtPdCqBNt2$UbcHo2l8rP7 zYKWAU9V3My5OVfWm|eCdB~4pW%P5<Ji>#-u)nxJXldHSMZu;ZUl$Tv9u90oF`Api_ jW=5awJp7T|W1jU=m6R2x4-;65|BJaIoG3^F1Wd<(O)UY~ literal 0 HcmV?d00001 diff --git a/tests/data/references.yml b/tests/data/references.yml index 17afd29a..6ab753a8 100644 --- a/tests/data/references.yml +++ b/tests/data/references.yml @@ -144,6 +144,10 @@ MARK1: - null - - XM_005273136 - null +ADAC: + accession: UD_143772172095 + checksum: 0b7f7991c1fb50bdfd04d3b0e405ecf3 + filename: UD_143772172095.gb.bz2 NG_008939.1: checksum: 114a03e16ad2f63531d796c2fb0d7039 filename: NG_008939.1.gb.bz2 diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index 2248318e..f997e89c 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -48,3 +48,17 @@ def test_only_complete_genes_included(settings, references, parser): filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) record = parser.create_record(filename) assert [g.name for g in record.geneList] == ['A1BG'] + +@with_references('ADAC') +def test_no_version(settings, references, parser): + """ + Genbank file without 'version' field, so BioPython record.id is the + accession number without version. Our parser used to crash on that. + + This genbank file was contributed by Gerard Schaafsma (original + source unknown). + """ + accession = references[0].accession + genbank_filename = os.path.join(settings.CACHE_DIR, + '%s.gb.bz2' % accession) + parser.create_record(genbank_filename) -- GitLab