From 86bd745bee0da0fb9a50b0b24b041a0d27e6c7e2 Mon Sep 17 00:00:00 2001 From: Redmar van den Berg <RedmarvandenBerg@lumc.nl> Date: Thu, 24 Oct 2019 10:25:33 +0200 Subject: [PATCH] Add check for decomposed calls Before, we would iterate over decomposed called variants until we found the variant where the REF and ALT matched the one in the positive vcf. However, this meant that sites where the called vcf had different ALTs could not be processed. To resolve this, we had to drop support for decomposed variants. --- tests/cases/dummy_decomposed.vcf.gz | Bin 0 -> 1380 bytes tests/cases/dummy_decomposed.vcf.gz.tbi | Bin 0 -> 170 bytes tests/test_evaluate.py | 26 +++++++++++++ vtools/evaluate.py | 47 ++++++++++++++---------- 4 files changed, 54 insertions(+), 19 deletions(-) create mode 100644 tests/cases/dummy_decomposed.vcf.gz create mode 100644 tests/cases/dummy_decomposed.vcf.gz.tbi diff --git a/tests/cases/dummy_decomposed.vcf.gz b/tests/cases/dummy_decomposed.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..09642f104fb1e3a673add5c28804af5297c7468d GIT binary patch literal 1380 zcmb2|=3rp}f&Xj_PR>jW?yPUa?&se&6Q~RO&$X!X2A77G=ih0|dHq6EkL4WSe~qbj zvFa@`t?pGS>R<o;l2x_bICagrfH#Q;<ajRGefs_G-EQk&kCLV?sZs6HUHzy3`op~o zCz^|kKfIfBCH}{L9;W^BD<0o1JINg%EdG1W@8&g+vg1}T>MvG0x!NV=-|f0tpG-Of z&V0@aR$_3u+P1i9y};?DBbT`B3R<RXoXnc6krcPUL{nRC|KG1ujAzJLocwO_d5^$- zp${=a8+RzW+n=uOloPtqzs%0m_d(V^y@wNQ;*`P)SSE(=dGK+!-ruuZoaL7_g)V<$ zZX+yu<3h>tOG|FqAC4$7cosS_<@I#OUDMSMAAUOf{So_VtCpUeZxePgg}E_inxJ<j zkLO>eq@0!xY1M1Gye+X#lfpdN+M^^LibI~+c!hZ$QOSFIIzi^1>8p5oo0s#0=P&kn zv&oCeUTBr6efmYVukGF2SgtUapWN1N#c;gO>tUzen+y9FxOmHn{dwJ&e(_(s<?;2J z`&w5-bZz>jAhi1NtG*|8bzX-w|A=b;EIGDG|MT5%H8X43d|TICb;`N8T&4Gj`NV}x z>lPPncMUi-HRZi?eXWkENpoE^6XUO_@YGAH{OkCp{BSsFWh$xv{4u9gVdbhjDwXUH z*sV6Vdu`0E(^$y+J>&2@%c!Dxw%d~0bLy_$_AlU8I}@k2aH{!qqa%+Lv%h`JmkpRV z?fGXpF`Y@)oz|;m_lZRwoy1cmakhBcRp|qdK82{Ce)z}cpy%Od)4%K7E!?85nl)pJ zbljg1rGLD0GECO7JlXK*!`3ximVwhbUvIc~uIy&^5y?$Sx{9d|C#%%URCkF+PwXz) z-x|dxBKgHVOoeCH&bR9v+w2aouvSkHv8l4t$cvnJ?Wx`LuT^$+3Hqs@9S_@OO?TLN zrFd3ohIYyoex|84u^&u()J~jv=5UF#c7iVVxnzqX-5*({^ZveM;?dnP(S@@;AyLU8 z?bWW{+0!Q1$x8i9S@&$ujvOu?+nfbW${Wu<3~XGU#~Z3t+dg~ynvU<&=T}T>w9dXD z%$EE0&=lFdj}Ff4eU@?LQ_QcLKhI<uA6(~O$0zjb4EIHmD)BG!{i{CwS+OhMJ=9iu z+gj~<<4wyBR=#rH=oGkil0;V5etVW}cbumm`e4aAb)uik&-|W;lW!C;y6PT3u`qUh zetNlbYLR(3Tcgo$0j&=%nwRZ=IjhR>h|8ySl>}z~<I)H}{O?D3>K*kf8|S8#dhQO| zxhtgn*45lF=KY;!tV_EbeYy4rE!9}L`1X;u$>v`13)gv?Tn{l>TL1QPoc))LZajOh z%l|&|Le}0d;X>|a@tChoo&G*KUz;lZ{rn9Q?r=Z<+Ercj?#G5XJPh*#VvT*1zP{R@ zu<KdSjYJD`t3%smHk<R-1Q!>+4-qn$r5Bbb7H9vVsk7YptC}<G{^RqO9y+ev*T&1g z`beOrVuFI)McJtdw->8@i@C16&NXDss~ZK<o1d=wHd*v==7w3j;uZ%TX1#N*Ba}P& z>Xy=5w-|n$aLc`_q?ehaX1CMb=W0pJch%XMyH{o2JocaIZ#&DT4n8KqMYH=DFKS)q zQWR5^x+dMY#!aDlmp-S94sX=gK*dKg+Dqn#o>(v|b350;dmH9yx|sTG2sv@}L5%xC z??pGQD)lCYIGNfcRf}EKIlQ^V!=!ZWF_tDlE+dI)XShN%UaS<Fy+zZw!)IoqiKekn zxGB%>y<AmR&mO87%7i~^YSJ}0@IvIw7B;i&uMekOp0SZN?ej&R&gp4)GrOA`W=17k g%Pwa2^SN91S^Q<AUi)D_1_tzURGNVqRH}k80IN@lCIA2c literal 0 HcmV?d00001 diff --git a/tests/cases/dummy_decomposed.vcf.gz.tbi b/tests/cases/dummy_decomposed.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..b61781774ec3e28d970e66d942d351f4637d2019 GIT binary patch literal 170 zcmb2|=3rp}f&Xj_PR>jWy$o+p@8mt?Ai#PdQb*a5H)ThYZX<`xZe}4S_bHFA-I;KG zx`2tpr+p@$mssw)t-Q3uV#=+##dmF{RqeZHsXzPQrnuhaxw7A@-o4#W_wU^5AAZIE h?o3eeoP-<wJ8df*XY%>}tUv|^bk|5TFoWC%#sJJUKY;)M literal 0 HcmV?d00001 diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 712f755..85bd91a 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -132,3 +132,29 @@ def test_phased_call_file(): match='Phased variants are not supported'): site_concordancy(call, positive, call_samples=['BLANK'], positive_samples=['NA12878']) + + +def test_decomposed_positive_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(decomposed, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +def test_decomposed_call_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(decomposed, gts012=True) + positive = VCF(filename, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) diff --git a/vtools/evaluate.py b/vtools/evaluate.py index ea276ca..bb751ec 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -7,6 +7,7 @@ vtools.evaluate :license: MIT """ from typing import List, Dict +from types import SimpleNamespace from cyvcf2 import VCF @@ -91,40 +92,48 @@ def site_concordancy(call_vcf: VCF, "alleles_low_qual": 0, "alleles_low_depth": 0 } + + # Keep track of the discordant sites discordant_count = 0 discordant_records = list() + # Keep track of the previous record, to make sure the positive vcf file + # does not contain decomposed records + prev_record = SimpleNamespace() + prev_record.POS = -1 + prev_record.CHROM = -1 + for pos_record in positive_vcf: + # Check to make sure the POS and CHROM are different from the previous + # variant + if (prev_record.POS == pos_record.POS and + prev_record.CHROM == pos_record.CHROM): + raise NotImplementedError('Decomposed variants are not supported') + else: + prev_record.POS = pos_record.POS + prev_record.CHROM = pos_record.CHROM + d['total_sites'] += 1 query_str = "{0}:{1}-{2}".format( pos_record.CHROM, - pos_record.start, + pos_record.start+1, pos_record.end ) - it = call_vcf(query_str) - same = [] - # If the vcf file has been decomposed, there can be multiple sites with - # the same CHROM and POS, which is why we have to iterate over all the - # sites that are returned - for it_record in it: - # We only want to consider sites that have the same CHROM, POS, REF - # and ALT as the positive vcf, since otherwise we might consider - # A/T and A/G as identical since they are both heterozygous - if (it_record.CHROM == pos_record.CHROM - and it_record.POS == pos_record.POS - and it_record.REF == pos_record.REF - and it_record.ALT == pos_record.ALT): - same.append(it_record) + call_records = list(call_vcf(query_str)) # If the variant is not present in the call vcf - if len(same) == 0: + if len(call_records) == 0: d['alleles_no_call'] += 2 - - if len(same) != 1: continue - call_record = same[0] + # If there are multiple variants with the same CHROM and POS, the + # variant in the call vcf file must have been decomposted + if len(call_records) > 1: + raise NotImplementedError('Decomposed variants are not supported') + + # Now we know there is only one call record + call_record = call_records[0] d['sites_considered'] += 1 d['alleles_considered'] += (2 * len(positive_samples)) -- GitLab