From f9aadab304417312ba56adad501fd10a9aad0423 Mon Sep 17 00:00:00 2001 From: Redmar van den Berg <RedmarvandenBerg@lumc.nl> Date: Thu, 24 Oct 2019 16:57:04 +0200 Subject: [PATCH] Add support for parital calls --- tests/cases/gatk_partial_call.vcf.gz | Bin 0 -> 1957 bytes tests/cases/gatk_partial_call.vcf.gz.tbi | Bin 0 -> 171 bytes tests/test_evaluate.py | 76 +++++++++++++++++++++++ vtools/evaluate.py | 18 ++++-- 4 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 tests/cases/gatk_partial_call.vcf.gz create mode 100644 tests/cases/gatk_partial_call.vcf.gz.tbi diff --git a/tests/cases/gatk_partial_call.vcf.gz b/tests/cases/gatk_partial_call.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..3ba8f8a666e0037892e28ade7172780a28adf74a GIT binary patch literal 1957 zcmV;W2U_?aiwFb&00000{{{d;LjnMZ2enpPbJ|E2ekQ*{HF?<GDp}L_R*Pk~hzYi9 zZO1m=q-x$|XdBgl#7H8E_t)>&YRO<=f~ThDA(71KKIioLF0J?PAG3vhEXrkC4?drc zzlw;yd;fm?>F#QHKltaxXfPaw*X!kjEeBC}|Gct;i@df^whTwMnwHtB&We2S({Qn{ zi)<Rq?5dtu0TKrF+y<q64AOiSq>CE=u*6nDmN%iIoY^v1WL0fvKmF@<&CyL)lm4jY zYE>5BGj#3>r88eV_`7?rf^Qk#uWkoSO4RayK46hsUIl4s1Dw3JrJem_xBS`dPWkbo zNbA?d&um`Qhy}OnbdlB1dpYiRa!l$n%byM>+1-3|*=;`i!TGNpYK!cDb`}OVa~$NP znx+do!%%+(81tf-c_g0NGB|dHAw6a}7PT+{^PpPUY4(^+Q<t~t<>Gq$$)kVTbuv7L zz73|uI<H;gmx5ta-=@=fFuc5PDG+wGt904%0F7-?VN-s*Q~m%FZSiqwe_Pvp`uwMB zUcZHW_eGs9g1lK66p!2Np1SwhW(cp-e@H8Pdza?FPS(rgH2VdZ7LKcBu?)Usi)rz_ z$b%BwgKE7DK;_z1A3|TBt0=gG-X2gO_^n}Owp)Dkp({zclIad7XAhI>o8Zd2Rn<IO z?IidU%IpVKvGhMwkl}MMPrusW!P@*qiaW(7N4x!kU2vFZH-r}6m4jhce9Qg&?B6>{ z?V>t9_i>p{-J8pM*ED=u<aL_m&~PwWrFrv$T-L4HneupYyud5Lu1>zDi?t2bmCHKL zs<|!yRt5Gun(!Fc1!h=V{8`ndbJT|{-Klzhx6a=J&ShFYxs>ie+0(jMSHXF)ELO$h zc?m8fi#zn^w9HbdxmDLi9(;s@B0kDWJKZ;m^K`Xn=st0LJ$`;vublWG*ca?F(53O| zv<{YqJN36BsO(c|pI}Xx%)BeIbM%Y+p|o~}C8vdbd^}!#vjL`s9oaOqI4m3ilWBN6 z^OAFYbwYmSt1DZcc!CAr=3wx&SgvrX;HrB9OI+{Rd7hTD=aVls%hApg@PyIU2VC~* z4HwSc)g-t|S1Z@^_MNkpguLCjb}|LO*s}0M-%a|5t!jMSs=l^X)p*0QZRK|?>vnX! zbVpW@>M@&vQ}*ZXU=PnwDa_|}Qg_muqWUXr_|@4~1xW2|q?7CE`Q4|h-p!{;?_$)u z`#A35it63|cX-(wUtHdQyz99Zd;U|P_u+DQ{h!|Tkg{0E{%L6YWbSDgkK3p5PeI;I z=Otr3!6d`4tXEbn=9+4(JnId6%t#s;*^7F^(*f%b#{&{k!TZDO0Tuo2%LxIlWq<Uc zKe`!ENmPF{={xsCfRO%p(mYAu`OW~3R|YVkj7L&Z)o(t*cw`vf2-aO~<yW!n-;M?{ zGNk_+5(K3oy|epdI7&u0$=PjkbJ-(5M~Mnaq7oWML?k4namXk|fF_j3AtgFtk|f3? z)F>K~kP~5iSea07q6DiEm7F6=5+)4twm}jqLPDuIYhy^f6=OP7)KDLj<kdbZLm3-$ zC?>TDrP5*>AVoO<qLj;6;IzHd9;J%Xh;R>(@&E~D4go|}1SY&7MB{CQ_)&xei==K5 z!n;s=2(gGz;t(<s*6kp4LL<%TA&4jsMM8C=SfoUvI0ns(@+bf>$^e5<ZYp3X@C>P@ z?Z^amBNM`pOh^+7q1&y%GnU;>Dq`dE0>L&h(r|__YZA)_eXNRom?(thOBjQ9VIH^S zpa?UCFmJ*>Ih9;*qot+O%_(r=Dl)N$h<b>KriUOpp(2Wv@&GZt4G=vBkdqDS3I`B1 zI^rH6wgu1$B~k1lf{f)pL^{GScQK0Gq?O~ePT&S4cI>1eDDfbWI1~--JqnDgP(VIS z5FP^-JH7ZIr6L49nI9u4u3S(iTB3LS??k5JR%BS)K&0Benqe=OZB^iHn#2k{$wp1+ zLv(eaEYj@dpruBbvk(TgjSwfq>q!}^i4yR;vz~^$BH21#?9q~k(NYm%s?(F;{6~&Q z_w+;{h}d|zV)Q8X^mGEn0e7G!BC{u`IAUfGE9&$F3=+x`7$%;M^-?216N3?Qc%dgP zk7dX-c<r%}JT$Tcic9I$B7IOe<X+5)axPpL>S(1Vywr^MN==%m?wpi_X0sP)H>pWi zSA{Wk-MFY`O=7LKZBO7f>4c<Z9H}>f!$idK+Z!&UeP}du*9q;c?!aXP(}mg{G!COZ zA;AD+c?cUGa$HBa!Qw=yP<6w1UKMvE(xka-$f20xjYLR%V6;lc&D`6u$54dnVnVwZ zsND)rg<wJ(lf1!8xA!8@)20Jc6VVtS-@-#0>An**{4-X7<`lY-v37Ivz~~kj4EPW* z?(}N&p&<X*hk_8`y*7<(v{ULHHat>_c-5$Q7a9%UWBU#V1_BxBu;Ck6U>THdJ_5MW z7=|VX?|H{-#%RZRu=-`d6qhYF8ZI_+QJYVTMlLW${WAY}BNsx9kGAM27RH;7sA=FK rv3&P`a!eLTn-c&4ABzYC000000RIL6LPG)o8vp|U0000000000#aYE1 literal 0 HcmV?d00001 diff --git a/tests/cases/gatk_partial_call.vcf.gz.tbi b/tests/cases/gatk_partial_call.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..3205497d3fb444c435910397efb54e0fc53295d6 GIT binary patch literal 171 zcmb2|=3rp}f&Xj_PR>jWeGG3;?-guv5MaAt%Otb1(MNlS&>59w%)-n{yA8IQN>BQn zAd%bfq}F(Mep>yp)22&1GM+{0TIy~qzWc6!&i>ge*N4A<ys7W`!Y>a)?f*Z|{qyWj iuDOcmBs{2k=QHMyyu7Ps1u`(mqd7;Kff?*N5CH&ac|J4% literal 0 HcmV?d00001 diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 895a5b7..15b8e7b 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -184,3 +184,79 @@ def test_haploid_call_file(): match='Non-diploid variants are not supported'): site_concordancy(call, positive, call_samples=['BLANK'], positive_samples=['NA12878']) + + +@pytest.fixture(scope='module') +def partial_call_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(partial, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_call_total_sites(partial_call_file): + assert partial_call_file['total_sites'] == 37 + + +def test_partial_call_alleles_hom_ref_concordant(partial_call_file): + assert partial_call_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_call_alleles_het_concordant(partial_call_file): + assert partial_call_file['alleles_het_concordant'] == 0 + + +def test_partial_call_alleles_hom_alt_concordant(partial_call_file): + assert partial_call_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_call_alleles_concordant(partial_call_file): + assert partial_call_file['alleles_concordant'] == 6 + + +def test_partial_call_alleles_discordant(partial_call_file): + assert partial_call_file['alleles_discordant'] == 0 + + +def test_partial_call_alleles_no_call(partial_call_file): + assert partial_call_file['alleles_no_call'] == 68 + + +@pytest.fixture(scope='module') +def partial_positive_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(partial, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_positive_total_sites(partial_positive_file): + assert partial_positive_file['total_sites'] == 6 + + +def test_partial_positive_hom_ref_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_positive_het_concordant(partial_positive_file): + assert partial_positive_file['alleles_het_concordant'] == 0 + + +def test_partial_positive_hom_alt_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_positive_concordant(partial_positive_file): + assert partial_positive_file['alleles_concordant'] == 6 + + +def test_partial_positive_no_call(partial_positive_file): + assert partial_positive_file['alleles_no_call'] == 6 diff --git a/vtools/evaluate.py b/vtools/evaluate.py index 880853b..0369ff0 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -34,7 +34,10 @@ def parse_variants(ref: str, call: List[str], pos: List[str], # Here we count all alleles independently, also for the concordant calls for i, j in zip(call_variant, pos_variant): - if i == j: + # If one of the variants is partial no call + if i == '.' or j == '.': + results['alleles_no_call'] += 1 + elif i == j: results['alleles_concordant'] += 1 else: results['alleles_discordant'] += 1 @@ -143,7 +146,8 @@ def site_concordancy(call_vcf: VCF, p_gt = pos_record.gt_types[p_s] c_gt = call_record.gt_types[c_s] - # Is there missing data in one of the vcf files? + # Is one of the sites no call. This is only for fully no call + # sites. Partial no call site are handled in parse_variants if p_gt == 3 or c_gt == 3: d['alleles_no_call'] += 2 continue @@ -188,9 +192,13 @@ def site_concordancy(call_vcf: VCF, pos_alleles = [pos_record.REF] + pos_record.ALT cal_alleles = [call_record.REF] + call_record.ALT - # The third item in pos is a boolean indicating phasing - pos_gt = [pos_alleles[x] for x in pos[:2]] - cal_gt = [cal_alleles[x] for x in cal[:2]] + # The third item in pos is a boolean indicating phasing, so we + # don't use that + # No call is indicated by -1 in pyvcf2, so here we convert negative + # values to '.', which is what is used in the vcf to indicate no + # call + pos_gt = [pos_alleles[x] if x >= 0 else '.' for x in pos[:2]] + cal_gt = [cal_alleles[x] if x >= 0 else '.' for x in cal[:2]] # Parse the genotypes and add the results into d # We also need to know the reference call to determine if variants -- GitLab