From f9aadab304417312ba56adad501fd10a9aad0423 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Thu, 24 Oct 2019 16:57:04 +0200
Subject: [PATCH] Add support for parital calls

---
 tests/cases/gatk_partial_call.vcf.gz     | Bin 0 -> 1957 bytes
 tests/cases/gatk_partial_call.vcf.gz.tbi | Bin 0 -> 171 bytes
 tests/test_evaluate.py                   |  76 +++++++++++++++++++++++
 vtools/evaluate.py                       |  18 ++++--
 4 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 tests/cases/gatk_partial_call.vcf.gz
 create mode 100644 tests/cases/gatk_partial_call.vcf.gz.tbi

diff --git a/tests/cases/gatk_partial_call.vcf.gz b/tests/cases/gatk_partial_call.vcf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3ba8f8a666e0037892e28ade7172780a28adf74a
GIT binary patch
literal 1957
zcmV;W2U_?aiwFb&00000{{{d;LjnMZ2enpPbJ|E2ekQ*{HF?<GDp}L_R*Pk~hzYi9
zZO1m=q-x$|XdBgl#7H8E_t)>&YRO<=f~ThDA(71KKIioLF0J?PAG3vhEXrkC4?drc
zzlw;yd;fm?>F#QHKltaxXfPaw*X!kjEeBC}|Gct;i@df^whTwMnwHtB&We2S({Qn{
zi)<Rq?5dtu0TKrF+y<q64AOiSq>CE=u*6nDmN%iIoY^v1WL0fvKmF@<&CyL)lm4jY
zYE>5BGj#3>r88eV_`7?rf^Qk#uWkoSO4RayK46hsUIl4s1Dw3JrJem_xBS`dPWkbo
zNbA?d&um`Qhy}OnbdlB1dpYiRa!l$n%byM>+1-3|*=;`i!TGNpYK!cDb`}OVa~$NP
znx+do!%%+(81tf-c_g0NGB|dHAw6a}7PT+{^PpPUY4(^+Q<t~t<>Gq$$)kVTbuv7L
zz73|uI<H;gmx5ta-=@=fFuc5PDG+wGt904%0F7-?VN-s*Q~m%FZSiqwe_Pvp`uwMB
zUcZHW_eGs9g1lK66p!2Np1SwhW(cp-e@H8Pdza?FPS(rgH2VdZ7LKcBu?)Usi)rz_
z$b%BwgKE7DK;_z1A3|TBt0=gG-X2gO_^n}Owp)Dkp({zclIad7XAhI>o8Zd2Rn<IO
z?IidU%IpVKvGhMwkl}MMPrusW!P@*qiaW(7N4x!kU2vFZH-r}6m4jhce9Qg&?B6>{
z?V>t9_i>p{-J8pM*ED=u<aL_m&~PwWrFrv$T-L4HneupYyud5Lu1>zDi?t2bmCHKL
zs<|!yRt5Gun(!Fc1!h=V{8`ndbJT|{-Klzhx6a=J&ShFYxs>ie+0(jMSHXF)ELO$h
zc?m8fi#zn^w9HbdxmDLi9(;s@B0kDWJKZ;m^K`Xn=st0LJ$`;vublWG*ca?F(53O|
zv<{YqJN36BsO(c|pI}Xx%)BeIbM%Y+p|o~}C8vdbd^}!#vjL`s9oaOqI4m3ilWBN6
z^OAFYbwYmSt1DZcc!CAr=3wx&SgvrX;HrB9OI+{Rd7hTD=aVls%hApg@PyIU2VC~*
z4HwSc)g-t|S1Z@^_MNkpguLCjb}|LO*s}0M-%a|5t!jMSs=l^X)p*0QZRK|?>vnX!
zbVpW@>M@&vQ}*ZXU=PnwDa_|}Qg_muqWUXr_|@4~1xW2|q?7CE`Q4|h-p!{;?_$)u
z`#A35it63|cX-(wUtHdQyz99Zd;U|P_u+DQ{h!|Tkg{0E{%L6YWbSDgkK3p5PeI;I
z=Otr3!6d`4tXEbn=9+4(JnId6%t#s;*^7F^(*f%b#{&{k!TZDO0Tuo2%LxIlWq<Uc
zKe`!ENmPF{={xsCfRO%p(mYAu`OW~3R|YVkj7L&Z)o(t*cw`vf2-aO~<yW!n-;M?{
zGNk_+5(K3oy|epdI7&u0$=PjkbJ-(5M~Mnaq7oWML?k4namXk|fF_j3AtgFtk|f3?
z)F>K~kP~5iSea07q6DiEm7F6=5+)4twm}jqLPDuIYhy^f6=OP7)KDLj<kdbZLm3-$
zC?>TDrP5*>AVoO<qLj;6;IzHd9;J%Xh;R>(@&E~D4go|}1SY&7MB{CQ_)&xei==K5
z!n;s=2(gGz;t(<s*6kp4LL<%TA&4jsMM8C=SfoUvI0ns(@+bf>$^e5<ZYp3X@C>P@
z?Z^amBNM`pOh^+7q1&y%GnU;>Dq`dE0>L&h(r|__YZA)_eXNRom?(thOBjQ9VIH^S
zpa?UCFmJ*>Ih9;*qot+O%_(r=Dl)N$h<b>KriUOpp(2Wv@&GZt4G=vBkdqDS3I`B1
zI^rH6wgu1$B~k1lf{f)pL^{GScQK0Gq?O~ePT&S4cI>1eDDfbWI1~--JqnDgP(VIS
z5FP^-JH7ZIr6L49nI9u4u3S(iTB3LS??k5JR%BS)K&0Benqe=OZB^iHn#2k{$wp1+
zLv(eaEYj@dpruBbvk(TgjSwfq>q!}^i4yR;vz~^$BH21#?9q~k(NYm%s?(F;{6~&Q
z_w+;{h}d|zV)Q8X^mGEn0e7G!BC{u`IAUfGE9&$F3=+x`7$%;M^-?216N3?Qc%dgP
zk7dX-c<r%}JT$Tcic9I$B7IOe<X+5)axPpL>S(1Vywr^MN==%m?wpi_X0sP)H>pWi
zSA{Wk-MFY`O=7LKZBO7f>4c<Z9H}>f!$idK+Z!&UeP}du*9q;c?!aXP(}mg{G!COZ
zA;AD+c?cUGa$HBa!Qw=yP<6w1UKMvE(xka-$f20xjYLR%V6;lc&D`6u$54dnVnVwZ
zsND)rg<wJ(lf1!8xA!8@)20Jc6VVtS-@-#0>An**{4-X7<`lY-v37Ivz~~kj4EPW*
z?(}N&p&<X*hk_8`y*7<(v{ULHHat>_c-5$Q7a9%UWBU#V1_BxBu;Ck6U>THdJ_5MW
z7=|VX?|H{-#%RZRu=-`d6qhYF8ZI_+QJYVTMlLW${WAY}BNsx9kGAM27RH;7sA=FK
rv3&P`a!eLTn-c&4ABzYC000000RIL6LPG)o8vp|U0000000000#aYE1

literal 0
HcmV?d00001

diff --git a/tests/cases/gatk_partial_call.vcf.gz.tbi b/tests/cases/gatk_partial_call.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..3205497d3fb444c435910397efb54e0fc53295d6
GIT binary patch
literal 171
zcmb2|=3rp}f&Xj_PR>jWeGG3;?-guv5MaAt%Otb1(MNlS&>59w%)-n{yA8IQN>BQn
zAd%bfq}F(Mep>yp)22&1GM+{0TIy~qzWc6!&i>ge*N4A<ys7W`!Y>a)?f*Z|{qyWj
iuDOcmBs{2k=QHMyyu7Ps1u`(mqd7;Kff?*N5CH&ac|J4%

literal 0
HcmV?d00001

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 895a5b7..15b8e7b 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -184,3 +184,79 @@ def test_haploid_call_file():
                        match='Non-diploid variants are not supported'):
         site_concordancy(call, positive, call_samples=['BLANK'],
                          positive_samples=['NA12878'])
+
+
+@pytest.fixture(scope='module')
+def partial_call_file():
+    """ Test statistics when the call vcf contains partial variants """
+    filename = 'tests/cases/gatk.vcf.gz'
+    partial = 'tests/cases/gatk_partial_call.vcf.gz'
+    call = VCF(partial, gts012=True)
+    positive = VCF(filename, gts012=True)
+    d, disc = site_concordancy(call, positive, call_samples=['BLANK'],
+                               positive_samples=['BLANK'], min_dp=0, min_gq=0)
+    return d
+
+
+def test_partial_call_total_sites(partial_call_file):
+    assert partial_call_file['total_sites'] == 37
+
+
+def test_partial_call_alleles_hom_ref_concordant(partial_call_file):
+    assert partial_call_file['alleles_hom_ref_concordant'] == 0
+
+
+def test_partial_call_alleles_het_concordant(partial_call_file):
+    assert partial_call_file['alleles_het_concordant'] == 0
+
+
+def test_partial_call_alleles_hom_alt_concordant(partial_call_file):
+    assert partial_call_file['alleles_hom_alt_concordant'] == 0
+
+
+def test_partial_call_alleles_concordant(partial_call_file):
+    assert partial_call_file['alleles_concordant'] == 6
+
+
+def test_partial_call_alleles_discordant(partial_call_file):
+    assert partial_call_file['alleles_discordant'] == 0
+
+
+def test_partial_call_alleles_no_call(partial_call_file):
+    assert partial_call_file['alleles_no_call'] == 68
+
+
+@pytest.fixture(scope='module')
+def partial_positive_file():
+    """ Test statistics when the call vcf contains partial variants """
+    filename = 'tests/cases/gatk.vcf.gz'
+    partial = 'tests/cases/gatk_partial_call.vcf.gz'
+    call = VCF(filename, gts012=True)
+    positive = VCF(partial, gts012=True)
+    d, disc = site_concordancy(call, positive, call_samples=['BLANK'],
+                               positive_samples=['BLANK'], min_dp=0, min_gq=0)
+    return d
+
+
+def test_partial_positive_total_sites(partial_positive_file):
+    assert partial_positive_file['total_sites'] == 6
+
+
+def test_partial_positive_hom_ref_concordant(partial_positive_file):
+    assert partial_positive_file['alleles_hom_ref_concordant'] == 0
+
+
+def test_partial_positive_het_concordant(partial_positive_file):
+    assert partial_positive_file['alleles_het_concordant'] == 0
+
+
+def test_partial_positive_hom_alt_concordant(partial_positive_file):
+    assert partial_positive_file['alleles_hom_alt_concordant'] == 0
+
+
+def test_partial_positive_concordant(partial_positive_file):
+    assert partial_positive_file['alleles_concordant'] == 6
+
+
+def test_partial_positive_no_call(partial_positive_file):
+    assert partial_positive_file['alleles_no_call'] == 6
diff --git a/vtools/evaluate.py b/vtools/evaluate.py
index 880853b..0369ff0 100644
--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -34,7 +34,10 @@ def parse_variants(ref: str, call: List[str], pos: List[str],
 
     # Here we count all alleles independently, also for the concordant calls
     for i, j in zip(call_variant, pos_variant):
-        if i == j:
+        # If one of the variants is partial no call
+        if i == '.' or j == '.':
+            results['alleles_no_call'] += 1
+        elif i == j:
             results['alleles_concordant'] += 1
         else:
             results['alleles_discordant'] += 1
@@ -143,7 +146,8 @@ def site_concordancy(call_vcf: VCF,
             p_gt = pos_record.gt_types[p_s]
             c_gt = call_record.gt_types[c_s]
 
-            # Is there missing data in one of the vcf files?
+            # Is one of the sites no call. This is only for fully no call
+            # sites. Partial no call site are handled in parse_variants
             if p_gt == 3 or c_gt == 3:
                 d['alleles_no_call'] += 2
                 continue
@@ -188,9 +192,13 @@ def site_concordancy(call_vcf: VCF,
             pos_alleles = [pos_record.REF] + pos_record.ALT
             cal_alleles = [call_record.REF] + call_record.ALT
 
-            # The third item in pos is a boolean indicating phasing
-            pos_gt = [pos_alleles[x] for x in pos[:2]]
-            cal_gt = [cal_alleles[x] for x in cal[:2]]
+            # The third item in pos is a boolean indicating phasing, so we
+            # don't use that
+            # No call is indicated by -1 in pyvcf2, so here we convert negative
+            # values to '.', which is what is used in the vcf to indicate no
+            # call
+            pos_gt = [pos_alleles[x] if x >= 0 else '.' for x in pos[:2]]
+            cal_gt = [cal_alleles[x] if x >= 0 else '.' for x in cal[:2]]
 
             # Parse the genotypes and add the results into d
             # We also need to know the reference call to determine if variants
-- 
GitLab