From d37115cc742d6688b5dd72993a8a9d5e678a1ffa Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Tue, 12 Nov 2019 16:12:08 +0100
Subject: [PATCH] Add support for RQC annotations to be used as GQ

When GATK emits all sites, homref sites get the RQC annotation instead
of the QC annotations. For vtools, these two annotations are used
interchangeably, since they both contain genotype likelihoods.

See the detailed explantation at
https://gatkforums.broadinstitute.org/gatk/discussion/5115/haplotypecaller-genotypegvcfs-and-vqsr-for-alternatives-in-dbsnp
---
 tests/cases/gatk_RGQ.vcf.gz     | Bin 0 -> 3544 bytes
 tests/cases/gatk_RGQ.vcf.gz.tbi | Bin 0 -> 172 bytes
 tests/test_evaluate.py          |  31 +++++++++++++++++++++++++++++++
 vtools/evaluate.py              |  32 +++++++++++++++++++++++++++++++-
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 tests/cases/gatk_RGQ.vcf.gz
 create mode 100644 tests/cases/gatk_RGQ.vcf.gz.tbi

diff --git a/tests/cases/gatk_RGQ.vcf.gz b/tests/cases/gatk_RGQ.vcf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..46a7b8df09b91362d2bc28bf63ca99c40dbcdd75
GIT binary patch
literal 3544
zcmb2|=3rp}f&Xj_PR>jWyLq=p#s)uj6RRz&Yt^c}@n%}Uuiwnm?kWkSEz6pb*q{<x
zc{zb`o#`~?*)uonuR9;=z3qwE4Zke;G=m#q-(Ob0+xh(7zjwQLpWL!Td-d^}%0H)H
z{uH13F@N9hcYhvVJ$mx&NBjG8W#`XXDnC!oUi@m|w0kz^yC0WtPA!ZNc^SEJ$6B?|
zPnXP*t*<tkD|Y#0u94oe)9tQJ>}sdml~#7`R+-;Bxn|c9`<f*YDw8AshDG*vYt8f%
zJDv3B?fzfc^J2GUf4Y8c&iU1!`Z>$|xSu}WQ1QujZ=6cJdPVKWmk}AgTMw<D|55SR
z52cf<l^8#z28ur3pRf1-+2v!Oo?kYK|0Z4^eCCwuJ#RbPr<YHDvaC9G_L%Xy=~q+A
z?Eg$Ss~^8FfB&CnqJQKyvM!&j?=GAbW5;mf;3Kp7V#zYE>IHh*H_5bn+%Z4R@Qtax
z#C+yMXI}|{&nbrbdFthpZ_k;xd*}4kFHG3XKYxC+ZBDQH7jL_zJ07Q`Z$Fy#mL>nb
z@3Al2O%DElDp7r)>{E(gw0|A{@fk7&-}*Bv&qo^lulpNYpgvP!S*Yc1{)biHAO1;x
z@H}((-pRRtmiXG7oyJmIs*+jj<XZQ8!kj((Gd*T)TfX^(eDt!U^|G7RO}D=vQP-e-
z_R;gukH3W%*#CcOYX3%P;_RYMk2A0M)t<(h|9@dq_-wWF3;Tb^YXWO_ol5_|v##pS
zJdXX1rK+jHp>}Hf{g!QiD*sex<%h2+AHU2{xH)ahFHVsM(_N><I11V&H>y3DeD>)1
zx}51LZzuh9yVheLU%$ggkL%Z!M~O!FD-Oh8=v$l;XxC=1qabqrr1QU}9(LkKb5zBo
zyJPe3zFq%A%;dvNzJ;9c=NvN7+kfuR>yP`*<)VDVZ=e6W=T!bnQQPmU?tRF#jD9v*
zGDL0Xyyu&LvCa#<u-we==bx(IOtzjg<!eqZ_jmoYWx?`!{$l*DU$?sk{b}@Vk<j;^
z^KjFF_1&qiv;J33%3t&A+pElN-UaubU$wQJ#NhjG{;B1zpSIQLJb(J-r(n#nV2&^P
zm8K^rp7kx;J6qM>@$tuhH<xJTg$Hemvz_msy(?Zge*5F&ueiPJRNiQn313*cPUWXj
zr4HxU>qe0x-$M)eKOQ-0v!y%7q2By=jObAbc}v~y-MineUcaB=;vsXj?UR`j7IIk3
zoVES&grY6$rv{Wwt&a;~Z%jCOo!#KawwgK5Rkr@Ljc}?yZhrsS)gM0}Tu(AOT=q`T
ze@auTocz6`e5xN`J+j();(76{;_X^xa)-*q*Y&Ea?iUS}?AZ77!rrB?YIcXzW-eKE
zt?hW4w!X);<(uDnZ%?s}4Lv9yl`wz)#H9lB|2cEMOx^jT{zL))?L9G9T&t(9Tb{q(
zF0Mz<KCb`X+}?F{R`Ls7dhPz!e5xz?_wMgm|8V77k2EA&F18En%_+Tq$1cRFXUYX9
z4}G_*P8OLvr{;Z0D=`v^`+VkgpqZ7Wt(d#|^W!>?z2xTBSUh{RB0BFw+m>H1-W*@N
z`E>HSnU+?&?UZbfNcYIjjlXVxQ&97yVcnOYUo*I8W}ZD8nEN~B8Ec$j&AUaPYaV~O
zxO{W++{&^m_9=T-+Dh-;X=!6=ZL`~^<+J6U8v7*s56*hAr)JuU%#y9DKe6|rrnhSF
zKJL2(oG<2hzZIEhY$aQmXK{W8&jsIxbuIEu=X}-X#3w#q*BzU8be+qx+-BMJQ#&_w
z_Y}I_sBJr+n_&N7=F}SAea&;3cQcfwa#%CvG|Y{@cR0`V=y#D7OHc0MZm489qa)8$
zb0G9t4`)H^+e1%pKM1PWxK3%uo!dTaI~FmE`xzB88oXGRe85cDi??OAEZ4K3R{2wc
zNz=G8gO2)&Z%EN$xm}SrGjPGr*%Mvg7~d7R%le^8V$aDL2DuxiPFXKF*Zp8x$F$Fk
zd!1uKMcy!9U_8$8LF;qimRPktw|LClH`p?1^Y5Ijc*Sq|p;J@7Hbi?fFov(@nQ~M0
zk_DUcY=_T#i=?tGRGEVoM=jaYA5!S=!FZGZLllR-Oe*V^T(8X`+Eb<8mx!-UsMAf$
zDRI_ZyVY#w=iMxlGFS7~rq$_p6*K<h5;KZ&aLtiPITCiYh;5y>-0PVW>oiWW^(<@I
z))JGcsgSx(KJ}obV!BV}9;F8!VskS$c6TM4-?}N@kUzap&1iLRR%5mpbNexc+N`qO
z8qYj9a@m*iRQ8C(p4?Hw6{NPqH1@?RJ}Y+7IYJA5o;_iqGUvYQ5y8z;d6TaTtk^pt
zyXB5_v~s$|y&VlJeIf<9W^|gIO&56X_#kRoobco9JGX8}9je`4<bC{0*BdA0N9M|T
zDqBL|&3JsN?9}zuOQUm?whJT|=(Wh)TEV_*y0K{ib8GKRKOvhrPx>a=Fr;gGm1OPQ
z9D673s#o5gO<UJEzU4jtLyvQv(cvRfpX)A8(_?-jkh8~ZV~6w2rs#(q+wQRF8n0%%
z9K9@Lwe%$GXRcX2Z<lUox_qlB*Yw)Gm1&%ps<(EVc56>lElk_9Rh{`7kDI6W?H3hK
z&hjNh&5?Uzwr57MZj84Ro8{|;p_68um2EycalzM<9*@t@%m_H$5XErYVB^ks_Q)C6
zLu}akN>z5wVYBG2m0aR_;=$>fC8<K^%MPEprjz>mQ|qOvR(ZS5gt27>=qe<4X-bCN
z(U_5PxID0MX5qX4YdW%TRm?r6VBQ=p>6IaPqBlIyZE_p`?Z6Q29qYt)Oj)(<(Y)1q
zMW>!-Db96g-M)2-;+KRAdA1(vE2mzsJ{FxCW$MM&otdQSEOzjiKtaY&wY)2pa;l}h
z(nq$bJY1!B%V$!P^mW&@g=R;JmpQVCN(r!XPxz^J&+5UnTYTL!!?R@bx38a3D6uwR
zr_PyN-rHXut!<WeUv!&sz0%EsmqJo++vAST)?O7W^lxkTom-{XJ<mIKEmjuP-oI#;
zNO{_Y^$od)_FgE<GtD~JyuWm%+uK8vYeWj?E}XT0%EJqatK&^p<izea;oLcSSG)Au
zYh2s;6u-M1oI4}$g!KCv4+Sq@d~Uc@>PBBz(VZOY*y%Z29t9o<JjN=0t$6CL&^>!s
zUylC1%lE_V^@*#aZY(K$k#$CIgXo*o<yo8mcJRG^8QE~QGm!a^TIAjS16DgFwizTz
zZ1b5L8ELZqwG~_J>xW{pcQ$=nwn2Z**JZa8X76X1U8S!c-}<(dYeOtc^x=~{(VybB
zuufi*ed$Hqj=js?Y@1-7o)F`=vccM$o9)iFjk-7fPRPADvpmhWc<+4maIUkP)=uU1
zeIRf?Fo2i)Y)9vW+dN_qo;q!tr+M<$lxQ`p6THFuK4$X$)s(t>tZQ?_jeQA`9z8z=
z?!0Xlb!}Ey>3mo>z=)kqJHxnU!y~;@*)^_wk4j(c?a64@(g-SLeQhnaIBU<73kL$G
z9FMi)Z+t9n8lm`2T7qeH0`JO+u3GOloaEj;{g;nXP))$jozCBsI;HcMt?_=C>&w3~
zI9E&hJ>w6NdiUQi%3m30Rz3Yv6{O`VzT<{h)`}U$b_&y*CkM^ceA-meU9rk3U9Ww4
zYr@tmZwn5#B`!?A>vuWBRd$<MPw~2D+s9rJo2AtE-fL&r?b^QKP~La0U5oq7&YWYh
zS=e)1?+W{I=be2EZ~GiQvGTQ0#_l6pb4_*1SE;<-8tXQD!F{$awdOkQ9apC+*2kWx
znfqS*#(JqOD`KjrTW+ac7}FfSvF9hxZ(c$6W~Cd)Tr+()h|W4Y)l;xdt?g3YqIFNB
zl@A^Id25NsYlg}1O~m=%t!!QUZFRJO#DPsZB6?dc7WP$hbZ7iJ$6?Mpv3=&c&g*rm
z_oFywsaoE2_si`topf1e>$as_Q%kmYa+OQIj^TL2DYGslO7xc>U%M3lMV?<#!oLJo
zKdQ(U_U+)@=5nA?<HoErXBIAEEL>InsO^aA9j>Z%b2*FIz9^<`&t~M<U#<CI_N8S?
z^Sqaoygo1EY7%B9{YP?ncJqn(o`w<o7^1m${k3qh=HAwP@auww{JZC=7#_E=dM~};
zS<1zY3-7LDvEV)VKW2yGJh#^l#T;E-vjrAhE#1#sEU<O1)#k(EOjGWy(!Dh+nm6#X
z^vkz5xJoZKNid(<di8hd6*1BN^?~mzts9Jk&uyQaB)HhET(nJlkDOD^fn$y<9SxtW
zBxk8UJC|MYkvGz~G~)F==QnP-eRppiTlwa>lWC>-+34j)2B8Nwo90Kz`kpT^xwhv)
zNRHNr`pd~$TcX(}zInJM#ZzNl+uK=(n)05{YK-JxdH+^eknc0UElq15`8DhD^=9XN
zb5V(9ml0BEyUY1w^~Ds$$t$<BObq_fH!nqceU@4CwI{bdH%?tJdrEAS&Wqm2a;c>A
z)%;T8mN&mdeBByr<B?&NrS+_E!c@5_ZwlUSOfsJ1em7-d>l=pcr_AoPxMe+D$NhB9
zvb1IQOD>3U?!LiewP^P{{hM_&eY9R~;fWPHli4%ZW;s`}v312uV*~kh!IrAUm(n(c
z?%DR(_C?ew23ae1cmL@O7RHkdE`LwWldxR$@G19+?S4G6S0B!L%JA%V+v@u*CgpYo
zS`%9wudR{(viN|nx%}7Ab&sZ97A=$C{&jT~YrLCTOq0`wZ?)^BtvhC^?OE+@mv%L7
zg|1eg@xeYLf0oYECXcli^qn>ln&iHs^L5yg7rRzmR0&RdCBnXNm+!LDuQprqj5cnW
zy!WAoUlxb1Pv?Z*MH~NmZJx;I!c{K6qGzV`j^7TlL5}Amj^v!$ad4u6O={NbTW3yQ
hmMPkGcFMci{~24$OS!J<GBBX`-J}_qK|MJz1^^a3)BXSe

literal 0
HcmV?d00001

diff --git a/tests/cases/gatk_RGQ.vcf.gz.tbi b/tests/cases/gatk_RGQ.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..792beb715455fdd4b892f3b7ae538f352a37c207
GIT binary patch
literal 172
zcmb2|=3rp}f&Xj_PR>jW{S0qU@8&(^Ai#RSmYq-MVx#ebM(swHhcoy*I67juzV5L2
zZ4u2ORbb?QOE%N~`)S>!9U0SZ=~%AY=50GK`P}@otJjBL&x<`?qETCM{rvx*mUYFi
h_L-}APQrt#cc!wvs!L6t70AGV?jC6dW{~T^7y!MPJIMe5

literal 0
HcmV?d00001

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 26e142d..c531141 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -362,3 +362,34 @@ def test_parse_variants_concordant():
 
     parse_variants('A', call, pos, results)
     assert results['alleles_concordant'] == 1
+
+
+def test_known_concordant_RGQ():
+    """ 0/0 calls from GATK have RGQ annotation instead of GQ
+        These should be treated the same, so when RGQ is present it should be
+        used instead of the QC when filtering variants.
+    """
+    filename = 'tests/cases/gatk_RGQ.vcf.gz'
+    call = VCF(filename, gts012=True)
+    positive = VCF(filename, gts012=True)
+    d, disc = site_concordancy(call, positive, call_samples=['NA12878'],
+                               positive_samples=['NA12878'],
+                               min_gq=0, min_dp=0)
+    assert d['alleles_hom_ref_concordant'] == 14
+
+
+def test_known_concordant_RGQ_min_qc_100():
+    """ 0/0 calls from GATK have RGQ annotation instead of GQ
+        These should be treated the same, so when RGQ is present it should be
+        used instead of the QC when filtering variants.
+
+        The max value of both GQ and RGQ is 99, so filtering for GQ or RGQ >=
+        100 should give us 0 hom ref concordant variants.
+    """
+    filename = 'tests/cases/gatk_RGQ.vcf.gz'
+    call = VCF(filename, gts012=True)
+    positive = VCF(filename, gts012=True)
+    d, disc = site_concordancy(call, positive, call_samples=['NA12878'],
+                               positive_samples=['NA12878'],
+                               min_gq=100, min_dp=0)
+    assert d['alleles_hom_ref_concordant'] == 0
diff --git a/vtools/evaluate.py b/vtools/evaluate.py
index c5a657b..91e18ee 100644
--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -45,6 +45,24 @@ def parse_variants(ref: str, call: List[str], pos: List[str],
             results['alleles_discordant'] += 1
 
 
+def RGQ_header_defined(vcf):
+    """ Determine whether the RGQ annotation is defined in the FORMAT header of
+    the vcf.
+
+    Since the header_iter does not return python dictionaries, we cannot easily
+    test if a certain key is set, hence the ugly code.
+    """
+    for header in vcf.header_iter():
+        try:
+            header['ID']
+        except KeyError:
+            continue
+        else:
+            if header['ID'] == 'RGQ':
+                return True
+    return False
+
+
 def site_concordancy(call_vcf: VCF,
                      positive_vcf: VCF,
                      call_samples: List[str],
@@ -99,6 +117,11 @@ def site_concordancy(call_vcf: VCF,
         "alleles_low_depth": 0
     }
 
+    # Determine if the 'RQC' annotation is present in the FORMAT header
+    # This is needed for avoid getting an KeyError when we want to see if the
+    # RQC annotation is set for a given variant
+    RGQ_present = RGQ_header_defined(call_vcf)
+
     # Keep track of the discordant sites
     discordant_count = 0
     discordant_records = list()
@@ -155,8 +178,15 @@ def site_concordancy(call_vcf: VCF,
                 continue
 
             # Get the quality requirements for the call site
-            c_gq = call_record.gt_quals[c_s]
             c_dp = call_record.gt_depths[c_s]
+            # If the 'RGQ' is defined, which is the case for homref calls from
+            # GATK, use this value instead of 'GQ'
+            # See
+            # https://gatkforums.broadinstitute.org/gatk/discussion/9907/genotypegvcfs-no-records-in-vcf  # noqa
+            if RGQ_present and call_record.format('RGQ') is not None:
+                c_gq = call_record.format('RGQ')[0][0]
+            else:
+                c_gq = call_record.gt_quals[c_s]
 
             # Did we fail any of the quality control checks?
             # We use this variable since we want to count all quality checks
-- 
GitLab