From d37115cc742d6688b5dd72993a8a9d5e678a1ffa Mon Sep 17 00:00:00 2001 From: Redmar van den Berg <RedmarvandenBerg@lumc.nl> Date: Tue, 12 Nov 2019 16:12:08 +0100 Subject: [PATCH] Add support for RQC annotations to be used as GQ When GATK emits all sites, homref sites get the RQC annotation instead of the QC annotations. For vtools, these two annotations are used interchangeably, since they both contain genotype likelihoods. See the detailed explantation at https://gatkforums.broadinstitute.org/gatk/discussion/5115/haplotypecaller-genotypegvcfs-and-vqsr-for-alternatives-in-dbsnp --- tests/cases/gatk_RGQ.vcf.gz | Bin 0 -> 3544 bytes tests/cases/gatk_RGQ.vcf.gz.tbi | Bin 0 -> 172 bytes tests/test_evaluate.py | 31 +++++++++++++++++++++++++++++++ vtools/evaluate.py | 32 +++++++++++++++++++++++++++++++- 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 tests/cases/gatk_RGQ.vcf.gz create mode 100644 tests/cases/gatk_RGQ.vcf.gz.tbi diff --git a/tests/cases/gatk_RGQ.vcf.gz b/tests/cases/gatk_RGQ.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..46a7b8df09b91362d2bc28bf63ca99c40dbcdd75 GIT binary patch literal 3544 zcmb2|=3rp}f&Xj_PR>jWyLq=p#s)uj6RRz&Yt^c}@n%}Uuiwnm?kWkSEz6pb*q{<x zc{zb`o#`~?*)uonuR9;=z3qwE4Zke;G=m#q-(Ob0+xh(7zjwQLpWL!Td-d^}%0H)H z{uH13F@N9hcYhvVJ$mx&NBjG8W#`XXDnC!oUi@m|w0kz^yC0WtPA!ZNc^SEJ$6B?| zPnXP*t*<tkD|Y#0u94oe)9tQJ>}sdml~#7`R+-;Bxn|c9`<f*YDw8AshDG*vYt8f% zJDv3B?fzfc^J2GUf4Y8c&iU1!`Z>$|xSu}WQ1QujZ=6cJdPVKWmk}AgTMw<D|55SR z52cf<l^8#z28ur3pRf1-+2v!Oo?kYK|0Z4^eCCwuJ#RbPr<YHDvaC9G_L%Xy=~q+A z?Eg$Ss~^8FfB&CnqJQKyvM!&j?=GAbW5;mf;3Kp7V#zYE>IHh*H_5bn+%Z4R@Qtax z#C+yMXI}|{&nbrbdFthpZ_k;xd*}4kFHG3XKYxC+ZBDQH7jL_zJ07Q`Z$Fy#mL>nb z@3Al2O%DElDp7r)>{E(gw0|A{@fk7&-}*Bv&qo^lulpNYpgvP!S*Yc1{)biHAO1;x z@H}((-pRRtmiXG7oyJmIs*+jj<XZQ8!kj((Gd*T)TfX^(eDt!U^|G7RO}D=vQP-e- z_R;gukH3W%*#CcOYX3%P;_RYMk2A0M)t<(h|9@dq_-wWF3;Tb^YXWO_ol5_|v##pS zJdXX1rK+jHp>}Hf{g!QiD*sex<%h2+AHU2{xH)ahFHVsM(_N><I11V&H>y3DeD>)1 zx}51LZzuh9yVheLU%$ggkL%Z!M~O!FD-Oh8=v$l;XxC=1qabqrr1QU}9(LkKb5zBo zyJPe3zFq%A%;dvNzJ;9c=NvN7+kfuR>yP`*<)VDVZ=e6W=T!bnQQPmU?tRF#jD9v* zGDL0Xyyu&LvCa#<u-we==bx(IOtzjg<!eqZ_jmoYWx?`!{$l*DU$?sk{b}@Vk<j;^ z^KjFF_1&qiv;J33%3t&A+pElN-UaubU$wQJ#NhjG{;B1zpSIQLJb(J-r(n#nV2&^P zm8K^rp7kx;J6qM>@$tuhH<xJTg$Hemvz_msy(?Zge*5F&ueiPJRNiQn313*cPUWXj zr4HxU>qe0x-$M)eKOQ-0v!y%7q2By=jObAbc}v~y-MineUcaB=;vsXj?UR`j7IIk3 zoVES&grY6$rv{Wwt&a;~Z%jCOo!#KawwgK5Rkr@Ljc}?yZhrsS)gM0}Tu(AOT=q`T ze@auTocz6`e5xN`J+j();(76{;_X^xa)-*q*Y&Ea?iUS}?AZ77!rrB?YIcXzW-eKE zt?hW4w!X);<(uDnZ%?s}4Lv9yl`wz)#H9lB|2cEMOx^jT{zL))?L9G9T&t(9Tb{q( zF0Mz<KCb`X+}?F{R`Ls7dhPz!e5xz?_wMgm|8V77k2EA&F18En%_+Tq$1cRFXUYX9 z4}G_*P8OLvr{;Z0D=`v^`+VkgpqZ7Wt(d#|^W!>?z2xTBSUh{RB0BFw+m>H1-W*@N z`E>HSnU+?&?UZbfNcYIjjlXVxQ&97yVcnOYUo*I8W}ZD8nEN~B8Ec$j&AUaPYaV~O zxO{W++{&^m_9=T-+Dh-;X=!6=ZL`~^<+J6U8v7*s56*hAr)JuU%#y9DKe6|rrnhSF zKJL2(oG<2hzZIEhY$aQmXK{W8&jsIxbuIEu=X}-X#3w#q*BzU8be+qx+-BMJQ#&_w z_Y}I_sBJr+n_&N7=F}SAea&;3cQcfwa#%CvG|Y{@cR0`V=y#D7OHc0MZm489qa)8$ zb0G9t4`)H^+e1%pKM1PWxK3%uo!dTaI~FmE`xzB88oXGRe85cDi??OAEZ4K3R{2wc zNz=G8gO2)&Z%EN$xm}SrGjPGr*%Mvg7~d7R%le^8V$aDL2DuxiPFXKF*Zp8x$F$Fk zd!1uKMcy!9U_8$8LF;qimRPktw|LClH`p?1^Y5Ijc*Sq|p;J@7Hbi?fFov(@nQ~M0 zk_DUcY=_T#i=?tGRGEVoM=jaYA5!S=!FZGZLllR-Oe*V^T(8X`+Eb<8mx!-UsMAf$ zDRI_ZyVY#w=iMxlGFS7~rq$_p6*K<h5;KZ&aLtiPITCiYh;5y>-0PVW>oiWW^(<@I z))JGcsgSx(KJ}obV!BV}9;F8!VskS$c6TM4-?}N@kUzap&1iLRR%5mpbNexc+N`qO z8qYj9a@m*iRQ8C(p4?Hw6{NPqH1@?RJ}Y+7IYJA5o;_iqGUvYQ5y8z;d6TaTtk^pt zyXB5_v~s$|y&VlJeIf<9W^|gIO&56X_#kRoobco9JGX8}9je`4<bC{0*BdA0N9M|T zDqBL|&3JsN?9}zuOQUm?whJT|=(Wh)TEV_*y0K{ib8GKRKOvhrPx>a=Fr;gGm1OPQ z9D673s#o5gO<UJEzU4jtLyvQv(cvRfpX)A8(_?-jkh8~ZV~6w2rs#(q+wQRF8n0%% z9K9@Lwe%$GXRcX2Z<lUox_qlB*Yw)Gm1&%ps<(EVc56>lElk_9Rh{`7kDI6W?H3hK z&hjNh&5?Uzwr57MZj84Ro8{|;p_68um2EycalzM<9*@t@%m_H$5XErYVB^ks_Q)C6 zLu}akN>z5wVYBG2m0aR_;=$>fC8<K^%MPEprjz>mQ|qOvR(ZS5gt27>=qe<4X-bCN z(U_5PxID0MX5qX4YdW%TRm?r6VBQ=p>6IaPqBlIyZE_p`?Z6Q29qYt)Oj)(<(Y)1q zMW>!-Db96g-M)2-;+KRAdA1(vE2mzsJ{FxCW$MM&otdQSEOzjiKtaY&wY)2pa;l}h z(nq$bJY1!B%V$!P^mW&@g=R;JmpQVCN(r!XPxz^J&+5UnTYTL!!?R@bx38a3D6uwR zr_PyN-rHXut!<WeUv!&sz0%EsmqJo++vAST)?O7W^lxkTom-{XJ<mIKEmjuP-oI#; zNO{_Y^$od)_FgE<GtD~JyuWm%+uK8vYeWj?E}XT0%EJqatK&^p<izea;oLcSSG)Au zYh2s;6u-M1oI4}$g!KCv4+Sq@d~Uc@>PBBz(VZOY*y%Z29t9o<JjN=0t$6CL&^>!s zUylC1%lE_V^@*#aZY(K$k#$CIgXo*o<yo8mcJRG^8QE~QGm!a^TIAjS16DgFwizTz zZ1b5L8ELZqwG~_J>xW{pcQ$=nwn2Z**JZa8X76X1U8S!c-}<(dYeOtc^x=~{(VybB zuufi*ed$Hqj=js?Y@1-7o)F`=vccM$o9)iFjk-7fPRPADvpmhWc<+4maIUkP)=uU1 zeIRf?Fo2i)Y)9vW+dN_qo;q!tr+M<$lxQ`p6THFuK4$X$)s(t>tZQ?_jeQA`9z8z= z?!0Xlb!}Ey>3mo>z=)kqJHxnU!y~;@*)^_wk4j(c?a64@(g-SLeQhnaIBU<73kL$G z9FMi)Z+t9n8lm`2T7qeH0`JO+u3GOloaEj;{g;nXP))$jozCBsI;HcMt?_=C>&w3~ zI9E&hJ>w6NdiUQi%3m30Rz3Yv6{O`VzT<{h)`}U$b_&y*CkM^ceA-meU9rk3U9Ww4 zYr@tmZwn5#B`!?A>vuWBRd$<MPw~2D+s9rJo2AtE-fL&r?b^QKP~La0U5oq7&YWYh zS=e)1?+W{I=be2EZ~GiQvGTQ0#_l6pb4_*1SE;<-8tXQD!F{$awdOkQ9apC+*2kWx znfqS*#(JqOD`KjrTW+ac7}FfSvF9hxZ(c$6W~Cd)Tr+()h|W4Y)l;xdt?g3YqIFNB zl@A^Id25NsYlg}1O~m=%t!!QUZFRJO#DPsZB6?dc7WP$hbZ7iJ$6?Mpv3=&c&g*rm z_oFywsaoE2_si`topf1e>$as_Q%kmYa+OQIj^TL2DYGslO7xc>U%M3lMV?<#!oLJo zKdQ(U_U+)@=5nA?<HoErXBIAEEL>InsO^aA9j>Z%b2*FIz9^<`&t~M<U#<CI_N8S? z^Sqaoygo1EY7%B9{YP?ncJqn(o`w<o7^1m${k3qh=HAwP@auww{JZC=7#_E=dM~}; zS<1zY3-7LDvEV)VKW2yGJh#^l#T;E-vjrAhE#1#sEU<O1)#k(EOjGWy(!Dh+nm6#X z^vkz5xJoZKNid(<di8hd6*1BN^?~mzts9Jk&uyQaB)HhET(nJlkDOD^fn$y<9SxtW zBxk8UJC|MYkvGz~G~)F==QnP-eRppiTlwa>lWC>-+34j)2B8Nwo90Kz`kpT^xwhv) zNRHNr`pd~$TcX(}zInJM#ZzNl+uK=(n)05{YK-JxdH+^eknc0UElq15`8DhD^=9XN zb5V(9ml0BEyUY1w^~Ds$$t$<BObq_fH!nqceU@4CwI{bdH%?tJdrEAS&Wqm2a;c>A z)%;T8mN&mdeBByr<B?&NrS+_E!c@5_ZwlUSOfsJ1em7-d>l=pcr_AoPxMe+D$NhB9 zvb1IQOD>3U?!LiewP^P{{hM_&eY9R~;fWPHli4%ZW;s`}v312uV*~kh!IrAUm(n(c z?%DR(_C?ew23ae1cmL@O7RHkdE`LwWldxR$@G19+?S4G6S0B!L%JA%V+v@u*CgpYo zS`%9wudR{(viN|nx%}7Ab&sZ97A=$C{&jT~YrLCTOq0`wZ?)^BtvhC^?OE+@mv%L7 zg|1eg@xeYLf0oYECXcli^qn>ln&iHs^L5yg7rRzmR0&RdCBnXNm+!LDuQprqj5cnW zy!WAoUlxb1Pv?Z*MH~NmZJx;I!c{K6qGzV`j^7TlL5}Amj^v!$ad4u6O={NbTW3yQ hmMPkGcFMci{~24$OS!J<GBBX`-J}_qK|MJz1^^a3)BXSe literal 0 HcmV?d00001 diff --git a/tests/cases/gatk_RGQ.vcf.gz.tbi b/tests/cases/gatk_RGQ.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..792beb715455fdd4b892f3b7ae538f352a37c207 GIT binary patch literal 172 zcmb2|=3rp}f&Xj_PR>jW{S0qU@8&(^Ai#RSmYq-MVx#ebM(swHhcoy*I67juzV5L2 zZ4u2ORbb?QOE%N~`)S>!9U0SZ=~%AY=50GK`P}@otJjBL&x<`?qETCM{rvx*mUYFi h_L-}APQrt#cc!wvs!L6t70AGV?jC6dW{~T^7y!MPJIMe5 literal 0 HcmV?d00001 diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 26e142d..c531141 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -362,3 +362,34 @@ def test_parse_variants_concordant(): parse_variants('A', call, pos, results) assert results['alleles_concordant'] == 1 + + +def test_known_concordant_RGQ(): + """ 0/0 calls from GATK have RGQ annotation instead of GQ + These should be treated the same, so when RGQ is present it should be + used instead of the QC when filtering variants. + """ + filename = 'tests/cases/gatk_RGQ.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['NA12878'], + min_gq=0, min_dp=0) + assert d['alleles_hom_ref_concordant'] == 14 + + +def test_known_concordant_RGQ_min_qc_100(): + """ 0/0 calls from GATK have RGQ annotation instead of GQ + These should be treated the same, so when RGQ is present it should be + used instead of the QC when filtering variants. + + The max value of both GQ and RGQ is 99, so filtering for GQ or RGQ >= + 100 should give us 0 hom ref concordant variants. + """ + filename = 'tests/cases/gatk_RGQ.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['NA12878'], + min_gq=100, min_dp=0) + assert d['alleles_hom_ref_concordant'] == 0 diff --git a/vtools/evaluate.py b/vtools/evaluate.py index c5a657b..91e18ee 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -45,6 +45,24 @@ def parse_variants(ref: str, call: List[str], pos: List[str], results['alleles_discordant'] += 1 +def RGQ_header_defined(vcf): + """ Determine whether the RGQ annotation is defined in the FORMAT header of + the vcf. + + Since the header_iter does not return python dictionaries, we cannot easily + test if a certain key is set, hence the ugly code. + """ + for header in vcf.header_iter(): + try: + header['ID'] + except KeyError: + continue + else: + if header['ID'] == 'RGQ': + return True + return False + + def site_concordancy(call_vcf: VCF, positive_vcf: VCF, call_samples: List[str], @@ -99,6 +117,11 @@ def site_concordancy(call_vcf: VCF, "alleles_low_depth": 0 } + # Determine if the 'RQC' annotation is present in the FORMAT header + # This is needed for avoid getting an KeyError when we want to see if the + # RQC annotation is set for a given variant + RGQ_present = RGQ_header_defined(call_vcf) + # Keep track of the discordant sites discordant_count = 0 discordant_records = list() @@ -155,8 +178,15 @@ def site_concordancy(call_vcf: VCF, continue # Get the quality requirements for the call site - c_gq = call_record.gt_quals[c_s] c_dp = call_record.gt_depths[c_s] + # If the 'RGQ' is defined, which is the case for homref calls from + # GATK, use this value instead of 'GQ' + # See + # https://gatkforums.broadinstitute.org/gatk/discussion/9907/genotypegvcfs-no-records-in-vcf # noqa + if RGQ_present and call_record.format('RGQ') is not None: + c_gq = call_record.format('RGQ')[0][0] + else: + c_gq = call_record.gt_quals[c_s] # Did we fail any of the quality control checks? # We use this variable since we want to count all quality checks -- GitLab