From 86bd745bee0da0fb9a50b0b24b041a0d27e6c7e2 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Thu, 24 Oct 2019 10:25:33 +0200
Subject: [PATCH] Add check for decomposed calls

Before, we would iterate over decomposed called variants until we found
the variant where the REF and ALT matched the one in the positive vcf.
However, this meant that sites where the called vcf had different ALTs
could not be processed.

To resolve this, we had to drop support for decomposed variants.
---
 tests/cases/dummy_decomposed.vcf.gz     | Bin 0 -> 1380 bytes
 tests/cases/dummy_decomposed.vcf.gz.tbi | Bin 0 -> 170 bytes
 tests/test_evaluate.py                  |  26 +++++++++++++
 vtools/evaluate.py                      |  47 ++++++++++++++----------
 4 files changed, 54 insertions(+), 19 deletions(-)
 create mode 100644 tests/cases/dummy_decomposed.vcf.gz
 create mode 100644 tests/cases/dummy_decomposed.vcf.gz.tbi

diff --git a/tests/cases/dummy_decomposed.vcf.gz b/tests/cases/dummy_decomposed.vcf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..09642f104fb1e3a673add5c28804af5297c7468d
GIT binary patch
literal 1380
zcmb2|=3rp}f&Xj_PR>jW?yPUa?&se&6Q~RO&$X!X2A77G=ih0|dHq6EkL4WSe~qbj
zvFa@`t?pGS>R<o;l2x_bICagrfH#Q;<ajRGefs_G-EQk&kCLV?sZs6HUHzy3`op~o
zCz^|kKfIfBCH}{L9;W^BD<0o1JINg%EdG1W@8&g+vg1}T>MvG0x!NV=-|f0tpG-Of
z&V0@aR$_3u+P1i9y};?DBbT`B3R<RXoXnc6krcPUL{nRC|KG1ujAzJLocwO_d5^$-
zp${=a8+RzW+n=uOloPtqzs%0m_d(V^y@wNQ;*`P)SSE(=dGK+!-ruuZoaL7_g)V<$
zZX+yu<3h>tOG|FqAC4$7cosS_<@I#OUDMSMAAUOf{So_VtCpUeZxePgg}E_inxJ<j
zkLO>eq@0!xY1M1Gye+X#lfpdN+M^^LibI~+c!hZ$QOSFIIzi^1>8p5oo0s#0=P&kn
zv&oCeUTBr6efmYVukGF2SgtUapWN1N#c;gO>tUzen+y9FxOmHn{dwJ&e(_(s<?;2J
z`&w5-bZz>jAhi1NtG*|8bzX-w|A=b;EIGDG|MT5%H8X43d|TICb;`N8T&4Gj`NV}x
z>lPPncMUi-HRZi?eXWkENpoE^6XUO_@YGAH{OkCp{BSsFWh$xv{4u9gVdbhjDwXUH
z*sV6Vdu`0E(^$y+J>&2@%c!Dxw%d~0bLy_$_AlU8I}@k2aH{!qqa%+Lv%h`JmkpRV
z?fGXpF`Y@)oz|;m_lZRwoy1cmakhBcRp|qdK82{Ce)z}cpy%Od)4%K7E!?85nl)pJ
zbljg1rGLD0GECO7JlXK*!`3ximVwhbUvIc~uIy&^5y?$Sx{9d|C#%%URCkF+PwXz)
z-x|dxBKgHVOoeCH&bR9v+w2aouvSkHv8l4t$cvnJ?Wx`LuT^$+3Hqs@9S_@OO?TLN
zrFd3ohIYyoex|84u^&u()J~jv=5UF#c7iVVxnzqX-5*({^ZveM;?dnP(S@@;AyLU8
z?bWW{+0!Q1$x8i9S@&$ujvOu?+nfbW${Wu<3~XGU#~Z3t+dg~ynvU<&=T}T>w9dXD
z%$EE0&=lFdj}Ff4eU@?LQ_QcLKhI<uA6(~O$0zjb4EIHmD)BG!{i{CwS+OhMJ=9iu
z+gj~<<4wyBR=#rH=oGkil0;V5etVW}cbumm`e4aAb)uik&-|W;lW!C;y6PT3u`qUh
zetNlbYLR(3Tcgo$0j&=%nwRZ=IjhR>h|8ySl>}z~<I)H}{O?D3>K*kf8|S8#dhQO|
zxhtgn*45lF=KY;!tV_EbeYy4rE!9}L`1X;u$>v`13)gv?Tn{l>TL1QPoc))LZajOh
z%l|&|Le}0d;X>|a@tChoo&G*KUz;lZ{rn9Q?r=Z<+Ercj?#G5XJPh*#VvT*1zP{R@
zu<KdSjYJD`t3%smHk<R-1Q!>+4-qn$r5Bbb7H9vVsk7YptC}<G{^RqO9y+ev*T&1g
z`beOrVuFI)McJtdw->8@i@C16&NXDss~ZK<o1d=wHd*v==7w3j;uZ%TX1#N*Ba}P&
z>Xy=5w-|n$aLc`_q?ehaX1CMb=W0pJch%XMyH{o2JocaIZ#&DT4n8KqMYH=DFKS)q
zQWR5^x+dMY#!aDlmp-S94sX=gK*dKg+Dqn#o>(v|b350;dmH9yx|sTG2sv@}L5%xC
z??pGQD)lCYIGNfcRf}EKIlQ^V!=!ZWF_tDlE+dI)XShN%UaS<Fy+zZw!)IoqiKekn
zxGB%>y<AmR&mO87%7i~^YSJ}0@IvIw7B;i&uMekOp0SZN?ej&R&gp4)GrOA`W=17k
g%Pwa2^SN91S^Q<AUi)D_1_tzURGNVqRH}k80IN@lCIA2c

literal 0
HcmV?d00001

diff --git a/tests/cases/dummy_decomposed.vcf.gz.tbi b/tests/cases/dummy_decomposed.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..b61781774ec3e28d970e66d942d351f4637d2019
GIT binary patch
literal 170
zcmb2|=3rp}f&Xj_PR>jWy$o+p@8mt?Ai#PdQb*a5H)ThYZX<`xZe}4S_bHFA-I;KG
zx`2tpr+p@$mssw)t-Q3uV#=+##dmF{RqeZHsXzPQrnuhaxw7A@-o4#W_wU^5AAZIE
h?o3eeoP-<wJ8df*XY%>}tUv|^bk|5TFoWC%#sJJUKY;)M

literal 0
HcmV?d00001

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 712f755..85bd91a 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -132,3 +132,29 @@ def test_phased_call_file():
                        match='Phased variants are not supported'):
         site_concordancy(call, positive, call_samples=['BLANK'],
                          positive_samples=['NA12878'])
+
+
+def test_decomposed_positive_file():
+    """ Test error message when the positive vcf contains decomposed variants
+    """
+    filename = 'tests/cases/gatk.vcf.gz'
+    decomposed = 'tests/cases/dummy_decomposed.vcf.gz'
+    call = VCF(filename, gts012=True)
+    positive = VCF(decomposed, gts012=True)
+    with pytest.raises(NotImplementedError,
+                       match='Decomposed variants are not supported'):
+        site_concordancy(call, positive, call_samples=['BLANK'],
+                         positive_samples=['NA12878'])
+
+
+def test_decomposed_call_file():
+    """ Test error message when the positive vcf contains decomposed variants
+    """
+    filename = 'tests/cases/gatk.vcf.gz'
+    decomposed = 'tests/cases/dummy_decomposed.vcf.gz'
+    call = VCF(decomposed, gts012=True)
+    positive = VCF(filename, gts012=True)
+    with pytest.raises(NotImplementedError,
+                       match='Decomposed variants are not supported'):
+        site_concordancy(call, positive, call_samples=['BLANK'],
+                         positive_samples=['NA12878'])
diff --git a/vtools/evaluate.py b/vtools/evaluate.py
index ea276ca..bb751ec 100644
--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -7,6 +7,7 @@ vtools.evaluate
 :license: MIT
 """
 from typing import List, Dict
+from types import SimpleNamespace
 
 from cyvcf2 import VCF
 
@@ -91,40 +92,48 @@ def site_concordancy(call_vcf: VCF,
         "alleles_low_qual": 0,
         "alleles_low_depth": 0
     }
+
+    # Keep track of the discordant sites
     discordant_count = 0
     discordant_records = list()
 
+    # Keep track of the previous record, to make sure the positive vcf file
+    # does not contain decomposed records
+    prev_record = SimpleNamespace()
+    prev_record.POS = -1
+    prev_record.CHROM = -1
+
     for pos_record in positive_vcf:
+        # Check to make sure the POS and CHROM are different from the previous
+        # variant
+        if (prev_record.POS == pos_record.POS and
+                prev_record.CHROM == pos_record.CHROM):
+            raise NotImplementedError('Decomposed variants are not supported')
+        else:
+            prev_record.POS = pos_record.POS
+            prev_record.CHROM = pos_record.CHROM
+
         d['total_sites'] += 1
         query_str = "{0}:{1}-{2}".format(
             pos_record.CHROM,
-            pos_record.start,
+            pos_record.start+1,
             pos_record.end
         )
 
-        it = call_vcf(query_str)
-        same = []
-        # If the vcf file has been decomposed, there can be multiple sites with
-        # the same CHROM and POS, which is why we have to iterate over all the
-        # sites that are returned
-        for it_record in it:
-            # We only want to consider sites that have the same CHROM, POS, REF
-            # and ALT as the positive vcf, since otherwise we might consider
-            # A/T and A/G as identical since they are both heterozygous
-            if (it_record.CHROM == pos_record.CHROM
-                    and it_record.POS == pos_record.POS
-                    and it_record.REF == pos_record.REF
-                    and it_record.ALT == pos_record.ALT):
-                same.append(it_record)
+        call_records = list(call_vcf(query_str))
 
         # If the variant is not present in the call vcf
-        if len(same) == 0:
+        if len(call_records) == 0:
             d['alleles_no_call'] += 2
-
-        if len(same) != 1:
             continue
 
-        call_record = same[0]
+        # If there are multiple variants with the same CHROM and POS, the
+        # variant in the call vcf file must have been decomposted
+        if len(call_records) > 1:
+            raise NotImplementedError('Decomposed variants are not supported')
+
+        # Now we know there is only one call record
+        call_record = call_records[0]
 
         d['sites_considered'] += 1
         d['alleles_considered'] += (2 * len(positive_samples))
-- 
GitLab