From 895f75418059cb824e715ffa3588c36f3f1bcee2 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Tue, 22 Oct 2019 08:55:15 +0200
Subject: [PATCH] Properly count 'no call' sites

cyvcf2 assigns 'no call' sites a DP and QC of -1, which caused the
statistics to be off since we checked for 'no call' sites (i.e. gt=3)
after all other checks.

The check for 'no call' sites was moved up to make sure only true sites
get to the QC/DP checks. Furthermore, the QC/DP checks are counted
independenly, since a site can fail multiple checks.

The test cases were updated to reflect these changes, and now all pass.
---
 tests/test_evaluate.py |  4 ++--
 vtools/evaluate.py     | 37 ++++++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index a7f92b3..c4e56c0 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -72,11 +72,11 @@ def BLANK_NA12878():
 
 
 def test_low_qual_30(BLANK_NA12878):
-    assert BLANK_NA12878['alleles_low_qual'] == 42
+    assert BLANK_NA12878['alleles_low_qual'] == 34
 
 
 def test_low_depth_20(BLANK_NA12878):
-    assert BLANK_NA12878['alleles_low_depth'] == 44
+    assert BLANK_NA12878['alleles_low_depth'] == 36
 
 
 def test_no_call(BLANK_NA12878):
diff --git a/vtools/evaluate.py b/vtools/evaluate.py
index bd3d3a0..edb3def 100644
--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -81,7 +81,6 @@ def site_concordancy(call_vcf: VCF,
                     and it_record.POS == pos_record.POS
                     and it_record.REF == pos_record.REF
                     and it_record.ALT == pos_record.ALT):
-
                 same.append(it_record)
 
         if len(same) != 1:
@@ -89,22 +88,38 @@ def site_concordancy(call_vcf: VCF,
 
         call_record = same[0]
         d['sites_considered'] += 1
-        d['alleles_considered'] += (2*len(positive_samples))
+        d['alleles_considered'] += (2 * len(positive_samples))
 
         for p_s, c_s in zip(pos_sample_idx, call_sample_idx):
             p_gt = pos_record.gt_types[p_s]
             c_gt = call_record.gt_types[c_s]
 
-            # If the site does not pass the quality requirements
+            # Is there missing data in one of the vcf files?
+            if p_gt == 3 or c_gt == 3:
+                d['alleles_no_call'] += 2
+                continue
+
+            # Get the quality requirements for the call site
             c_gq = call_record.gt_quals[c_s]
             c_dp = call_record.gt_depths[c_s]
-            print(f'{call_record.POS}\tc_gt: {c_gt} p_gt: {p_gt}')
 
-            if c_gq < min_gq or c_dp < min_dp:
-                if c_gq < min_gq:
-                    d['alleles_low_qual'] += 2
-                if c_dp < min_dp:
-                    d['alleles_low_depth'] += 2
+            # Did we fail any of the quality control checks?
+            # We use this variable since we want to count all quality checks
+            # independenly, and a single sample can fail multiple checks
+            failed_quality_check = False
+
+            # Is the QG of the call site below the threshold?
+            if c_gq < min_gq:
+                d['alleles_low_qual'] += 2
+                failed_quality_check = True
+
+            # Is the DP of the call site below the threshold?
+            if c_dp < min_dp:
+                d['alleles_low_depth'] += 2
+                failed_quality_check = True
+
+            # Go to the next variant if any of the quality checks failed
+            if failed_quality_check:
                 continue
 
             # If the site passed the quality requirements
@@ -143,8 +158,8 @@ def site_concordancy(call_vcf: VCF,
                 d['alleles_concordant'] += 2
                 d['alleles_hom_alt_concordant'] += 2
             else:
-                # the same has no call in one or more of the VCF files
-                d['alleles_no_call'] += 2
+                # This should not be reached
+                raise RuntimeError('Unhandled genotype call')
 
             # The current variant is discordant
             if d['alleles_discordant'] > discordant_count:
-- 
GitLab