From d5dd7f1ae551acf5e6157dd594077abdac82d74d Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Mon, 26 Aug 2019 10:03:18 +0200
Subject: [PATCH] Add option to filter records based on read depth

---
 README.md          |  8 +++++++-
 vtools/cli.py      | 10 +++++++---
 vtools/evaluate.py | 25 +++++++++++++++++++------
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index a998cd6..b613410 100644
--- a/README.md
+++ b/README.md
@@ -135,11 +135,15 @@ technologies. E.g, when comparing a WES VCF file vs a SNP array, this
 tool can be quite useful.
 
 Output is a simple JSON file listing counts of concordant and discordant
-alleles. 
+alleles and some other metrics. It is also possible to output the discordant
+VCF records.
 
 Multisample VCF files are allowed; the samples to be evaluated have to be set 
 through a CLI argument.
 
+Variants from the `--call-vcf` are filtered to have a Genotype Quality (GQ) of
+at least 30 by default. This can be overruled by specifying `--min-qual 0`.
+The optional flag `--min-depth` can be used to set the minimum read coverage.
 
 #### Usage
 
@@ -156,6 +160,8 @@ Options:
                                 called multiple times  [required]
   -s, --stats PATH              Path to output stats json file
   -dc, --discordant PATH        Path to output discordant VCF file
+  -mq, --min-qual FLOAT         Minimum quality of variants to consider
+  -md, --min-depth FLOAT        Minimum depth of variants to consider
   --help                        Show this message and exit.
 ```
 
diff --git a/vtools/cli.py b/vtools/cli.py
index 77d54a7..7f95ef9 100644
--- a/vtools/cli.py
+++ b/vtools/cli.py
@@ -36,12 +36,16 @@ from .gcoverage import RefRecord, region_coverages
 @click.option("-dc", "--discordant", type=click.Path(writable=True),
               help="Path to output discordant VCF file",
               required=False)
-def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, stats,
-                 discordant):
+@click.option("-mq", "--min-qual", type=float,
+              help="Minimum quality of variants to consider", default=30)
+@click.option("-md", "--min-depth", type=float,
+              help="Minimum depth of variants to consider", default=0)
+def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples,
+                 min_qual, min_depth, stats, discordant):
     c_vcf = VCF(call_vcf, gts012=True)
     p_vcf = VCF(positive_vcf, gts012=True)
     st, disc = site_concordancy(c_vcf, p_vcf, call_samples,
-                                positive_samples)
+                                positive_samples, min_qual, min_depth)
     # Write the stats json file
     with click.open_file(stats, 'w') as fout:
         print(json.dumps(st), file=fout)
diff --git a/vtools/evaluate.py b/vtools/evaluate.py
index 089e6a9..5593bf7 100644
--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -15,7 +15,8 @@ def site_concordancy(call_vcf: VCF,
                      positive_vcf: VCF,
                      call_samples: List[str],
                      positive_samples: List[str],
-                     min_gq: float = 30) -> Dict[str, float]:
+                     min_gq: float = 30,
+                     min_dp: float = 0) -> Dict[str, float]:
     """
     Calculate concordance between sites of two call sets,
     of which one contains known true positives.
@@ -32,9 +33,11 @@ def site_concordancy(call_vcf: VCF,
     :param call_vcf: The VCF to evaluate. Must have gts012=True
     :param positive_vcf: The VCF file containing true positive sites.
         Must have gts012=True
-    :param call_samples: List of samples in `call_vcf` to consider
+    :param call_samples: List of samples in `call_vcf` to consider.
     :param positive_samples: List of samples in `positive_vcf` to consider.
     Must have same length than `call_samples`
+    :param min_qg: Minimum quality of variants to consider.
+    :param min_dp: Minimum depth of variants to consider.
 
     :raises: ValueError if sample lists are not of same size.
 
@@ -58,7 +61,8 @@ def site_concordancy(call_vcf: VCF,
         "alleles_concordant": 0,
         "alleles_discordant": 0,
         "alleles_no_call": 0,
-        "alleles_low_qual": 0
+        "alleles_low_qual": 0,
+        "alleles_low_depth": 0
     }
     discordant_count = 0
     discordant_records = list()
@@ -91,10 +95,19 @@ def site_concordancy(call_vcf: VCF,
             p_gt = pos_record.gt_types[p_s]
             c_gt = call_record.gt_types[c_s]
 
+            # If the site does not pass the quality requirements
             c_gq = call_record.gt_quals[c_s]
-            if c_gq < min_gq:
-                d['alleles_low_qual'] += 2
-            elif p_gt == 0 and c_gt == 0:
+            c_dp = call_record.gt_depths[c_s]
+
+            if c_gq < min_gq or c_dp < min_dp:
+                if c_gq < min_gq:
+                    d['alleles_low_qual'] += 2
+                if c_dp < min_dp:
+                    d['alleles_low_depth'] += 2
+                continue
+
+            # If the site passed the quality requirements
+            if p_gt == 0 and c_gt == 0:
                 # both homozygous reference
                 d['alleles_concordant'] += 2
                 d['alleles_hom_ref_concordant'] += 2
-- 
GitLab