Add option to filter records based on read depth

d5dd7f1a · van den Berg · ec9bba93 · d5dd7f1a · d5dd7f1a · d5dd7f1a
Commit d5dd7f1a authored 5 years ago by van den Berg
--- a/README.md
+++ b/README.md
@@ -135,11 +135,15 @@ technologies. E.g, when comparing a WES VCF file vs a SNP array, this
 tool can be quite useful.

 Output is a simple JSON file listing counts of concordant and discordant
-alleles. 
+alleles and some other metrics. It is also possible to output the discordant
+VCF records.

 Multisample VCF files are allowed; the samples to be evaluated have to be set 
 through a CLI argument.

+Variants from the `--call-vcf` are filtered to have a Genotype Quality (GQ) of
+at least 30 by default. This can be overruled by specifying `--min-qual 0`.
+The optional flag `--min-depth` can be used to set the minimum read coverage.

 #### Usage

@@ -156,6 +160,8 @@ Options:
                                called multiple times  [required]
  -s, --stats PATH              Path to output stats json file
  -dc, --discordant PATH        Path to output discordant VCF file
+  -mq, --min-qual FLOAT         Minimum quality of variants to consider
+  -md, --min-depth FLOAT        Minimum depth of variants to consider
  --help                        Show this message and exit.
 ```


--- a/vtools/cli.py
+++ b/vtools/cli.py
@@ -36,12 +36,16 @@ from .gcoverage import RefRecord, region_coverages
 @click.option("-dc", "--discordant", type=click.Path(writable=True),
              help="Path to output discordant VCF file",
              required=False)
-def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, stats,
-                 discordant):
+@click.option("-mq", "--min-qual", type=float,
+              help="Minimum quality of variants to consider", default=30)
+@click.option("-md", "--min-depth", type=float,
+              help="Minimum depth of variants to consider", default=0)
+def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples,
+                 min_qual, min_depth, stats, discordant):
    c_vcf = VCF(call_vcf, gts012=True)
    p_vcf = VCF(positive_vcf, gts012=True)
    st, disc = site_concordancy(c_vcf, p_vcf, call_samples,
-                                positive_samples)
+                                positive_samples, min_qual, min_depth)
    # Write the stats json file
    with click.open_file(stats, 'w') as fout:
        print(json.dumps(st), file=fout)

--- a/vtools/evaluate.py
+++ b/vtools/evaluate.py
@@ -15,7 +15,8 @@ def site_concordancy(call_vcf: VCF,
                     positive_vcf: VCF,
                     call_samples: List[str],
                     positive_samples: List[str],
-                     min_gq: float = 30) -> Dict[str, float]:
+                     min_gq: float = 30,
+                     min_dp: float = 0) -> Dict[str, float]:
    """
    Calculate concordance between sites of two call sets,
    of which one contains known true positives.
@@ -32,9 +33,11 @@ def site_concordancy(call_vcf: VCF,
    :param call_vcf: The VCF to evaluate. Must have gts012=True
    :param positive_vcf: The VCF file containing true positive sites.
        Must have gts012=True
-    :param call_samples: List of samples in `call_vcf` to consider
+    :param call_samples: List of samples in `call_vcf` to consider.
    :param positive_samples: List of samples in `positive_vcf` to consider.
    Must have same length than `call_samples`
+    :param min_qg: Minimum quality of variants to consider.
+    :param min_dp: Minimum depth of variants to consider.

    :raises: ValueError if sample lists are not of same size.

@@ -58,7 +61,8 @@ def site_concordancy(call_vcf: VCF,
        "alleles_concordant": 0,
        "alleles_discordant": 0,
        "alleles_no_call": 0,
-        "alleles_low_qual": 0
+        "alleles_low_qual": 0,
+        "alleles_low_depth": 0
    }
    discordant_count = 0
    discordant_records = list()
@@ -91,10 +95,19 @@ def site_concordancy(call_vcf: VCF,
            p_gt = pos_record.gt_types[p_s]
            c_gt = call_record.gt_types[c_s]

+            # If the site does not pass the quality requirements
            c_gq = call_record.gt_quals[c_s]
-            if c_gq < min_gq:
-                d['alleles_low_qual'] += 2
-            elif p_gt == 0 and c_gt == 0:
+            c_dp = call_record.gt_depths[c_s]
+
+            if c_gq < min_gq or c_dp < min_dp:
+                if c_gq < min_gq:
+                    d['alleles_low_qual'] += 2
+                if c_dp < min_dp:
+                    d['alleles_low_depth'] += 2
+                continue
+
+            # If the site passed the quality requirements
+            if p_gt == 0 and c_gt == 0:
                # both homozygous reference
                d['alleles_concordant'] += 2
                d['alleles_hom_ref_concordant'] += 2