From d5dd7f1ae551acf5e6157dd594077abdac82d74d Mon Sep 17 00:00:00 2001 From: Redmar van den Berg <RedmarvandenBerg@lumc.nl> Date: Mon, 26 Aug 2019 10:03:18 +0200 Subject: [PATCH] Add option to filter records based on read depth --- README.md | 8 +++++++- vtools/cli.py | 10 +++++++--- vtools/evaluate.py | 25 +++++++++++++++++++------ 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a998cd6..b613410 100644 --- a/README.md +++ b/README.md @@ -135,11 +135,15 @@ technologies. E.g, when comparing a WES VCF file vs a SNP array, this tool can be quite useful. Output is a simple JSON file listing counts of concordant and discordant -alleles. +alleles and some other metrics. It is also possible to output the discordant +VCF records. Multisample VCF files are allowed; the samples to be evaluated have to be set through a CLI argument. +Variants from the `--call-vcf` are filtered to have a Genotype Quality (GQ) of +at least 30 by default. This can be overruled by specifying `--min-qual 0`. +The optional flag `--min-depth` can be used to set the minimum read coverage. #### Usage @@ -156,6 +160,8 @@ Options: called multiple times [required] -s, --stats PATH Path to output stats json file -dc, --discordant PATH Path to output discordant VCF file + -mq, --min-qual FLOAT Minimum quality of variants to consider + -md, --min-depth FLOAT Minimum depth of variants to consider --help Show this message and exit. ``` diff --git a/vtools/cli.py b/vtools/cli.py index 77d54a7..7f95ef9 100644 --- a/vtools/cli.py +++ b/vtools/cli.py @@ -36,12 +36,16 @@ from .gcoverage import RefRecord, region_coverages @click.option("-dc", "--discordant", type=click.Path(writable=True), help="Path to output discordant VCF file", required=False) -def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, stats, - discordant): +@click.option("-mq", "--min-qual", type=float, + help="Minimum quality of variants to consider", default=30) +@click.option("-md", "--min-depth", type=float, + help="Minimum depth of variants to consider", default=0) +def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, + min_qual, min_depth, stats, discordant): c_vcf = VCF(call_vcf, gts012=True) p_vcf = VCF(positive_vcf, gts012=True) st, disc = site_concordancy(c_vcf, p_vcf, call_samples, - positive_samples) + positive_samples, min_qual, min_depth) # Write the stats json file with click.open_file(stats, 'w') as fout: print(json.dumps(st), file=fout) diff --git a/vtools/evaluate.py b/vtools/evaluate.py index 089e6a9..5593bf7 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -15,7 +15,8 @@ def site_concordancy(call_vcf: VCF, positive_vcf: VCF, call_samples: List[str], positive_samples: List[str], - min_gq: float = 30) -> Dict[str, float]: + min_gq: float = 30, + min_dp: float = 0) -> Dict[str, float]: """ Calculate concordance between sites of two call sets, of which one contains known true positives. @@ -32,9 +33,11 @@ def site_concordancy(call_vcf: VCF, :param call_vcf: The VCF to evaluate. Must have gts012=True :param positive_vcf: The VCF file containing true positive sites. Must have gts012=True - :param call_samples: List of samples in `call_vcf` to consider + :param call_samples: List of samples in `call_vcf` to consider. :param positive_samples: List of samples in `positive_vcf` to consider. Must have same length than `call_samples` + :param min_qg: Minimum quality of variants to consider. + :param min_dp: Minimum depth of variants to consider. :raises: ValueError if sample lists are not of same size. @@ -58,7 +61,8 @@ def site_concordancy(call_vcf: VCF, "alleles_concordant": 0, "alleles_discordant": 0, "alleles_no_call": 0, - "alleles_low_qual": 0 + "alleles_low_qual": 0, + "alleles_low_depth": 0 } discordant_count = 0 discordant_records = list() @@ -91,10 +95,19 @@ def site_concordancy(call_vcf: VCF, p_gt = pos_record.gt_types[p_s] c_gt = call_record.gt_types[c_s] + # If the site does not pass the quality requirements c_gq = call_record.gt_quals[c_s] - if c_gq < min_gq: - d['alleles_low_qual'] += 2 - elif p_gt == 0 and c_gt == 0: + c_dp = call_record.gt_depths[c_s] + + if c_gq < min_gq or c_dp < min_dp: + if c_gq < min_gq: + d['alleles_low_qual'] += 2 + if c_dp < min_dp: + d['alleles_low_depth'] += 2 + continue + + # If the site passed the quality requirements + if p_gt == 0 and c_gt == 0: # both homozygous reference d['alleles_concordant'] += 2 d['alleles_hom_ref_concordant'] += 2 -- GitLab