Commit d5dd7f1a authored by rrvandenberg's avatar rrvandenberg

Add option to filter records based on read depth

parent ec9bba93
Pipeline #2742 failed with stage
in 24 seconds
......@@ -135,11 +135,15 @@ technologies. E.g, when comparing a WES VCF file vs a SNP array, this
tool can be quite useful.
Output is a simple JSON file listing counts of concordant and discordant
alleles.
alleles and some other metrics. It is also possible to output the discordant
VCF records.
Multisample VCF files are allowed; the samples to be evaluated have to be set
through a CLI argument.
Variants from the `--call-vcf` are filtered to have a Genotype Quality (GQ) of
at least 30 by default. This can be overruled by specifying `--min-qual 0`.
The optional flag `--min-depth` can be used to set the minimum read coverage.
#### Usage
......@@ -156,6 +160,8 @@ Options:
called multiple times [required]
-s, --stats PATH Path to output stats json file
-dc, --discordant PATH Path to output discordant VCF file
-mq, --min-qual FLOAT Minimum quality of variants to consider
-md, --min-depth FLOAT Minimum depth of variants to consider
--help Show this message and exit.
```
......
......@@ -36,12 +36,16 @@ from .gcoverage import RefRecord, region_coverages
@click.option("-dc", "--discordant", type=click.Path(writable=True),
help="Path to output discordant VCF file",
required=False)
def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, stats,
discordant):
@click.option("-mq", "--min-qual", type=float,
help="Minimum quality of variants to consider", default=30)
@click.option("-md", "--min-depth", type=float,
help="Minimum depth of variants to consider", default=0)
def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples,
min_qual, min_depth, stats, discordant):
c_vcf = VCF(call_vcf, gts012=True)
p_vcf = VCF(positive_vcf, gts012=True)
st, disc = site_concordancy(c_vcf, p_vcf, call_samples,
positive_samples)
positive_samples, min_qual, min_depth)
# Write the stats json file
with click.open_file(stats, 'w') as fout:
print(json.dumps(st), file=fout)
......
......@@ -15,7 +15,8 @@ def site_concordancy(call_vcf: VCF,
positive_vcf: VCF,
call_samples: List[str],
positive_samples: List[str],
min_gq: float = 30) -> Dict[str, float]:
min_gq: float = 30,
min_dp: float = 0) -> Dict[str, float]:
"""
Calculate concordance between sites of two call sets,
of which one contains known true positives.
......@@ -32,9 +33,11 @@ def site_concordancy(call_vcf: VCF,
:param call_vcf: The VCF to evaluate. Must have gts012=True
:param positive_vcf: The VCF file containing true positive sites.
Must have gts012=True
:param call_samples: List of samples in `call_vcf` to consider
:param call_samples: List of samples in `call_vcf` to consider.
:param positive_samples: List of samples in `positive_vcf` to consider.
Must have same length than `call_samples`
:param min_qg: Minimum quality of variants to consider.
:param min_dp: Minimum depth of variants to consider.
:raises: ValueError if sample lists are not of same size.
......@@ -58,7 +61,8 @@ def site_concordancy(call_vcf: VCF,
"alleles_concordant": 0,
"alleles_discordant": 0,
"alleles_no_call": 0,
"alleles_low_qual": 0
"alleles_low_qual": 0,
"alleles_low_depth": 0
}
discordant_count = 0
discordant_records = list()
......@@ -91,10 +95,19 @@ def site_concordancy(call_vcf: VCF,
p_gt = pos_record.gt_types[p_s]
c_gt = call_record.gt_types[c_s]
# If the site does not pass the quality requirements
c_gq = call_record.gt_quals[c_s]
if c_gq < min_gq:
d['alleles_low_qual'] += 2
elif p_gt == 0 and c_gt == 0:
c_dp = call_record.gt_depths[c_s]
if c_gq < min_gq or c_dp < min_dp:
if c_gq < min_gq:
d['alleles_low_qual'] += 2
if c_dp < min_dp:
d['alleles_low_depth'] += 2
continue
# If the site passed the quality requirements
if p_gt == 0 and c_gt == 0:
# both homozygous reference
d['alleles_concordant'] += 2
d['alleles_hom_ref_concordant'] += 2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment