Commit 103371d5 authored by Sander Bollen's avatar Sander Bollen

add site concordancy function

parent 5bfe55f2
Pipeline #2127 canceled with stage
"""
vtools.evaluate
~~~~~~~~~~~~~~~
:copyright: (c) 2018 Sander Bollen
:copyright: (c) 2018 Leiden University Medical Center
:license: MIT
"""
from typing import List, Dict
from cyvcf2 import VCF
def site_concordancy(call_vcf: VCF,
positive_vcf: VCF,
call_samples: List[str],
positive_samples: List[str]) -> Dict[str, float]:
"""
Calculate concordance between sites of two call sets,
of which one contains known true positives.
This only takes those sites in consideration which are shared between
both call sets. This makes this evaluation useful in cases where
the true positive set is known to contain regions not present at all
in the base VCF. Such a situation occurs, for example, when comparing
WES to a SNP array.
Site identity is determined by the CHROM, POS, REF and ALT fields.
This only works for bi-allelic variants.
:param call_vcf: The VCF to evaluate. Must have gts012=True
:param positive_vcf: The VCF file containing true positive sites.
Must have gts012=True
:param call_samples: List of samples in `call_vcf` to consider
:param positive_samples: List of samples in `positive_vcf` to consider.
Must have same length than `call_samples`
:raises: ValueError if sample lists are not of same size.
:returns: Dictionary of shape {"hom_alt_concordant": 0,
"het_concordant": 0, "sites_considered": 0,
"total_sites": 0}
"""
if len(positive_samples) != len(call_samples):
raise ValueError("Lists of samples must have same size")
pos_sample_idx = [positive_vcf.samples.index(x) for x in positive_samples]
call_sample_idx = [call_vcf.samples.index(x) for x in call_samples]
d = {
"total_sites": 0,
"sites_considered": 0,
"alleles_considered": 0,
"alleles_het_concordant": 0,
"alleles_hom_alt_concordant": 0,
"alleles_hom_ref_concordant": 0,
"alleles_concordant": 0,
"alleles_discordant": 0,
"alleles_no_call": 0
}
for pos_record in positive_vcf:
d['total_sites'] += 1
query_str = "{0}:{1}-{2}".format(
pos_record.CHROM,
pos_record.start,
pos_record.end
)
it = call_vcf(query_str)
same = []
for it_record in it:
if (it_record.CHROM == pos_record.CHROM
and it_record.POS == pos_record.POS
and it_record.REF == pos_record.REF
and it_record.ALT == pos_record.ALT):
same.append(it_record)
if len(same) != 1:
continue
call_record = same[0]
d['sites_considered'] += 1
d['alleles_considered'] += (2*len(positive_samples))
for p_s, c_s in zip(pos_sample_idx, call_sample_idx):
p_gt = pos_record.gt_types[p_s]
c_gt = call_record.gt_types[c_s]
if p_gt == 0 and c_gt == 0:
# both homozygous reference
d['alleles_concordant'] += 2
d['alleles_hom_ref_concordant'] += 2
elif p_gt == 0 and c_gt == 1:
# heterozygous when homozygous reference
d['alleles_concordant'] += 1
d['alleles_discordant'] += 1
elif p_gt == 0 and c_gt == 2:
# homozygous alt when homozygous ref
d['alleles_discordant'] += 2
elif p_gt == 1 and c_gt == 0:
# homozygous ref when heterozygous
d['alleles_concordant'] += 1
d['alleles_discordant'] += 1
elif p_gt == 1 and c_gt == 1:
# both heterozygous
d['alleles_concordant'] += 2
d['alleles_het_concordant'] += 2
elif p_gt == 1 and c_gt == 2:
# homozygous alt when heterozygous
d['alleles_concordant'] += 1
d['alleles_discordant'] += 1
elif p_gt == 2 and c_gt == 0:
# homozygous ref when homozygous alt
d['alleles_discordant'] += 2
elif p_gt == 2 and c_gt == 1:
# heterozygous when homozygous alt
d['alleles_concordant'] += 1
d['alleles_discordant'] += 1
elif p_gt == 2 and c_gt == 2:
# both homozygous alt
d['alleles_concordant'] += 2
d['alleles_hom_alt_concordant'] += 2
else:
# the same has no call in one or more of the VCF files
d['alleles_no_call'] += 2
return d
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment