evaluate.py 4.79 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
"""
vtools.evaluate
~~~~~~~~~~~~~~~

:copyright: (c) 2018 Sander Bollen
:copyright: (c) 2018 Leiden University Medical Center
:license: MIT
"""
from typing import List, Dict

from cyvcf2 import VCF


def site_concordancy(call_vcf: VCF,
                     positive_vcf: VCF,
                     call_samples: List[str],
Sander Bollen's avatar
Sander Bollen committed
17 18
                     positive_samples: List[str],
                     min_gq: float = 30) -> Dict[str, float]:
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
    """
    Calculate concordance between sites of two call sets,
    of which one contains known true positives.

    This only takes those sites in consideration which are shared between
    both call sets. This makes this evaluation useful in cases where
    the true positive set is known to contain regions not present at all
    in the base VCF. Such a situation occurs, for example, when comparing
    WES to a SNP array.

    Site identity is determined by the CHROM, POS, REF and ALT fields.
    This only works for bi-allelic variants.

    :param call_vcf: The VCF to evaluate. Must have gts012=True
    :param positive_vcf: The VCF file containing true positive sites.
        Must have gts012=True
    :param call_samples: List of samples in `call_vcf` to consider
    :param positive_samples: List of samples in `positive_vcf` to consider.
    Must have same length than `call_samples`

    :raises: ValueError if sample lists are not of same size.

    :returns: Dictionary of shape {"hom_alt_concordant": 0,
                                   "het_concordant": 0, "sites_considered": 0,
                                   "total_sites": 0}
    """
    if len(positive_samples) != len(call_samples):
        raise ValueError("Lists of samples must have same size")

    pos_sample_idx = [positive_vcf.samples.index(x) for x in positive_samples]
    call_sample_idx = [call_vcf.samples.index(x) for x in call_samples]

    d = {
        "total_sites": 0,
        "sites_considered": 0,
        "alleles_considered": 0,
        "alleles_het_concordant": 0,
        "alleles_hom_alt_concordant": 0,
        "alleles_hom_ref_concordant": 0,
        "alleles_concordant": 0,
        "alleles_discordant": 0,
Sander Bollen's avatar
Sander Bollen committed
60 61
        "alleles_no_call": 0,
        "alleles_low_qual": 0
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
    }
    for pos_record in positive_vcf:
        d['total_sites'] += 1
        query_str = "{0}:{1}-{2}".format(
            pos_record.CHROM,
            pos_record.start,
            pos_record.end
        )

        it = call_vcf(query_str)
        same = []
        for it_record in it:
            if (it_record.CHROM == pos_record.CHROM
                    and it_record.POS == pos_record.POS
                    and it_record.REF == pos_record.REF
                    and it_record.ALT == pos_record.ALT):

                same.append(it_record)

        if len(same) != 1:
            continue

        call_record = same[0]
        d['sites_considered'] += 1
        d['alleles_considered'] += (2*len(positive_samples))

        for p_s, c_s in zip(pos_sample_idx, call_sample_idx):
            p_gt = pos_record.gt_types[p_s]
            c_gt = call_record.gt_types[c_s]
Sander Bollen's avatar
Sander Bollen committed
91 92 93 94 95

            c_gq = call_record.gt_quals[c_s]
            if c_gq < min_gq:
                d['alleles_low_qual'] += 2
            elif p_gt == 0 and c_gt == 0:
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
                # both homozygous reference
                d['alleles_concordant'] += 2
                d['alleles_hom_ref_concordant'] += 2
            elif p_gt == 0 and c_gt == 1:
                # heterozygous when homozygous reference
                d['alleles_concordant'] += 1
                d['alleles_discordant'] += 1
            elif p_gt == 0 and c_gt == 2:
                # homozygous alt when homozygous ref
                d['alleles_discordant'] += 2
            elif p_gt == 1 and c_gt == 0:
                # homozygous ref when heterozygous
                d['alleles_concordant'] += 1
                d['alleles_discordant'] += 1
            elif p_gt == 1 and c_gt == 1:
                # both heterozygous
                d['alleles_concordant'] += 2
                d['alleles_het_concordant'] += 2
            elif p_gt == 1 and c_gt == 2:
                # homozygous alt when heterozygous
                d['alleles_concordant'] += 1
                d['alleles_discordant'] += 1
            elif p_gt == 2 and c_gt == 0:
                # homozygous ref when homozygous alt
                d['alleles_discordant'] += 2
            elif p_gt == 2 and c_gt == 1:
                # heterozygous when homozygous alt
                d['alleles_concordant'] += 1
                d['alleles_discordant'] += 1
            elif p_gt == 2 and c_gt == 2:
                # both homozygous alt
                d['alleles_concordant'] += 2
                d['alleles_hom_alt_concordant'] += 2
            else:
                # the same has no call in one or more of the VCF files
                d['alleles_no_call'] += 2

    return d