diff --git a/tests/cases/gatk_partial_call.vcf.gz b/tests/cases/gatk_partial_call.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..3ba8f8a666e0037892e28ade7172780a28adf74a Binary files /dev/null and b/tests/cases/gatk_partial_call.vcf.gz differ diff --git a/tests/cases/gatk_partial_call.vcf.gz.tbi b/tests/cases/gatk_partial_call.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..3205497d3fb444c435910397efb54e0fc53295d6 Binary files /dev/null and b/tests/cases/gatk_partial_call.vcf.gz.tbi differ diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 895a5b717dd6416b09dcace21610566bb6e5b183..15b8e7b49004a4a80a1982e7afcb75ee50e6c4c2 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -184,3 +184,79 @@ def test_haploid_call_file(): match='Non-diploid variants are not supported'): site_concordancy(call, positive, call_samples=['BLANK'], positive_samples=['NA12878']) + + +@pytest.fixture(scope='module') +def partial_call_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(partial, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_call_total_sites(partial_call_file): + assert partial_call_file['total_sites'] == 37 + + +def test_partial_call_alleles_hom_ref_concordant(partial_call_file): + assert partial_call_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_call_alleles_het_concordant(partial_call_file): + assert partial_call_file['alleles_het_concordant'] == 0 + + +def test_partial_call_alleles_hom_alt_concordant(partial_call_file): + assert partial_call_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_call_alleles_concordant(partial_call_file): + assert partial_call_file['alleles_concordant'] == 6 + + +def test_partial_call_alleles_discordant(partial_call_file): + assert partial_call_file['alleles_discordant'] == 0 + + +def test_partial_call_alleles_no_call(partial_call_file): + assert partial_call_file['alleles_no_call'] == 68 + + +@pytest.fixture(scope='module') +def partial_positive_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(partial, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_positive_total_sites(partial_positive_file): + assert partial_positive_file['total_sites'] == 6 + + +def test_partial_positive_hom_ref_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_positive_het_concordant(partial_positive_file): + assert partial_positive_file['alleles_het_concordant'] == 0 + + +def test_partial_positive_hom_alt_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_positive_concordant(partial_positive_file): + assert partial_positive_file['alleles_concordant'] == 6 + + +def test_partial_positive_no_call(partial_positive_file): + assert partial_positive_file['alleles_no_call'] == 6 diff --git a/vtools/evaluate.py b/vtools/evaluate.py index 880853bf2e662da969bb5fb9bc4d39666c93bd46..0369ff02c459182652e2b5a372468e687cc81289 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -34,7 +34,10 @@ def parse_variants(ref: str, call: List[str], pos: List[str], # Here we count all alleles independently, also for the concordant calls for i, j in zip(call_variant, pos_variant): - if i == j: + # If one of the variants is partial no call + if i == '.' or j == '.': + results['alleles_no_call'] += 1 + elif i == j: results['alleles_concordant'] += 1 else: results['alleles_discordant'] += 1 @@ -143,7 +146,8 @@ def site_concordancy(call_vcf: VCF, p_gt = pos_record.gt_types[p_s] c_gt = call_record.gt_types[c_s] - # Is there missing data in one of the vcf files? + # Is one of the sites no call. This is only for fully no call + # sites. Partial no call site are handled in parse_variants if p_gt == 3 or c_gt == 3: d['alleles_no_call'] += 2 continue @@ -188,9 +192,13 @@ def site_concordancy(call_vcf: VCF, pos_alleles = [pos_record.REF] + pos_record.ALT cal_alleles = [call_record.REF] + call_record.ALT - # The third item in pos is a boolean indicating phasing - pos_gt = [pos_alleles[x] for x in pos[:2]] - cal_gt = [cal_alleles[x] for x in cal[:2]] + # The third item in pos is a boolean indicating phasing, so we + # don't use that + # No call is indicated by -1 in pyvcf2, so here we convert negative + # values to '.', which is what is used in the vcf to indicate no + # call + pos_gt = [pos_alleles[x] if x >= 0 else '.' for x in pos[:2]] + cal_gt = [cal_alleles[x] if x >= 0 else '.' for x in cal[:2]] # Parse the genotypes and add the results into d # We also need to know the reference call to determine if variants