diff --git a/tests/cases/dummy_decomposed.vcf.gz b/tests/cases/dummy_decomposed.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..09642f104fb1e3a673add5c28804af5297c7468d Binary files /dev/null and b/tests/cases/dummy_decomposed.vcf.gz differ diff --git a/tests/cases/dummy_decomposed.vcf.gz.tbi b/tests/cases/dummy_decomposed.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..b61781774ec3e28d970e66d942d351f4637d2019 Binary files /dev/null and b/tests/cases/dummy_decomposed.vcf.gz.tbi differ diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 712f75558a1c88fb20e06f9ea36cdfa2d1f40696..85bd91a03a2ad4af2e1a51fb5b976443b58ad079 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -132,3 +132,29 @@ def test_phased_call_file(): match='Phased variants are not supported'): site_concordancy(call, positive, call_samples=['BLANK'], positive_samples=['NA12878']) + + +def test_decomposed_positive_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(decomposed, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +def test_decomposed_call_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(decomposed, gts012=True) + positive = VCF(filename, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) diff --git a/vtools/evaluate.py b/vtools/evaluate.py index ea276ca08a3a54b533b4b46e56290e8b5b53b145..bb751ec7c2b4ad0dac0d4108711b07b8fa18174f 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -7,6 +7,7 @@ vtools.evaluate :license: MIT """ from typing import List, Dict +from types import SimpleNamespace from cyvcf2 import VCF @@ -91,40 +92,48 @@ def site_concordancy(call_vcf: VCF, "alleles_low_qual": 0, "alleles_low_depth": 0 } + + # Keep track of the discordant sites discordant_count = 0 discordant_records = list() + # Keep track of the previous record, to make sure the positive vcf file + # does not contain decomposed records + prev_record = SimpleNamespace() + prev_record.POS = -1 + prev_record.CHROM = -1 + for pos_record in positive_vcf: + # Check to make sure the POS and CHROM are different from the previous + # variant + if (prev_record.POS == pos_record.POS and + prev_record.CHROM == pos_record.CHROM): + raise NotImplementedError('Decomposed variants are not supported') + else: + prev_record.POS = pos_record.POS + prev_record.CHROM = pos_record.CHROM + d['total_sites'] += 1 query_str = "{0}:{1}-{2}".format( pos_record.CHROM, - pos_record.start, + pos_record.start+1, pos_record.end ) - it = call_vcf(query_str) - same = [] - # If the vcf file has been decomposed, there can be multiple sites with - # the same CHROM and POS, which is why we have to iterate over all the - # sites that are returned - for it_record in it: - # We only want to consider sites that have the same CHROM, POS, REF - # and ALT as the positive vcf, since otherwise we might consider - # A/T and A/G as identical since they are both heterozygous - if (it_record.CHROM == pos_record.CHROM - and it_record.POS == pos_record.POS - and it_record.REF == pos_record.REF - and it_record.ALT == pos_record.ALT): - same.append(it_record) + call_records = list(call_vcf(query_str)) # If the variant is not present in the call vcf - if len(same) == 0: + if len(call_records) == 0: d['alleles_no_call'] += 2 - - if len(same) != 1: continue - call_record = same[0] + # If there are multiple variants with the same CHROM and POS, the + # variant in the call vcf file must have been decomposted + if len(call_records) > 1: + raise NotImplementedError('Decomposed variants are not supported') + + # Now we know there is only one call record + call_record = call_records[0] d['sites_considered'] += 1 d['alleles_considered'] += (2 * len(positive_samples))