diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ea2f40c45968ed52baff0153146e607fa7781a05..ed7d19d3aad6ca8af55cdb6aec807a796acc55e1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,4 +4,5 @@ build: - docker script: - pip install --upgrade pip setuptools wheel - - pip install '.' + - pip install cython numpy + - tox diff --git a/README.md b/README.md index 947346e65f5acdc2cb89815c30797b824832e4c8..0d707d83911d518694455e15fbd1212aff609beb 100644 --- a/README.md +++ b/README.md @@ -135,11 +135,15 @@ technologies. E.g, when comparing a WES VCF file vs a SNP array, this tool can be quite useful. Output is a simple JSON file listing counts of concordant and discordant -alleles. +alleles and some other metrics. It is also possible to output the discordant +VCF records. Multisample VCF files are allowed; the samples to be evaluated have to be set through a CLI argument. +Variants from the `--call-vcf` are filtered to have a Genotype Quality (GQ) of +at least 30 by default. This can be overruled by specifying `--min-qual 0`. +The optional flag `--min-depth` can be used to set the minimum read coverage. #### Usage @@ -154,10 +158,12 @@ Options: called multiple times [required] -ps, --positive-samples TEXT Sample(s) in positive-vcf to consider. May be called multiple times [required] - -s, --stats PATH Path to output stats json file [required] - -dc, --discordant PATH Path to output discordant VCF file - --help Show this message and exit - ``` + -s, --stats PATH Path to output stats json file + -dc, --discordant PATH Path to output gzipped discordant vcf file + -mq, --min-qual FLOAT Minimum quality of variants to consider + -md, --min-depth INTEGER Minimum depth of variants to consider + --help Show this message and exit. +``` ## License diff --git a/tests/cases/dummy_decomposed.vcf.gz b/tests/cases/dummy_decomposed.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..09642f104fb1e3a673add5c28804af5297c7468d Binary files /dev/null and b/tests/cases/dummy_decomposed.vcf.gz differ diff --git a/tests/cases/dummy_decomposed.vcf.gz.tbi b/tests/cases/dummy_decomposed.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..b61781774ec3e28d970e66d942d351f4637d2019 Binary files /dev/null and b/tests/cases/dummy_decomposed.vcf.gz.tbi differ diff --git a/tests/cases/dummy_haploid.vcf.gz b/tests/cases/dummy_haploid.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..e0f25567c4e32ae177347b1243e8ed439852a78e Binary files /dev/null and b/tests/cases/dummy_haploid.vcf.gz differ diff --git a/tests/cases/dummy_haploid.vcf.gz.tbi b/tests/cases/dummy_haploid.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..a45d08c4d0fe3703b41aa5274203d985e7e934d9 Binary files /dev/null and b/tests/cases/dummy_haploid.vcf.gz.tbi differ diff --git a/tests/cases/dummy_phased_blank.vcf.gz b/tests/cases/dummy_phased_blank.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..cee7e4b957acc3391221c4fb4b616a677692aca7 Binary files /dev/null and b/tests/cases/dummy_phased_blank.vcf.gz differ diff --git a/tests/cases/dummy_phased_blank.vcf.gz.tbi b/tests/cases/dummy_phased_blank.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..271fa29b09e858f39223dd295d18a885b34b04cf Binary files /dev/null and b/tests/cases/dummy_phased_blank.vcf.gz.tbi differ diff --git a/tests/cases/gatk.vcf.gz b/tests/cases/gatk.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..df6722dd81a3e3c9425a9fadf40e965ff288fd60 Binary files /dev/null and b/tests/cases/gatk.vcf.gz differ diff --git a/tests/cases/gatk.vcf.gz.tbi b/tests/cases/gatk.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..26f3289b70db11b8b83bdb80424688968852fba5 Binary files /dev/null and b/tests/cases/gatk.vcf.gz.tbi differ diff --git a/tests/cases/gatk_RGQ.vcf.gz b/tests/cases/gatk_RGQ.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..46a7b8df09b91362d2bc28bf63ca99c40dbcdd75 Binary files /dev/null and b/tests/cases/gatk_RGQ.vcf.gz differ diff --git a/tests/cases/gatk_RGQ.vcf.gz.tbi b/tests/cases/gatk_RGQ.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..792beb715455fdd4b892f3b7ae538f352a37c207 Binary files /dev/null and b/tests/cases/gatk_RGQ.vcf.gz.tbi differ diff --git a/tests/cases/gatk_no_alt.vcf.gz b/tests/cases/gatk_no_alt.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..2eadfc08e074500ef0aaa88fa1f923f157847836 Binary files /dev/null and b/tests/cases/gatk_no_alt.vcf.gz differ diff --git a/tests/cases/gatk_no_alt.vcf.gz.tbi b/tests/cases/gatk_no_alt.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..4a964e0351e4325914edef48315c6a4fdfc84165 Binary files /dev/null and b/tests/cases/gatk_no_alt.vcf.gz.tbi differ diff --git a/tests/cases/gatk_partial_call.vcf.gz b/tests/cases/gatk_partial_call.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..3ba8f8a666e0037892e28ade7172780a28adf74a Binary files /dev/null and b/tests/cases/gatk_partial_call.vcf.gz differ diff --git a/tests/cases/gatk_partial_call.vcf.gz.tbi b/tests/cases/gatk_partial_call.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..3205497d3fb444c435910397efb54e0fc53295d6 Binary files /dev/null and b/tests/cases/gatk_partial_call.vcf.gz.tbi differ diff --git a/tests/cases/gatk_ref_alt_changed.vcf.gz b/tests/cases/gatk_ref_alt_changed.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..d6e7abc4e9c71e290cd03a48acabdc66a3c8e3f0 Binary files /dev/null and b/tests/cases/gatk_ref_alt_changed.vcf.gz differ diff --git a/tests/cases/gatk_ref_alt_changed.vcf.gz.tbi b/tests/cases/gatk_ref_alt_changed.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..16b661a64744cf268cc76e6ef270345a55668af2 Binary files /dev/null and b/tests/cases/gatk_ref_alt_changed.vcf.gz.tbi differ diff --git a/tests/cases/gatk_truncated.vcf.gz b/tests/cases/gatk_truncated.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..f07ffaed0445f1c1e63206f517df0dda21f53bca Binary files /dev/null and b/tests/cases/gatk_truncated.vcf.gz differ diff --git a/tests/cases/gatk_truncated.vcf.gz.tbi b/tests/cases/gatk_truncated.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..38bcbf2d56e47126019c1119e3af7644b4e7a82e Binary files /dev/null and b/tests/cases/gatk_truncated.vcf.gz.tbi differ diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..dc167975361de7a390e43bfcc5eb5e62e29ed076 --- /dev/null +++ b/tests/test_evaluate.py @@ -0,0 +1,443 @@ +import pytest +from collections import defaultdict + +from vtools.evaluate import site_concordancy +from vtools.evaluate import parse_variants + +from cyvcf2 import VCF + + +@pytest.fixture(scope='module') +def known_concordant(): + filename = 'tests/cases/gatk.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['NA12878'], + min_gq=0, min_dp=0) + return d + + +def test_total_sites(known_concordant): + assert known_concordant['total_sites'] == 37 + + +def test_sites_considered(known_concordant): + assert known_concordant['sites_considered'] == 37 + + +def test_alleles_considered(known_concordant): + assert known_concordant['alleles_considered'] == 74 + + +def test_alleles_het_concordant(known_concordant): + assert known_concordant['alleles_het_concordant'] == 42 + + +def test_alleles_hom_alt_concordant(known_concordant): + assert known_concordant['alleles_hom_alt_concordant'] == 18 + + +def test_alleles_hom_ref_concordant(known_concordant): + assert known_concordant['alleles_hom_ref_concordant'] == 14 + + +def test_alleles_concordant(known_concordant): + assert known_concordant['alleles_concordant'] == 74 + + +def test_alleles_discordant(known_concordant): + assert known_concordant['alleles_discordant'] == 0 + + +def test_alleles_no_call(known_concordant): + assert known_concordant['alleles_no_call'] == 0 + + +def test_alleles_low_qual(known_concordant): + assert known_concordant['alleles_low_qual'] == 0 + + +def test_alleles_low_depth(known_concordant): + assert known_concordant['alleles_low_depth'] == 0 + + +@pytest.fixture(scope='module') +def BLANK_NA12878(): + filename = 'tests/cases/gatk.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878'], + min_gq=30, min_dp=20) + return d + + +def test_low_qual_30(BLANK_NA12878): + assert BLANK_NA12878['alleles_low_qual'] == 34 + + +def test_low_depth_20(BLANK_NA12878): + assert BLANK_NA12878['alleles_low_depth'] == 36 + + +def test_no_call(BLANK_NA12878): + assert BLANK_NA12878['alleles_no_call'] == 8 + + +def test_truncated_call_file(): + """ When the call set is truncated, i.e. is missing variants which are + present in the positive vcf file """ + + filename = 'tests/cases/gatk.vcf.gz' + truncated = 'tests/cases/gatk_truncated.vcf.gz' + call = VCF(truncated, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['BLANK'], + min_gq=30, min_dp=20) + assert d['alleles_no_call'] == 12 + + +def test_truncated_positive_file(): + """ When the positive set is truncated, i.e. the called vcf file contains + variants which are absent from the positive vcf file """ + filename = 'tests/cases/gatk.vcf.gz' + truncated = 'tests/cases/gatk_truncated.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(truncated, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['BLANK'], + min_gq=30, min_dp=20) + assert d['alleles_no_call'] == 0 + + +def test_phased_positive_file(): + """ Test error message when the positive vcf contains phased variants """ + filename = 'tests/cases/gatk.vcf.gz' + phased = 'tests/cases/dummy_phased_blank.vcf.gz' + call = VCF(filename, gts012=True) + phased = VCF(phased, gts012=True) + with pytest.raises(NotImplementedError, + match='Phased variants are not supported'): + site_concordancy(call, phased, call_samples=['NA12878'], + positive_samples=['BLANK']) + + +def test_phased_call_file(): + """ Test error message when the call vcf contains phased variants """ + filename = 'tests/cases/gatk.vcf.gz' + phased = 'tests/cases/dummy_phased_blank.vcf.gz' + call = VCF(phased, gts012=True) + positive = VCF(filename, gts012=True) + with pytest.raises(NotImplementedError, + match='Phased variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +def test_decomposed_positive_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(decomposed, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +def test_decomposed_call_file(): + """ Test error message when the positive vcf contains decomposed variants + """ + filename = 'tests/cases/gatk.vcf.gz' + decomposed = 'tests/cases/dummy_decomposed.vcf.gz' + call = VCF(decomposed, gts012=True) + positive = VCF(filename, gts012=True) + with pytest.raises(NotImplementedError, + match='Decomposed variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +def test_haploid_positive_file(): + """ Test error message when the positive vcf contains non-diploid variants + """ + filename = 'tests/cases/gatk.vcf.gz' + haploid = 'tests/cases/dummy_haploid.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(haploid, gts012=True) + with pytest.raises(NotImplementedError, + match='Non-diploid variants are not supported'): + site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['BLANK']) + + +def test_haploid_call_file(): + """ Test error message when the call vcf contains non-diploid variants + """ + filename = 'tests/cases/gatk.vcf.gz' + haploid = 'tests/cases/dummy_haploid.vcf.gz' + call = VCF(haploid, gts012=True) + positive = VCF(filename, gts012=True) + with pytest.raises(NotImplementedError, + match='Non-diploid variants are not supported'): + site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['NA12878']) + + +@pytest.fixture(scope='module') +def partial_call_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(partial, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_call_total_sites(partial_call_file): + assert partial_call_file['total_sites'] == 37 + + +def test_partial_call_alleles_hom_ref_concordant(partial_call_file): + assert partial_call_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_call_alleles_het_concordant(partial_call_file): + assert partial_call_file['alleles_het_concordant'] == 0 + + +def test_partial_call_alleles_hom_alt_concordant(partial_call_file): + assert partial_call_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_call_alleles_concordant(partial_call_file): + assert partial_call_file['alleles_concordant'] == 6 + + +def test_partial_call_alleles_discordant(partial_call_file): + assert partial_call_file['alleles_discordant'] == 0 + + +def test_partial_call_alleles_no_call(partial_call_file): + assert partial_call_file['alleles_no_call'] == 68 + + +@pytest.fixture(scope='module') +def partial_positive_file(): + """ Test statistics when the call vcf contains partial variants """ + filename = 'tests/cases/gatk.vcf.gz' + partial = 'tests/cases/gatk_partial_call.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(partial, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_partial_positive_total_sites(partial_positive_file): + assert partial_positive_file['total_sites'] == 6 + + +def test_partial_positive_hom_ref_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_ref_concordant'] == 0 + + +def test_partial_positive_het_concordant(partial_positive_file): + assert partial_positive_file['alleles_het_concordant'] == 0 + + +def test_partial_positive_hom_alt_concordant(partial_positive_file): + assert partial_positive_file['alleles_hom_alt_concordant'] == 0 + + +def test_partial_positive_concordant(partial_positive_file): + assert partial_positive_file['alleles_concordant'] == 6 + + +def test_partial_positive_no_call(partial_positive_file): + assert partial_positive_file['alleles_no_call'] == 0 + + +@pytest.fixture(scope='module') +def ref_alt_changed_positive(): + """ Test statistics when the ref and alt have been changed. + + Only the REF and ALT have been changed, and the gt calls for BLANK have + been updated to keep the actual genotype the same + """ + filename = 'tests/cases/gatk.vcf.gz' + mixed = 'tests/cases/gatk_ref_alt_changed.vcf.gz' + positive = VCF(mixed, gts012=True) + call = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_ref_alt_changed_positive_total(ref_alt_changed_positive): + assert ref_alt_changed_positive['total_sites'] == 10 + + +def test_ref_alt_changed_positive_hom_ref_concordant(ref_alt_changed_positive): + assert ref_alt_changed_positive['alleles_hom_ref_concordant'] == 2 + + +def test_ref_alt_changed_positive_het_concordant(ref_alt_changed_positive): + assert ref_alt_changed_positive['alleles_het_concordant'] == 12 + + +def test_ref_alt_changed_positive_hom_alt_concordant(ref_alt_changed_positive): + assert ref_alt_changed_positive['alleles_hom_alt_concordant'] == 6 + + +def test_ref_alt_changed_positive_concordant(ref_alt_changed_positive): + assert ref_alt_changed_positive['alleles_concordant'] == 20 + + +def test_ref_alt_changed_positive_no_call(ref_alt_changed_positive): + assert ref_alt_changed_positive['alleles_no_call'] == 0 + + +@pytest.fixture(scope='module') +def ref_alt_changed_call(): + """ Test statistics when the ref and alt have been changed. + + Only the REF and ALT have been changed, and the gt calls for BLANK have + been updated to keep the actual genotype the same + """ + filename = 'tests/cases/gatk.vcf.gz' + mixed = 'tests/cases/gatk_ref_alt_changed.vcf.gz' + positive = VCF(filename, gts012=True) + call = VCF(mixed, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_ref_alt_changed_call_total(ref_alt_changed_call): + assert ref_alt_changed_call['total_sites'] == 37 + + +def test_ref_alt_changed_call_hom_ref_concordant(ref_alt_changed_call): + assert ref_alt_changed_call['alleles_hom_ref_concordant'] == 8 + + +def test_ref_alt_changed_call_het_concordant(ref_alt_changed_call): + assert ref_alt_changed_call['alleles_het_concordant'] == 12 + + +def test_ref_alt_changed_call_hom_alt_concordant(ref_alt_changed_call): + assert ref_alt_changed_call['alleles_hom_alt_concordant'] == 0 + + +def test_ref_alt_changed_call_concordant(ref_alt_changed_call): + assert ref_alt_changed_call['alleles_concordant'] == 20 + + +def test_ref_alt_changed_call_no_call(ref_alt_changed_call): + assert ref_alt_changed_call['alleles_no_call'] == 54 + + +def test_parse_variants_no_call(): + """ This should be counted as a single no call """ + results = defaultdict(int) + call = ['.', 'A'] + pos = ['A', 'G'] + + parse_variants('A', call, pos, results) + assert results['alleles_no_call'] == 1 + + +def test_parse_variants_concordant(): + """ This should be counted as a single concordant allele """ + results = defaultdict(int) + call = ['.', 'A'] + pos = ['A', 'G'] + + parse_variants('A', call, pos, results) + assert results['alleles_concordant'] == 1 + + +def test_known_concordant_RGQ(): + """ 0/0 calls from GATK have RGQ annotation instead of GQ + These should be treated the same, so when RGQ is present it should be + used instead of the QC when filtering variants. + """ + filename = 'tests/cases/gatk_RGQ.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['NA12878'], + min_gq=0, min_dp=0) + assert d['alleles_hom_ref_concordant'] == 14 + + +def test_known_concordant_RGQ_min_qc_100(): + """ 0/0 calls from GATK have RGQ annotation instead of GQ + These should be treated the same, so when RGQ is present it should be + used instead of the QC when filtering variants. + + The max value of both GQ and RGQ is 99, so filtering for GQ or RGQ >= + 100 should give us 0 hom ref concordant variants. + """ + filename = 'tests/cases/gatk_RGQ.vcf.gz' + call = VCF(filename, gts012=True) + positive = VCF(filename, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['NA12878'], + positive_samples=['NA12878'], + min_gq=100, min_dp=0) + assert d['alleles_hom_ref_concordant'] == 0 + + +@pytest.fixture(scope='module') +def gatk_no_alt_in_call(): + """ Test statistics when the ALT allele is missing from the called vcf + + The ALT allele has been set to '.' for each variant, and the corresponding + GT has been set to 0/0 to generate valid variants. + """ + filename = 'tests/cases/gatk.vcf.gz' + no_alt = 'tests/cases/gatk_no_alt.vcf.gz' + positive = VCF(filename, gts012=True) + call = VCF(no_alt, gts012=True) + d, disc = site_concordancy(call, positive, call_samples=['BLANK'], + positive_samples=['BLANK'], min_dp=0, min_gq=0) + return d + + +def test_no_alt_call_total_sites(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['total_sites'] == 37 + + +def test_no_alt_call_sites_considered(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['sites_considered'] == 37 + + +def test_no_alt_call_het_concordant(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_het_concordant'] == 0 + + +def test_no_alt_call_hom_alt_concordant(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_hom_alt_concordant'] == 0 + + +def test_no_alt_call_hom_ref_concordant(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_hom_ref_concordant'] == 32 + + +def test_no_alt_call_alleles_concordant(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_concordant'] == 46 + + +def test_no_alt_call_alleles_discordant(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_discordant'] == 20 + + +def test_no_alt_call_alleles_no_call(gatk_no_alt_in_call): + assert gatk_no_alt_in_call['alleles_no_call'] == 8 diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..8b42584536c20bded1f0c00a9910c2c156f3c89c --- /dev/null +++ b/tox.ini @@ -0,0 +1,8 @@ +[tox] +envlist = py36 + +[testenv] +deps = + pytest + +commands = pytest diff --git a/vtools/cli.py b/vtools/cli.py index 30cd698f6ea2a5811d578601798de07122a93d19..97ee6df2ed2c7a840d0248450ebfef4a84197751 100644 --- a/vtools/cli.py +++ b/vtools/cli.py @@ -9,6 +9,7 @@ vtools.cli import json import click from cyvcf2 import VCF, Writer +import gzip from .evaluate import site_concordancy from .filter import FilterParams, FilterClass, Filterer @@ -32,25 +33,40 @@ from .gcoverage import RefRecord, region_coverages "May be called multiple times", required=True) @click.option("-s", "--stats", type=click.Path(writable=True), - help="Path to output stats json file", required=True) -@click.option("-dc", "--discordant", type=click.Path(writable=True), - help="Path to output discordant VCF file", + help="Path to output stats json file") +@click.option("-dvcf", "--discordant-vcf", type=click.Path(writable=True), + help="Path to output gzipped discordant vcf file", required=False) -def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, stats, - discordant): +@click.option("-mq", "--min-qual", type=float, + help="Minimum quality of variants to consider", default=30) +@click.option("-md", "--min-depth", type=int, + help="Minimum depth of variants to consider", default=0) +def evaluate_cli(call_vcf, positive_vcf, call_samples, positive_samples, + min_qual, min_depth, stats, discordant_vcf): c_vcf = VCF(call_vcf, gts012=True) p_vcf = VCF(positive_vcf, gts012=True) st, disc = site_concordancy(c_vcf, p_vcf, call_samples, - positive_samples) + positive_samples, min_qual, min_depth) # Write the stats json file - with open(stats, 'w') as fout: - print(json.dumps(st), file=fout) - - # If specified, write the discordant variants - if discordant: - with open(discordant, 'w') as fout: + if stats is None: + print(json.dumps(st)) + else: + with click.open_file(stats, 'w') as fout: + fout.write(json.dumps(st)) + + # If there were discordand records, and a discordant-vcf should be written + if len(disc) > 0 and discordant_vcf: + with click.open_file(discordant_vcf, 'w') as fout: + # First, we write the vcf header + with gzip.open(call_vcf, 'rt') as fin: + for line in fin: + if line.startswith('#'): + fout.write(line) + else: + break + # Then we write the vcf records that were discordant for record in disc: - print(record, file=fout, end='') + fout.write(str(record)) @click.command() diff --git a/vtools/evaluate.py b/vtools/evaluate.py index 089e6a9de569979640c5ae165194e71a7c87280c..396d68940eee87037a3bff3de5b43e39effd4382 100644 --- a/vtools/evaluate.py +++ b/vtools/evaluate.py @@ -7,15 +7,62 @@ vtools.evaluate :license: MIT """ from typing import List, Dict +from types import SimpleNamespace from cyvcf2 import VCF +def parse_variants(ref: str, call: List[str], pos: List[str], + results: Dict[str, int]): + """ Parse the variants and add to results """ + + call_variant = set(call) + pos_variant = set(pos) + + # The types of concordant calls are counted separately + if call_variant == pos_variant: + # These variants are homozygous reference + if len(call_variant) == 1 and next(iter(call_variant)) == ref: + results['alleles_hom_ref_concordant'] += 2 + # These variants are heterozygous, since they have different calls + elif len(call_variant) > 1: + results['alleles_het_concordant'] += 2 + # If they are not homozygous reference, and not heterozygous, they must + # be homozygous alternative, for whichever alt allele they have + else: + results['alleles_hom_alt_concordant'] += 2 + + # Here we count all alleles independently, also for the concordant calls + for allele in call: + if allele == '.': + results['alleles_no_call'] += 1 + elif allele in pos: + results['alleles_concordant'] += 1 + # We cannot match an A/A call with A/G, so we need to remove the + # A call from the positive set once we have 'used' it + pos.remove(allele) + else: + results['alleles_discordant'] += 1 + + +def RGQ_header_defined(vcf): + """ Determine whether the RGQ annotation is defined in the FORMAT header of + the vcf. + """ + try: + vcf.get_header_type('RGQ') + except KeyError: + return False + else: + return True + + def site_concordancy(call_vcf: VCF, positive_vcf: VCF, call_samples: List[str], positive_samples: List[str], - min_gq: float = 30) -> Dict[str, float]: + min_gq: float = 30, + min_dp: float = 0) -> Dict[str, float]: """ Calculate concordance between sites of two call sets, of which one contains known true positives. @@ -32,9 +79,11 @@ def site_concordancy(call_vcf: VCF, :param call_vcf: The VCF to evaluate. Must have gts012=True :param positive_vcf: The VCF file containing true positive sites. Must have gts012=True - :param call_samples: List of samples in `call_vcf` to consider + :param call_samples: List of samples in `call_vcf` to consider. :param positive_samples: List of samples in `positive_vcf` to consider. Must have same length than `call_samples` + :param min_qg: Minimum quality of variants to consider. + :param min_dp: Minimum depth of variants to consider. :raises: ValueError if sample lists are not of same size. @@ -58,79 +107,130 @@ def site_concordancy(call_vcf: VCF, "alleles_concordant": 0, "alleles_discordant": 0, "alleles_no_call": 0, - "alleles_low_qual": 0 + "alleles_low_qual": 0, + "alleles_low_depth": 0 } + + # Determine if the 'RQC' annotation is present in the FORMAT header + # This is needed for avoid getting an KeyError when we want to see if the + # RQC annotation is set for a given variant + RGQ_present = RGQ_header_defined(call_vcf) + + # Keep track of the discordant sites discordant_count = 0 discordant_records = list() + + # Keep track of the previous record, to make sure the positive vcf file + # does not contain decomposed records + prev_record = SimpleNamespace() + prev_record.POS = -1 + prev_record.CHROM = -1 + for pos_record in positive_vcf: + # Check to make sure the POS and CHROM are different from the previous + # variant + if (prev_record.POS == pos_record.POS and + prev_record.CHROM == pos_record.CHROM): + raise NotImplementedError('Decomposed variants are not supported') + else: + prev_record.POS = pos_record.POS + prev_record.CHROM = pos_record.CHROM + d['total_sites'] += 1 query_str = "{0}:{1}-{2}".format( pos_record.CHROM, - pos_record.start, + pos_record.start+1, pos_record.end ) - it = call_vcf(query_str) - same = [] - for it_record in it: - if (it_record.CHROM == pos_record.CHROM - and it_record.POS == pos_record.POS - and it_record.REF == pos_record.REF - and it_record.ALT == pos_record.ALT): - - same.append(it_record) + call_records = list(call_vcf(query_str)) - if len(same) != 1: + # If the variant is not present in the call vcf + if len(call_records) == 0: + d['alleles_no_call'] += 2 continue - call_record = same[0] + # If there are multiple variants with the same CHROM and POS, the + # variant in the call vcf file must have been decomposted + if len(call_records) > 1: + raise NotImplementedError('Decomposed variants are not supported') + + # Now we know there is only one call record + call_record = call_records[0] + d['sites_considered'] += 1 - d['alleles_considered'] += (2*len(positive_samples)) + d['alleles_considered'] += (2 * len(positive_samples)) for p_s, c_s in zip(pos_sample_idx, call_sample_idx): p_gt = pos_record.gt_types[p_s] c_gt = call_record.gt_types[c_s] - c_gq = call_record.gt_quals[c_s] + # Is one of the sites no call. This is only for fully no call + # sites. Partial no call site are handled in parse_variants + if p_gt == 3 or c_gt == 3: + d['alleles_no_call'] += 2 + continue + + # Get the quality requirements for the call site + c_dp = call_record.gt_depths[c_s] + # If the 'RGQ' is defined, which is the case for homref calls from + # GATK, use this value instead of 'GQ' + # See + # https://gatkforums.broadinstitute.org/gatk/discussion/9907/genotypegvcfs-no-records-in-vcf # noqa + if RGQ_present and call_record.format('RGQ') is not None: + c_gq = call_record.format('RGQ')[0][0] + else: + c_gq = call_record.gt_quals[c_s] + + # Did we fail any of the quality control checks? + # We use this variable since we want to count all quality checks + # independenly, and a single sample can fail multiple checks + failed_quality_check = False + + # Is the QG of the call site below the threshold? if c_gq < min_gq: d['alleles_low_qual'] += 2 - elif p_gt == 0 and c_gt == 0: - # both homozygous reference - d['alleles_concordant'] += 2 - d['alleles_hom_ref_concordant'] += 2 - elif p_gt == 0 and c_gt == 1: - # heterozygous when homozygous reference - d['alleles_concordant'] += 1 - d['alleles_discordant'] += 1 - elif p_gt == 0 and c_gt == 2: - # homozygous alt when homozygous ref - d['alleles_discordant'] += 2 - elif p_gt == 1 and c_gt == 0: - # homozygous ref when heterozygous - d['alleles_concordant'] += 1 - d['alleles_discordant'] += 1 - elif p_gt == 1 and c_gt == 1: - # both heterozygous - d['alleles_concordant'] += 2 - d['alleles_het_concordant'] += 2 - elif p_gt == 1 and c_gt == 2: - # homozygous alt when heterozygous - d['alleles_concordant'] += 1 - d['alleles_discordant'] += 1 - elif p_gt == 2 and c_gt == 0: - # homozygous ref when homozygous alt - d['alleles_discordant'] += 2 - elif p_gt == 2 and c_gt == 1: - # heterozygous when homozygous alt - d['alleles_concordant'] += 1 - d['alleles_discordant'] += 1 - elif p_gt == 2 and c_gt == 2: - # both homozygous alt - d['alleles_concordant'] += 2 - d['alleles_hom_alt_concordant'] += 2 - else: - # the same has no call in one or more of the VCF files - d['alleles_no_call'] += 2 + failed_quality_check = True + + # Is the DP of the call site below the threshold? + if c_dp < min_dp: + d['alleles_low_depth'] += 2 + failed_quality_check = True + + # Go to the next variant if any of the quality checks failed + if failed_quality_check: + continue + + # Get the genotypes + pos = pos_record.genotypes[p_s] + cal = call_record.genotypes[c_s] + + # If the genotypes are not diploid + if len(pos) != 3 or len(cal) != 3: + raise NotImplementedError('Non-diploid variants are not ' + 'supported') + # If the genotypes are phased + if pos[2] or cal[2]: + raise NotImplementedError('Phased variants are not supported') + + # Get the genotyped bases. There is a deprecationWarning when using + # gt_bases directly so we go via the alleles and the gt call + pos_alleles = [pos_record.REF] + pos_record.ALT + cal_alleles = [call_record.REF] + call_record.ALT + + # The third item in pos is a boolean indicating phasing, so we + # don't use that + # No call is indicated by -1 in pyvcf2, so here we convert negative + # values to '.', which is what is used in the vcf to indicate no + # call + pos_gt = [pos_alleles[x] if x >= 0 else '.' for x in pos[:2]] + cal_gt = [cal_alleles[x] if x >= 0 else '.' for x in cal[:2]] + + # Parse the genotypes and add the results into d + # We also need to know the reference call to determine if variants + # are hom ref. For this, we take the positive vcf REF call to be + # the truth + parse_variants(pos_record.REF, cal_gt, pos_gt, d) # The current variant is discordant if d['alleles_discordant'] > discordant_count: