Commit d37115cc authored by van den Berg's avatar van den Berg
Browse files

Add support for RQC annotations to be used as GQ

When GATK emits all sites, homref sites get the RQC annotation instead
of the QC annotations. For vtools, these two annotations are used
interchangeably, since they both contain genotype likelihoods.

See the detailed explantation at
https://gatkforums.broadinstitute.org/gatk/discussion/5115/haplotypecaller-genotypegvcfs-and-vqsr-for-alternatives-in-dbsnp
parent 0ab4c701
......@@ -362,3 +362,34 @@ def test_parse_variants_concordant():
parse_variants('A', call, pos, results)
assert results['alleles_concordant'] == 1
def test_known_concordant_RGQ():
""" 0/0 calls from GATK have RGQ annotation instead of GQ
These should be treated the same, so when RGQ is present it should be
used instead of the QC when filtering variants.
"""
filename = 'tests/cases/gatk_RGQ.vcf.gz'
call = VCF(filename, gts012=True)
positive = VCF(filename, gts012=True)
d, disc = site_concordancy(call, positive, call_samples=['NA12878'],
positive_samples=['NA12878'],
min_gq=0, min_dp=0)
assert d['alleles_hom_ref_concordant'] == 14
def test_known_concordant_RGQ_min_qc_100():
""" 0/0 calls from GATK have RGQ annotation instead of GQ
These should be treated the same, so when RGQ is present it should be
used instead of the QC when filtering variants.
The max value of both GQ and RGQ is 99, so filtering for GQ or RGQ >=
100 should give us 0 hom ref concordant variants.
"""
filename = 'tests/cases/gatk_RGQ.vcf.gz'
call = VCF(filename, gts012=True)
positive = VCF(filename, gts012=True)
d, disc = site_concordancy(call, positive, call_samples=['NA12878'],
positive_samples=['NA12878'],
min_gq=100, min_dp=0)
assert d['alleles_hom_ref_concordant'] == 0
......@@ -45,6 +45,24 @@ def parse_variants(ref: str, call: List[str], pos: List[str],
results['alleles_discordant'] += 1
def RGQ_header_defined(vcf):
""" Determine whether the RGQ annotation is defined in the FORMAT header of
the vcf.
Since the header_iter does not return python dictionaries, we cannot easily
test if a certain key is set, hence the ugly code.
"""
for header in vcf.header_iter():
try:
header['ID']
except KeyError:
continue
else:
if header['ID'] == 'RGQ':
return True
return False
def site_concordancy(call_vcf: VCF,
positive_vcf: VCF,
call_samples: List[str],
......@@ -99,6 +117,11 @@ def site_concordancy(call_vcf: VCF,
"alleles_low_depth": 0
}
# Determine if the 'RQC' annotation is present in the FORMAT header
# This is needed for avoid getting an KeyError when we want to see if the
# RQC annotation is set for a given variant
RGQ_present = RGQ_header_defined(call_vcf)
# Keep track of the discordant sites
discordant_count = 0
discordant_records = list()
......@@ -155,8 +178,15 @@ def site_concordancy(call_vcf: VCF,
continue
# Get the quality requirements for the call site
c_gq = call_record.gt_quals[c_s]
c_dp = call_record.gt_depths[c_s]
# If the 'RGQ' is defined, which is the case for homref calls from
# GATK, use this value instead of 'GQ'
# See
# https://gatkforums.broadinstitute.org/gatk/discussion/9907/genotypegvcfs-no-records-in-vcf # noqa
if RGQ_present and call_record.format('RGQ') is not None:
c_gq = call_record.format('RGQ')[0][0]
else:
c_gq = call_record.gt_quals[c_s]
# Did we fail any of the quality control checks?
# We use this variable since we want to count all quality checks
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment