From c0410aa8953d46fde1b9c584e0384699633aa1a9 Mon Sep 17 00:00:00 2001 From: Redmar van den Berg <RedmarvandenBerg@lumc.nl> Date: Fri, 15 May 2020 10:03:06 +0200 Subject: [PATCH] Add tests for removal of ALTs that are not called The first step of the pipeline is to use bcftools to filter alt alleles that are not present in any of the called genotypes, and to remove the alt alleles of genotypes that are uncalled. --- test/config/config-noalt-uncalled.json | 9 +++++ test/data/16699289_chrM_noalt_uncalled.vcf | 40 ++++++++++++++++++++++ test/test-integration.yml | 21 ++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 test/config/config-noalt-uncalled.json create mode 100644 test/data/16699289_chrM_noalt_uncalled.vcf diff --git a/test/config/config-noalt-uncalled.json b/test/config/config-noalt-uncalled.json new file mode 100644 index 0000000..c4dc9da --- /dev/null +++ b/test/config/config-noalt-uncalled.json @@ -0,0 +1,9 @@ +{ + "samples": { + "16699289": { + "disease_code": "TEST", + "gvcf": "test/data/16699289_chrM.g.vcf", + "vcf": "test/data/16699289_chrM_noalt_uncalled.vcf" + } + } +} diff --git a/test/data/16699289_chrM_noalt_uncalled.vcf b/test/data/16699289_chrM_noalt_uncalled.vcf new file mode 100644 index 0000000..fc48c78 --- /dev/null +++ b/test/data/16699289_chrM_noalt_uncalled.vcf @@ -0,0 +1,40 @@ +##fileformat=VCFv4.2 +##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> +##FILTER=<ID=LowQual,Description="Low quality"> +##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> +##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another"> +##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group"> +##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> +##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)"> +##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> +##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> +##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases"> +##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> +##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?"> +##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> +##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity"> +##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> +##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> +##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> +##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> +##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> +##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> +##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> +##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> +##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality"> +##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> +##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"> +##contig=<ID=chrM,length=16571,assembly=hg19> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 16699289 +chrM 73 . G A 4062.77 . AC=2;AF=1.00;AN=2;DP=131;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.26;QD=32.24;SOR=0.791 GT:AD:DP:GQ:PL 1/1:0,126:126:99:4091,378,0 +chrM 150 . T C 5149.77 . AC=2;AF=1.00;AN=2;DP=130;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=34.24;SOR=0.710 GT:AD:DP:GQ:PGT:PID:PL 0/0:0,117:117:99:1|1:150_T_C:5178,352,0 +chrM 152 . T C 5149.77 . AC=2;AF=1.00;AN=2;DP=126;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=30.63;SOR=0.711 GT:AD:DP:GQ:PGT:PID:PL ./.:0,113:113:99:1|1:150_T_C:5178,352,0 +chrM 195 . C T 2959.77 . AC=2;AF=1.00;AN=2;DP=98;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=29.09;SOR=0.961 GT:AD:DP:GQ:PL 1/1:0,82:82:99:2988,246,0 diff --git a/test/test-integration.yml b/test/test-integration.yml index b4af988..29151c0 100644 --- a/test/test-integration.yml +++ b/test/test-integration.yml @@ -12,3 +12,24 @@ - path: 16699289_coverage.varda - path: 16699289_variants.varda +- name: test-trim-alt-exclude-uncalled + tags: + - integration + command: > + snakemake + --configfile test/config/config-noalt-uncalled.json + --use-singularity + --singularity-args ' --containall --bind /tmp' + --notemp + --cores 1 + 16699289_trimmed.vcf + files: + - path: 16699289_trimmed.vcf + must_not_contain: + - "chrM\t150\t.\tT\tC" + - "chrM\t152\t.\tT\tC" + contains: + - "chrM\t73\t.\tG\tA" + - "chrM\t150\t.\tT\t." + - "chrM\t152\t.\tT\t." + - "chrM\t195\t.\tC\tT" -- GitLab