From f5deabd29c6ef686d54025d8110b902a73b80818 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Fri, 15 May 2020 10:55:33 +0200
Subject: [PATCH] Add tests removing multiallelic ALTs

---
 test/config/config-multiallelic.json |  9 ++++++
 test/data/fake_chrM_multiallelic.vcf | 43 ++++++++++++++++++++++++++++
 test/test-integration.yml            | 23 +++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 test/config/config-multiallelic.json
 create mode 100644 test/data/fake_chrM_multiallelic.vcf

diff --git a/test/config/config-multiallelic.json b/test/config/config-multiallelic.json
new file mode 100644
index 0000000..0f30516
--- /dev/null
+++ b/test/config/config-multiallelic.json
@@ -0,0 +1,9 @@
+{
+  "samples": {
+    "16699289": {
+      "disease_code": "TEST",
+      "gvcf": "test/data/16699289_chrM.g.vcf",
+      "vcf": "test/data/fake_chrM_multiallelic.vcf"
+    }
+  }
+}
diff --git a/test/data/fake_chrM_multiallelic.vcf b/test/data/fake_chrM_multiallelic.vcf
new file mode 100644
index 0000000..83d6dd3
--- /dev/null
+++ b/test/data/fake_chrM_multiallelic.vcf
@@ -0,0 +1,43 @@
+##fileformat=VCFv4.2
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
+##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
+##contig=<ID=chrM,length=16571,assembly=hg19>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	16699289
+chrM	73	.	G	A	4062.77	.	AC=2;AF=1.00;AN=2;DP=131;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.26;QD=32.24;SOR=0.791	GT:AD:DP:GQ:PL	1/1:0,126:126:99:4091,378,0
+chrM	150	.	T	C	5149.77	.	AC=2;AF=1.00;AN=2;DP=130;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=34.24;SOR=0.710	GT:AD:DP:GQ:PGT:PID:PL	1/1:0,117:117:99:1|1:150_T_C:5178,352,0
+chrM	152	.	T	C	5149.77	.	AC=2;AF=1.00;AN=2;DP=126;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=30.63;SOR=0.711	GT:AD:DP:GQ:PGT:PID:PL	1/1:0,113:113:99:1|1:150_T_C:5178,352,0
+chrM	195	.	C	T,*	690.21	.	AC=0,1;AF=0.00,0.500;AN=2;BaseQRankSum=-1.150e+00;ClippingRankSum=0.00;DP=17;ExcessHet=3.9794;FS=8.872;MQ=39.83;MQRankSum=-9.210e-01;QD=21.57;ReadPosRankSum=-4.390e-01;SOR=2.038	GT:AD:DP:GQ:PL	0/2:9,0,8:17:99:303,330,1227,0,897,873
+chrM	410	.	A	C,G	4849.13	.	AC=1,0;AF=0.500,0.00;AN=2;BaseQRankSum=0.910;ClippingRankSum=0.00;DP=76;ExcessHet=3.0103;FS=6.027;MQ=52.15;MQRankSum=-4.344e+00;QD=22.98;ReadPosRankSum=0.687;SOR=1.420	GT:AD:DP:GQ:PL	0/1:33,35,8:76:99:836,0,820,685,673,1584
+chrM	2261	.	C	T,CATTTT	4096.90	.	AC=2,0;AF=1.00,0.00;AN=2;DP=37;ExcessHet=3.0103;FS=0.000;MQ=60.00;QD=29.45;SOR=4.768	GT:AD:DP:GQ:PL	1/1:0,37,0:37:99:1248,110,0,1248,110,1248
+chrM	2354	.	C	T	3755.77	.	AC=2;AF=1.00;AN=2;DP=117;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=32.95;SOR=1.054	GT:AD:DP:GQ:PL	1/1:0,114:114:99:3784,341,0
diff --git a/test/test-integration.yml b/test/test-integration.yml
index bc2b9d0..f1473f4 100644
--- a/test/test-integration.yml
+++ b/test/test-integration.yml
@@ -34,3 +34,26 @@
           - "chrM\t150\t.\tT\t."
           - "chrM\t152\t.\tT\t."
           - "chrM\t195\t.\tC\tT"
+
+- name: test-multiallelic  # multiallelic records must keep only the called ALT
+  tags:
+    - integration
+    - new
+  command: >
+    snakemake
+    --configfile test/config/config-multiallelic.json
+    --use-singularity
+    --singularity-args ' --containall --bind /tmp'
+    --notemp
+    --cores 1
+    16699289_trimmed.vcf
+  files:
+    - path: 16699289_trimmed.vcf
+      must_not_contain:  # untrimmed ALT lists from fake_chrM_multiallelic.vcf
+          - "chrM\t195\t.\tC\tT,*"
+          - "chrM\t410\t.\tA\tC,G"
+          - "chrM\t2261\t.\tC\tT,CATTTT"
+      contains:  # trailing \t closes the ALT column: "C" must not match "C,G"
+          - "chrM\t195\t.\tC\t*\t"
+          - "chrM\t410\t.\tA\tC\t"
+          - "chrM\t2261\t.\tC\tT\t"
-- 
GitLab