Commit b39f9387 authored by Leon Mei's avatar Leon Mei

scripts used for bedtools based comparison of HG514 callers and trueset

parent 097e0b9f
#!/bin/bash
# part 1 convert truecall
python convert_vcf_withENDtag_bed.py -i /exports/sascstudent/cedrick/project-MinE/data/03-hgsv-truecalls/HG00514.merged_nonredundant.vcf -o HG00514_DEL_truecall.bed
# part 2 Check delly calls
python convert_MinE_vcf_withENDtag_bed.py -i /exports/sascstudent/cedrick/project-MinE/data/05-VCF-merging/phase1_callset/delly/HG00514_CHS_HGSV/HG00514_CHS_HGSV.all.vcf -d contig_map.tsv -o HG00514_DEL_PRECISE_delly.bed
sort -k1,1 -k2,2n HG00514_DEL_PRECISE_delly.bed > HG00514_DEL_PRECISE_delly.sorted.bed
bedtools merge -i HG00514_DEL_PRECISE_delly.sorted.bed > HG00514_DEL_PRECISE_delly.sorted.merged.bed
bedtools intersect -a HG00514_DEL_truecall.bed -b HG00514_DEL_PRECISE_delly.sorted.merged.bed -wa > intersect_delly_HG00514truecall.bed
#sometimes, one large true call can overlap with many reported regions by a SV caller.
uniq intersect_delly_HG00514truecall.bed > intersect_delly_HG00514truecall.uniq.bed
# part 3 Check clever calls
python convert_MinE_vcf_withSVLENtag_bed.py -i /exports/sascstudent/cedrick/project-MinE/data/05-VCF-merging/phase1_callset/mateclever/HG00514_CHS_HGSV/HG00514_CHS_HGSV.all.vcf -d contig_map.tsv -o HG00514_DEL_PRECISE_clever.bed
sort -k1,1 -k2,2n HG00514_DEL_PRECISE_clever.bed > HG00514_DEL_PRECISE_clever.sorted.bed
bedtools merge -i HG00514_DEL_PRECISE_clever.sorted.bed >HG00514_DEL_PRECISE_clever.sorted.merged.bed
bedtools intersect -a HG00514_DEL_truecall.bed -b HG00514_DEL_PRECISE_clever.sorted.merged.bed -wa > intersect_clever_HG00514truecall.bed
uniq intersect_clever_HG00514truecall.bed > intersect_clever_HG00514truecall.uniq.bed
# part 4 Check pindel calls
python convert_MinE_vcf_withENDtag_bed.py -i /exports/sascstudent/cedrick/project-MinE/data/05-VCF-merging/phase1_callset/pindel/HG00514_CHS_HGSV/HG00514_CHS_HGSV.all.D.vcf -d contig_map.tsv -o HG00514_DEL_PRECISE_pindel.bed
sort -k1,1 -k2,2n HG00514_DEL_PRECISE_pindel.bed > HG00514_DEL_PRECISE_pindel.sorted.bed
bedtools merge -i HG00514_DEL_PRECISE_pindel.sorted.bed > HG00514_DEL_PRECISE_pindel.sorted.merged.bed
bedtools intersect -a HG00514_DEL_truecall.bed -b HG00514_DEL_PRECISE_pindel.sorted.merged.bed -wa > intersect_pindel_HG00514truecall.bed
uniq intersect_pindel_HG00514truecall.bed > intersect_pindel_HG00514truecall.uniq.bed
chr1 NC_000001.11
chr2 NC_000002.12
chr3 NC_000003.12
chr4 NC_000004.12
chr5 NC_000005.10
chr6 NC_000006.12
chr7 NC_000007.14
chr8 NC_000008.11
chr9 NC_000009.12
chr10 NC_000010.11
chr11 NC_000011.10
chr12 NC_000012.12
chr13 NC_000013.11
chr14 NC_000014.9
chr15 NC_000015.10
chr16 NC_000016.10
chr17 NC_000017.11
chr18 NC_000018.10
chr19 NC_000019.10
chr20 NC_000020.11
chr21 NC_000021.9
chr22 NC_000022.11
chrX NC_000023.11
chrY NC_000024.10
chrM NC_012920.1
import vcf
import argparse
print("enter...")
parser = argparse.ArgumentParser(description='Convert MinE SV VCF to bed and harmonize the contig names.')
parser.add_argument("-i", dest="inputVCF", required=True,
help="input MinE VCF file", metavar="FILE")
parser.add_argument("-d", dest="contigDict", required=True,
help="a two column-ed tsv file, 1st column is the contig name in MinE reference, 2nd column is the to be mapped contig name", metavar="FILE")
parser.add_argument("-o", dest="outputBED", required=True,
help="output harmonized BED file")
args = parser.parse_args()
#Load chr name mapping
print("load contig map")
convert_contig = {}
with open(args.contigDict) as f:
for line in f:
(value, key)=line.split()
convert_contig[key]=value
print(convert_contig)
print("start processing VCF")
vcf_reader = vcf.Reader(open(args.inputVCF, 'r'))
output_bed = open(args.outputBED, 'w')
count = 0
for record in vcf_reader:
# print(record.INFO)
# if count == 10:
# break
count+=1
if count % 1000 == 0:
print(count)
if record.INFO["SVTYPE"] != "DEL":
continue
if "IMPRECISE" in record.INFO:
continue
output_bed.write(convert_contig[record.CHROM]+ "\t" + str(record.POS) + "\t" + str(record.INFO["END"]) + "\n")
output_bed.close()
import vcf
import argparse
#pyvcf and python2.7 are needed to run this script. Use conda.
parser = argparse.ArgumentParser(description='Convert MinE SV VCF to bed and harmonize the contig names.')
parser.add_argument("-i", dest="inputVCF", required=True,
help="input MinE VCF file", metavar="FILE")
parser.add_argument("-d", dest="contigDict", required=True,
help="a two column-ed tsv file, 1st column is the contig name in MinE reference, 2nd column is the to be mapped contig name", metavar="FILE")
parser.add_argument("-o", dest="outputBED", required=True,
help="output harmonized BED file")
args = parser.parse_args()
#Load chr name mapping
print("load contig map")
convert_contig = {}
with open(args.contigDict) as f:
for line in f:
(value, key)=line.split()
convert_contig[key]=value
print(convert_contig)
print("start processing VCF")
vcf_reader = vcf.Reader(open(args.inputVCF, 'r'))
output_bed = open(args.outputBED, 'w')
count = 0
for record in vcf_reader:
# print(record.INFO)
# if count == 10:
# break
count+=1
if count % 1000 == 0:
print(count)
if record.INFO["SVTYPE"] != "DEL":
continue
if "IMPRECISE" in record.INFO:
continue
output_bed.write(convert_contig[record.CHROM]+ "\t" + str(record.POS) + "\t" + str(record.POS+abs(record.INFO["SVLEN"])) + "\n")
output_bed.close()
import vcf
import argparse
#pyvcf and python2.7 are needed to run this script. Use conda.
parser = argparse.ArgumentParser(description='Convert a normal DEL VCF to bed')
parser.add_argument("-i", dest="inputVCF", required=True,
help="input DEL VCF file", metavar="FILE")
parser.add_argument("-o", dest="outputBED", required=True,
help="output BED file")
args = parser.parse_args()
print("start processing VCF")
vcf_reader = vcf.Reader(open(args.inputVCF, 'r'))
output_bed = open(args.outputBED, 'w')
count = 0
for record in vcf_reader:
# print(record.INFO)
# if count == 10:
# break
count+=1
if count % 1000 == 0:
print(count)
if record.INFO["SVTYPE"] != "DEL":
continue
if "IMPRECISE" in record.INFO:
continue
output_bed.write(str(record.CHROM)+ "\t" + str(record.POS) + "\t" + str(record.INFO["END"]) + "\n")
output_bed.close()
#!/bin/bash
vt normalize -n -o vt_normalized.vcf -r /exports/sascstudent/cedrick/project-MinE/data/01-pilot/reference/Homo_sapiens_assembly38_phix_stableid.fasta /exports/sascstudent/cedrick/project-MinE/data/05-VCF-merging/phase1_callset/delly/HG00514_CHS_HGSV/HG00514_CHS_HGSV.all.vcf 2>&1 | tee vt_log.txt
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment