Commit 9256dcbf authored by Sander Bollen's avatar Sander Bollen
Browse files

vcfstats via vtools

parent b4a6ad0e
......@@ -25,7 +25,6 @@ def fsrc_dir(*args):
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
vs_py = fsrc_dir("src", "vcfstats.py")
mpy = fsrc_dir("src", "merge_stats.py")
if FASTQ_COUNT is None:
......@@ -458,7 +457,7 @@ rule vcfstats:
output:
stats=out_path("multisample/vcfstats.json")
conda: "envs/vcfstats.yml"
shell: "python {input.vs_py} -i {input.vcf} > {output.stats}"
shell: "vtools-stats -i {input.vcf} > {output.stats}"
## collection
......
......@@ -5,69 +5,65 @@ channels:
- defaults
- r
dependencies:
- cyvcf2=0.8.0=py27_0
- python-dateutil=2.3=py27_0
- backports=1.0=py27_1
- backports.functools_lru_cache=1.4=py27_1
- backports_abc=0.5=py27_0
- ca-certificates=2017.11.5=0
- certifi=2017.11.5=py27_0
- click=6.7=py27_0
- coloredlogs=7.1=py27_0
- cycler=0.10.0=py27_0
- cyvcf2=0.8.4=py36_1
- tqdm=4.7.2=py36_0
- backports=1.0=py36_1
- backports.functools_lru_cache=1.5=py36_0
- blas=1.1=openblas
- ca-certificates=2018.1.18=0
- certifi=2018.1.18=py36_0
- click=6.7=py_1
- coloredlogs=7.1=py36_0
- cycler=0.10.0=py36_0
- cython=0.27.3=py36_0
- dbus=1.10.22=0
- expat=2.2.5=0
- fontconfig=2.12.1=6
- freetype=2.7=1
- functools32=3.2.3.2=py27_1
- gettext=0.19.7=1
- glib=2.53.5=0
- fontconfig=2.12.6=0
- freetype=2.8.1=0
- gettext=0.19.8.1=0
- glib=2.55.0=0
- gst-plugins-base=1.8.0=0
- gstreamer=1.8.0=1
- humanfriendly=4.4=py27_0
- humanfriendly=4.4=py36_0
- icu=58.2=0
- jpeg=9b=2
- libffi=3.2.1=3
- libiconv=1.15=0
- libpng=1.6.28=1
- libpng=1.6.34=0
- libxcb=1.12=1
- libxml2=2.9.5=0
- matplotlib=2.1.0=py27_1
- libxml2=2.9.7=0
- matplotlib=2.1.2=py36_0
- ncurses=5.9=10
- openssl=1.0.2m=0
- pandas=0.21.0=py27_0
- patsy=0.4.1=py27_0
- numpy=1.14.1=py36_blas_openblas_200
- openblas=0.2.20=7
- openssl=1.0.2n=0
- pandas=0.22.0=py36_0
- patsy=0.5.0=py36_0
- pcre=8.39=0
- pip=9.0.1=py27_0
- pyparsing=2.2.0=py27_0
- pyqt=5.6.0=py27_4
- python=2.7.14=0
- pytz=2017.3=py_2
- qt=5.6.2=3
- readline=6.2=0
- seaborn=0.8.1=py27_0
- setuptools=37.0.0=py27_0
- singledispatch=3.4.0.3=py27_0
- sip=4.18=py27_1
- six=1.11.0=py27_1
- sqlite=3.13.0=1
- ssl_match_hostname=3.5.0.1=py27_1
- statsmodels=0.8.0=py27_0
- subprocess32=3.2.7=py27_0
- tk=8.5.19=2
- tornado=4.5.2=py27_0
- tqdm=4.19.4=py_0
- wheel=0.30.0
- pip=9.0.1=py36_1
- pyparsing=2.2.0=py36_0
- pyqt=5.6.0=py36_4
- python=3.6.4=0
- python-dateutil=2.6.1=py36_0
- pytz=2018.3=py_0
- qt=5.6.2=7
- readline=7.0=0
- scipy=1.0.0=py36_blas_openblas_201
- seaborn=0.8.1=py36_0
- setuptools=38.5.1=py36_0
- sip=4.18=py36_1
- six=1.11.0=py36_1
- sqlite=3.20.1=2
- statsmodels=0.8.0=py36_0
- tk=8.6.7=0
- tornado=4.5.3=py36_0
- wheel=0.30.0=py36_2
- xorg-libxau=1.0.8=3
- xorg-libxdmcp=1.1.2=3
- xz=5.2.3=0
- zlib=1.2.8=3
- zlib=1.2.11=0
- libgcc=5.2.0=0
- libgfortran=3.0.0=1
- mkl=2017.0.3=0
- numpy=1.13.1=py27_0
- scipy=0.19.1=np113py27_0
- pip:
- backports-abc==0.5
- backports.functools-lru-cache==1.4
- backports.ssl-match-hostname==3.5.0.1
- backports.functools-lru-cache==1.5
- "--editable git+git@git.lumc.nl:klinische-genetica/capture-lumc/vtools.git#egg=vtools"
\ No newline at end of file
import click
import cyvcf2
import numpy
import json
from tqdm import tqdm
from collections import Counter
def gen_chrom_counter(vcf_reader):
"""Generate chromosome counter from VCF reader"""
return Counter({n: 0 for n in vcf_reader.seqnames})
class Sample(object):
def __init__(self, name, idx):
self.name = name
self.idx = idx
self.transversions = 0
self.transitions = 0
self.hom_ref = 0
self.het = 0
self.hom_alt = 0
self.deletions = 0
self.insertions = 0
self.snps = 0
self.__gq_counter = Counter({i: 0 for i in range(100)})
@property
def ti_tv(self):
if self.transversions > 0:
return float(self.transitions)/self.transversions
return numpy.nan
def add_variant(self, var):
"""assuming gts012=True"""
typ = var.gt_types[self.idx]
if typ == 3:
return None
gq = var.gt_quals[self.idx]
self.__gq_counter[gq] += 1
if typ == 0:
self.hom_ref += 1
return None
if typ == 1:
self.het += 1
if typ == 2:
self.hom_alt += 1
if var.is_snp and var.is_transition: # this only works in python 2 for now. See: https://github.com/brentp/cyvcf2/pull/70
self.transitions += 1
self.snps += 1
elif var.is_snp:
self.transversions += 1
self.snps += 1
elif var.is_indel and var.is_deletion:
self.deletions += 1
elif var.is_indel:
self.insertions += 1
@property
def gq_distr(self):
return [self.__gq_counter[x] for x in range(100)]
@property
def total_variants(self):
return self.hom_alt + self.het
@property
def as_dict(self):
return {
"name": self.name,
"total_variants": self.total_variants,
"variant_types": {
"snps": self.snps,
"deletions": self.deletions,
"insertions": self.insertions,
},
"genotypes": {
"hom_ref": self.hom_ref,
"het": self.het,
"hom_alt": self.hom_alt
},
"transitions": self.transitions,
"transversions": self.transversions,
"ti_tv_ratio": self.ti_tv,
"gq_distribution": self.gq_distr
}
class Stats(object):
def __init__(self, vcf_path):
self.path = vcf_path
self.vcf = cyvcf2.VCF(vcf_path, gts012=True)
self.samples = [Sample(x, i) for i, x in enumerate(self.vcf.samples)]
self.chrom_counter = gen_chrom_counter(self.vcf)
self.__calculated = False
def calculate(self):
for record in tqdm(self.vcf, unit="variants", unit_scale=True):
for s in self.samples:
s.add_variant(record)
self.chrom_counter[record.CHROM] += 1
self.__calculated = True
@property
def total_variants(self):
return sum(self.chrom_counter.values())
@property
def as_dict(self):
if not self.__calculated:
self.calculate()
return {
"vcf_path": self.path,
"total_variants": self.total_variants,
"samples": [s.as_dict for s in self.samples],
"per_chromosome_variants": {k: v for k, v in self.chrom_counter.items()}
}
@property
def as_json(self):
return json.dumps(self.as_dict, sort_keys=True, indent=4)
@click.command()
@click.option("-i",
"--input",
type=click.Path(exists=True, dir_okay=False, readable=True),
required=True,
help="Input VCF file")
def main(input):
stats = Stats(input)
print(stats.as_json)
if __name__ == "__main__":
main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment