From 9256dcbf4c75c153d187fcc20becc82f77dc52a5 Mon Sep 17 00:00:00 2001 From: Sander Bollen <a.h.b.bollen@lumc.nl> Date: Mon, 26 Feb 2018 17:13:54 +0100 Subject: [PATCH] vcfstats via vtools --- Snakefile | 3 +- envs/vcfstats.yml | 94 ++++++++++++++---------------- src/vcfstats.py | 145 ---------------------------------------------- 3 files changed, 46 insertions(+), 196 deletions(-) delete mode 100644 src/vcfstats.py diff --git a/Snakefile b/Snakefile index e91db83..f12cba5 100644 --- a/Snakefile +++ b/Snakefile @@ -25,7 +25,6 @@ def fsrc_dir(*args): covpy = fsrc_dir("src", "covstats.py") colpy = fsrc_dir("src", "collect_stats.py") -vs_py = fsrc_dir("src", "vcfstats.py") mpy = fsrc_dir("src", "merge_stats.py") if FASTQ_COUNT is None: @@ -458,7 +457,7 @@ rule vcfstats: output: stats=out_path("multisample/vcfstats.json") conda: "envs/vcfstats.yml" - shell: "python {input.vs_py} -i {input.vcf} > {output.stats}" + shell: "vtools-stats -i {input.vcf} > {output.stats}" ## collection diff --git a/envs/vcfstats.yml b/envs/vcfstats.yml index 7df8161..fcf7567 100644 --- a/envs/vcfstats.yml +++ b/envs/vcfstats.yml @@ -5,69 +5,65 @@ channels: - defaults - r dependencies: -- cyvcf2=0.8.0=py27_0 -- python-dateutil=2.3=py27_0 -- backports=1.0=py27_1 -- backports.functools_lru_cache=1.4=py27_1 -- backports_abc=0.5=py27_0 -- ca-certificates=2017.11.5=0 -- certifi=2017.11.5=py27_0 -- click=6.7=py27_0 -- coloredlogs=7.1=py27_0 -- cycler=0.10.0=py27_0 +- cyvcf2=0.8.4=py36_1 +- tqdm=4.7.2=py36_0 +- backports=1.0=py36_1 +- backports.functools_lru_cache=1.5=py36_0 +- blas=1.1=openblas +- ca-certificates=2018.1.18=0 +- certifi=2018.1.18=py36_0 +- click=6.7=py_1 +- coloredlogs=7.1=py36_0 +- cycler=0.10.0=py36_0 +- cython=0.27.3=py36_0 - dbus=1.10.22=0 - expat=2.2.5=0 -- fontconfig=2.12.1=6 -- freetype=2.7=1 -- functools32=3.2.3.2=py27_1 -- gettext=0.19.7=1 -- glib=2.53.5=0 +- fontconfig=2.12.6=0 +- freetype=2.8.1=0 +- gettext=0.19.8.1=0 +- glib=2.55.0=0 - gst-plugins-base=1.8.0=0 - gstreamer=1.8.0=1 -- humanfriendly=4.4=py27_0 +- humanfriendly=4.4=py36_0 - icu=58.2=0 - jpeg=9b=2 - libffi=3.2.1=3 - libiconv=1.15=0 -- libpng=1.6.28=1 +- libpng=1.6.34=0 - libxcb=1.12=1 -- libxml2=2.9.5=0 -- matplotlib=2.1.0=py27_1 +- libxml2=2.9.7=0 +- matplotlib=2.1.2=py36_0 - ncurses=5.9=10 -- openssl=1.0.2m=0 -- pandas=0.21.0=py27_0 -- patsy=0.4.1=py27_0 +- numpy=1.14.1=py36_blas_openblas_200 +- openblas=0.2.20=7 +- openssl=1.0.2n=0 +- pandas=0.22.0=py36_0 +- patsy=0.5.0=py36_0 - pcre=8.39=0 -- pip=9.0.1=py27_0 -- pyparsing=2.2.0=py27_0 -- pyqt=5.6.0=py27_4 -- python=2.7.14=0 -- pytz=2017.3=py_2 -- qt=5.6.2=3 -- readline=6.2=0 -- seaborn=0.8.1=py27_0 -- setuptools=37.0.0=py27_0 -- singledispatch=3.4.0.3=py27_0 -- sip=4.18=py27_1 -- six=1.11.0=py27_1 -- sqlite=3.13.0=1 -- ssl_match_hostname=3.5.0.1=py27_1 -- statsmodels=0.8.0=py27_0 -- subprocess32=3.2.7=py27_0 -- tk=8.5.19=2 -- tornado=4.5.2=py27_0 -- tqdm=4.19.4=py_0 -- wheel=0.30.0 +- pip=9.0.1=py36_1 +- pyparsing=2.2.0=py36_0 +- pyqt=5.6.0=py36_4 +- python=3.6.4=0 +- python-dateutil=2.6.1=py36_0 +- pytz=2018.3=py_0 +- qt=5.6.2=7 +- readline=7.0=0 +- scipy=1.0.0=py36_blas_openblas_201 +- seaborn=0.8.1=py36_0 +- setuptools=38.5.1=py36_0 +- sip=4.18=py36_1 +- six=1.11.0=py36_1 +- sqlite=3.20.1=2 +- statsmodels=0.8.0=py36_0 +- tk=8.6.7=0 +- tornado=4.5.3=py36_0 +- wheel=0.30.0=py36_2 - xorg-libxau=1.0.8=3 - xorg-libxdmcp=1.1.2=3 - xz=5.2.3=0 -- zlib=1.2.8=3 +- zlib=1.2.11=0 - libgcc=5.2.0=0 - libgfortran=3.0.0=1 -- mkl=2017.0.3=0 -- numpy=1.13.1=py27_0 -- scipy=0.19.1=np113py27_0 - pip: - - backports-abc==0.5 - - backports.functools-lru-cache==1.4 - - backports.ssl-match-hostname==3.5.0.1 + - backports.functools-lru-cache==1.5 + - "--editable git+git@git.lumc.nl:klinische-genetica/capture-lumc/vtools.git#egg=vtools" \ No newline at end of file diff --git a/src/vcfstats.py b/src/vcfstats.py deleted file mode 100644 index 8457498..0000000 --- a/src/vcfstats.py +++ /dev/null @@ -1,145 +0,0 @@ -import click -import cyvcf2 -import numpy -import json - -from tqdm import tqdm - -from collections import Counter - - -def gen_chrom_counter(vcf_reader): - """Generate chromosome counter from VCF reader""" - return Counter({n: 0 for n in vcf_reader.seqnames}) - - -class Sample(object): - def __init__(self, name, idx): - self.name = name - self.idx = idx - self.transversions = 0 - self.transitions = 0 - self.hom_ref = 0 - self.het = 0 - self.hom_alt = 0 - self.deletions = 0 - self.insertions = 0 - self.snps = 0 - - self.__gq_counter = Counter({i: 0 for i in range(100)}) - - @property - def ti_tv(self): - if self.transversions > 0: - return float(self.transitions)/self.transversions - return numpy.nan - - def add_variant(self, var): - """assuming gts012=True""" - typ = var.gt_types[self.idx] - if typ == 3: - return None - - gq = var.gt_quals[self.idx] - self.__gq_counter[gq] += 1 - - if typ == 0: - self.hom_ref += 1 - return None - - if typ == 1: - self.het += 1 - if typ == 2: - self.hom_alt += 1 - - if var.is_snp and var.is_transition: # this only works in python 2 for now. See: https://github.com/brentp/cyvcf2/pull/70 - self.transitions += 1 - self.snps += 1 - elif var.is_snp: - self.transversions += 1 - self.snps += 1 - elif var.is_indel and var.is_deletion: - self.deletions += 1 - elif var.is_indel: - self.insertions += 1 - - @property - def gq_distr(self): - return [self.__gq_counter[x] for x in range(100)] - - @property - def total_variants(self): - return self.hom_alt + self.het - - @property - def as_dict(self): - return { - "name": self.name, - "total_variants": self.total_variants, - "variant_types": { - "snps": self.snps, - "deletions": self.deletions, - "insertions": self.insertions, - }, - "genotypes": { - "hom_ref": self.hom_ref, - "het": self.het, - "hom_alt": self.hom_alt - }, - "transitions": self.transitions, - "transversions": self.transversions, - "ti_tv_ratio": self.ti_tv, - "gq_distribution": self.gq_distr - } - - -class Stats(object): - def __init__(self, vcf_path): - self.path = vcf_path - self.vcf = cyvcf2.VCF(vcf_path, gts012=True) - - self.samples = [Sample(x, i) for i, x in enumerate(self.vcf.samples)] - self.chrom_counter = gen_chrom_counter(self.vcf) - - self.__calculated = False - - def calculate(self): - for record in tqdm(self.vcf, unit="variants", unit_scale=True): - for s in self.samples: - s.add_variant(record) - self.chrom_counter[record.CHROM] += 1 - self.__calculated = True - - @property - def total_variants(self): - return sum(self.chrom_counter.values()) - - @property - def as_dict(self): - if not self.__calculated: - self.calculate() - return { - "vcf_path": self.path, - "total_variants": self.total_variants, - "samples": [s.as_dict for s in self.samples], - "per_chromosome_variants": {k: v for k, v in self.chrom_counter.items()} - } - - @property - def as_json(self): - return json.dumps(self.as_dict, sort_keys=True, indent=4) - - -@click.command() -@click.option("-i", - "--input", - type=click.Path(exists=True, dir_okay=False, readable=True), - required=True, - help="Input VCF file") -def main(input): - stats = Stats(input) - print(stats.as_json) - - -if __name__ == "__main__": - main() -- GitLab