From 9256dcbf4c75c153d187fcc20becc82f77dc52a5 Mon Sep 17 00:00:00 2001
From: Sander Bollen <a.h.b.bollen@lumc.nl>
Date: Mon, 26 Feb 2018 17:13:54 +0100
Subject: [PATCH] vcfstats via vtools

---
 Snakefile         |   3 +-
 envs/vcfstats.yml |  94 ++++++++++++++----------------
 src/vcfstats.py   | 145 ----------------------------------------------
 3 files changed, 46 insertions(+), 196 deletions(-)
 delete mode 100644 src/vcfstats.py

diff --git a/Snakefile b/Snakefile
index e91db83..f12cba5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -25,7 +25,6 @@ def fsrc_dir(*args):
 
 covpy = fsrc_dir("src", "covstats.py")
 colpy = fsrc_dir("src", "collect_stats.py")
-vs_py = fsrc_dir("src", "vcfstats.py")
 mpy = fsrc_dir("src", "merge_stats.py")
 
 if FASTQ_COUNT is None:
@@ -458,7 +457,7 @@ rule vcfstats:
     output:
         stats=out_path("multisample/vcfstats.json")
     conda: "envs/vcfstats.yml"
-    shell: "python {input.vs_py} -i {input.vcf} > {output.stats}"
+    shell: "vtools-stats -i {input.vcf} > {output.stats}"
 
 
 ## collection
diff --git a/envs/vcfstats.yml b/envs/vcfstats.yml
index 7df8161..fcf7567 100644
--- a/envs/vcfstats.yml
+++ b/envs/vcfstats.yml
@@ -5,69 +5,65 @@ channels:
 - defaults
 - r
 dependencies:
-- cyvcf2=0.8.0=py27_0
-- python-dateutil=2.3=py27_0
-- backports=1.0=py27_1
-- backports.functools_lru_cache=1.4=py27_1
-- backports_abc=0.5=py27_0
-- ca-certificates=2017.11.5=0
-- certifi=2017.11.5=py27_0
-- click=6.7=py27_0
-- coloredlogs=7.1=py27_0
-- cycler=0.10.0=py27_0
+- cyvcf2=0.8.4=py36_1
+- tqdm=4.7.2=py36_0
+- backports=1.0=py36_1
+- backports.functools_lru_cache=1.5=py36_0
+- blas=1.1=openblas
+- ca-certificates=2018.1.18=0
+- certifi=2018.1.18=py36_0
+- click=6.7=py_1
+- coloredlogs=7.1=py36_0
+- cycler=0.10.0=py36_0
+- cython=0.27.3=py36_0
 - dbus=1.10.22=0
 - expat=2.2.5=0
-- fontconfig=2.12.1=6
-- freetype=2.7=1
-- functools32=3.2.3.2=py27_1
-- gettext=0.19.7=1
-- glib=2.53.5=0
+- fontconfig=2.12.6=0
+- freetype=2.8.1=0
+- gettext=0.19.8.1=0
+- glib=2.55.0=0
 - gst-plugins-base=1.8.0=0
 - gstreamer=1.8.0=1
-- humanfriendly=4.4=py27_0
+- humanfriendly=4.4=py36_0
 - icu=58.2=0
 - jpeg=9b=2
 - libffi=3.2.1=3
 - libiconv=1.15=0
-- libpng=1.6.28=1
+- libpng=1.6.34=0
 - libxcb=1.12=1
-- libxml2=2.9.5=0
-- matplotlib=2.1.0=py27_1
+- libxml2=2.9.7=0
+- matplotlib=2.1.2=py36_0
 - ncurses=5.9=10
-- openssl=1.0.2m=0
-- pandas=0.21.0=py27_0
-- patsy=0.4.1=py27_0
+- numpy=1.14.1=py36_blas_openblas_200
+- openblas=0.2.20=7
+- openssl=1.0.2n=0
+- pandas=0.22.0=py36_0
+- patsy=0.5.0=py36_0
 - pcre=8.39=0
-- pip=9.0.1=py27_0
-- pyparsing=2.2.0=py27_0
-- pyqt=5.6.0=py27_4
-- python=2.7.14=0
-- pytz=2017.3=py_2
-- qt=5.6.2=3
-- readline=6.2=0
-- seaborn=0.8.1=py27_0
-- setuptools=37.0.0=py27_0
-- singledispatch=3.4.0.3=py27_0
-- sip=4.18=py27_1
-- six=1.11.0=py27_1
-- sqlite=3.13.0=1
-- ssl_match_hostname=3.5.0.1=py27_1
-- statsmodels=0.8.0=py27_0
-- subprocess32=3.2.7=py27_0
-- tk=8.5.19=2
-- tornado=4.5.2=py27_0
-- tqdm=4.19.4=py_0
-- wheel=0.30.0
+- pip=9.0.1=py36_1
+- pyparsing=2.2.0=py36_0
+- pyqt=5.6.0=py36_4
+- python=3.6.4=0
+- python-dateutil=2.6.1=py36_0
+- pytz=2018.3=py_0
+- qt=5.6.2=7
+- readline=7.0=0
+- scipy=1.0.0=py36_blas_openblas_201
+- seaborn=0.8.1=py36_0
+- setuptools=38.5.1=py36_0
+- sip=4.18=py36_1
+- six=1.11.0=py36_1
+- sqlite=3.20.1=2
+- statsmodels=0.8.0=py36_0
+- tk=8.6.7=0
+- tornado=4.5.3=py36_0
+- wheel=0.30.0=py36_2
 - xorg-libxau=1.0.8=3
 - xorg-libxdmcp=1.1.2=3
 - xz=5.2.3=0
-- zlib=1.2.8=3
+- zlib=1.2.11=0
 - libgcc=5.2.0=0
 - libgfortran=3.0.0=1
-- mkl=2017.0.3=0
-- numpy=1.13.1=py27_0
-- scipy=0.19.1=np113py27_0
 - pip:
-  - backports-abc==0.5
-  - backports.functools-lru-cache==1.4
-  - backports.ssl-match-hostname==3.5.0.1
+  - backports.functools-lru-cache==1.5
+  - "--editable git+git@git.lumc.nl:klinische-genetica/capture-lumc/vtools.git#egg=vtools"
\ No newline at end of file
diff --git a/src/vcfstats.py b/src/vcfstats.py
deleted file mode 100644
index 8457498..0000000
--- a/src/vcfstats.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import click
-import cyvcf2
-import numpy
-import json
-
-from tqdm import tqdm
-
-from collections import Counter
-
-
-def gen_chrom_counter(vcf_reader):
-    """Generate chromosome counter from VCF reader"""
-    return Counter({n: 0 for n in vcf_reader.seqnames})
-
-
-class Sample(object):
-    def __init__(self, name, idx):
-        self.name = name
-        self.idx = idx
-        self.transversions = 0
-        self.transitions = 0
-        self.hom_ref = 0
-        self.het = 0
-        self.hom_alt = 0
-        self.deletions = 0
-        self.insertions = 0
-        self.snps = 0
-
-        self.__gq_counter = Counter({i: 0 for i in range(100)})
-
-    @property
-    def ti_tv(self):
-        if self.transversions > 0:
-            return float(self.transitions)/self.transversions
-        return numpy.nan
-
-    def add_variant(self, var):
-        """assuming gts012=True"""
-        typ = var.gt_types[self.idx]
-        if typ == 3:
-            return None
-
-        gq = var.gt_quals[self.idx]
-        self.__gq_counter[gq] += 1
-
-        if typ == 0:
-            self.hom_ref += 1
-            return None
-
-        if typ == 1:
-            self.het += 1
-        if typ == 2:
-            self.hom_alt += 1
-
-        if var.is_snp and var.is_transition:  # this only works in python 2 for now. See: https://github.com/brentp/cyvcf2/pull/70
-            self.transitions += 1
-            self.snps += 1
-        elif var.is_snp:
-            self.transversions += 1
-            self.snps += 1
-        elif var.is_indel and var.is_deletion:
-            self.deletions += 1
-        elif var.is_indel:
-            self.insertions += 1
-
-    @property
-    def gq_distr(self):
-        return [self.__gq_counter[x] for x in range(100)]
-
-    @property
-    def total_variants(self):
-        return self.hom_alt + self.het
-
-    @property
-    def as_dict(self):
-        return {
-            "name": self.name,
-            "total_variants": self.total_variants,
-            "variant_types": {
-                "snps": self.snps,
-                "deletions": self.deletions,
-                "insertions": self.insertions,
-            },
-            "genotypes": {
-                "hom_ref": self.hom_ref,
-                "het": self.het,
-                "hom_alt": self.hom_alt
-            },
-            "transitions": self.transitions,
-            "transversions": self.transversions,
-            "ti_tv_ratio": self.ti_tv,
-            "gq_distribution": self.gq_distr
-        }
-
-
-class Stats(object):
-    def __init__(self, vcf_path):
-        self.path = vcf_path
-        self.vcf = cyvcf2.VCF(vcf_path, gts012=True)
-
-        self.samples = [Sample(x, i) for i, x in enumerate(self.vcf.samples)]
-        self.chrom_counter = gen_chrom_counter(self.vcf)
-
-        self.__calculated = False
-
-    def calculate(self):
-        for record in tqdm(self.vcf, unit="variants", unit_scale=True):
-            for s in self.samples:
-                s.add_variant(record)
-            self.chrom_counter[record.CHROM] += 1
-        self.__calculated = True
-
-    @property
-    def total_variants(self):
-        return sum(self.chrom_counter.values())
-
-    @property
-    def as_dict(self):
-        if not self.__calculated:
-            self.calculate()
-        return {
-            "vcf_path": self.path,
-            "total_variants": self.total_variants,
-            "samples": [s.as_dict for s in self.samples],
-            "per_chromosome_variants": {k: v for k, v in self.chrom_counter.items()}
-        }
-
-    @property
-    def as_json(self):
-        return json.dumps(self.as_dict, sort_keys=True, indent=4)
-
-
-@click.command()
-@click.option("-i",
-              "--input",
-              type=click.Path(exists=True, dir_okay=False, readable=True),
-              required=True,
-              help="Input VCF file")
-def main(input):
-    stats = Stats(input)
-    print(stats.as_json)
-
-
-if __name__ == "__main__":
-    main()
-- 
GitLab