diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 75fef90b4531dae80fe7679230d11cf9744652ec..ea2f40c45968ed52baff0153146e607fa7781a05 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,5 @@ build: tags: - docker script: - - pip install --upgrade Cython - - pip install --upgrade numpy - pip install --upgrade pip setuptools wheel - pip install '.' diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..3b7a5d1f706eb94c1f4d36915c7514d2f47bf292 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include vtools/*.pyx \ No newline at end of file diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4a76129e544bd3c898c465a651d6e17a469d5471 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,149 @@ +vtools +====== + +Little toolset operating over VCF files. Uses cyvcf2 and cython under +the hood for speed. + + +Tools +----- + +### vtools-filter + +Filter VCF files based on a few criteria. Will output both a filtered VCF +file, and a VCF file containing all the filtered-out variants. + +#### Filter criteria + +| name | meaning | optional | +| ---- | ------- | -------- | +| NON_CANONICAL | Non-canonical chromosome | Yes | +| INDEX_UNCALLED | Index uncalled or homozygous reference | Yes | +| TOO_HIGH_GONL_AF | Too high GonL allele frequency | Yes | +| TOO_HIGH_GNOMAD_AF | Too high GnomAD allele frequency | Yes | +| LOW_GQ | Too low GQ on index sample | Yes | +| DELETED_ALLELE | The only ALT allele is a deleted allele | No | + +#### Configuration + +Configuration of filters goes by a little JSON file. +See [here](cfg/example-filter.json) for an example. 
+ + +#### Usage + +```bash +Usage: vtools-filter [OPTIONS] + +Options: + -i, --input PATH Path to input VCF file [required] + -o, --output PATH Path to output (filtered) VCF file + [required] + -t, --trash PATH Path to trash VCF file [required] + -p, --params-file PATH Path to filter params json [required] + --index-sample TEXT Name of index sample [required] + --immediate-return / --no-immediate-return + Immediately write filters to file upon + hitting one filter criterium. Default = True + --help Show this message and exit. + +``` + +### vtools-stats + +Collects some general statistics about a VCF file, and writes a json to +stdout. + +#### Usage + +```bash +Usage: vtools-stats [OPTIONS] + +Options: + -i, --input FILE Input VCF file [required] + --help Show this message and exit. +``` + +### vtools-gcoverage + +Collect coverage metrics over a gVCF file for every exon or every transcript +in a refFlat file. This assumes the input VCF file is at least similar to +GATK's gVCF files. gVCF files are only expected to have one sample; if +your input file contains multiple samples, we simply take only the first. + +Output is a simple TSV file with the following columns + +| column | meaning | +| ------ | ------- | +| exon | exon number | +| gene | gene name / symbol / id | +| mean_dp | mean DP value over the exon | +| mean_gq | mean GQ value over the exon* | +| median_dp | median DP value over the exon | +| median_gq | median GQ value over the exon | +| perc_at_least_{10, 20, 30, 50, 100}_dp | Percentage of exon with DP value of at least the given value | +| perc_at_least_{10, 29, 30, 50, 90}_gq | Percentage of exon with GQ value of at least the given value | +| transcript | transcript name / symbol / id | + +*: mean GQ value is computed by first calculating the P-value of all GQ +values, then calculating the mean over these P-values, and lastly +converting this number back to a phred score. 
+ +#### Usage + +```bash +Usage: vtools-gcoverage [OPTIONS] + +Options: + -I, --input-gvcf PATH Path to input VCF file [required] + -R, --refflat-file PATH Path to refFlat file [required] + --per-exon / --per-transcript Collect metrics per exon or per transcript + --help Show this message and exit. +``` + +### vtools-evaluate + +Evaluate a VCF file against a baseline VCF file containing true positives. +We only consider variants that are present in both VCF files. This makes +it useful when the two VCF files have been produced by wildly different +technologies. E.g., when comparing a WES VCF file vs a SNP array, this +tool can be quite useful. + +Output is a simple JSON file listing counts of concordant and discordant +alleles. + +Multisample VCF files are allowed; the samples to be evaluated have to be set +through a CLI argument. + + +#### Usage + +```bash +Usage: vtools-evaluate [OPTIONS] + +Options: + -c, --call-vcf PATH Path to VCF with calls to be evaluated + [required] + -p, --positive-vcf PATH Path to VCF with known calls [required] + -cs, --call-samples TEXT Sample(s) in call-vcf to consider. May be + called multiple times [required] + -ps, --positive-samples TEXT Sample(s) in positive-vcf to consider. May be + called multiple times [required] + --help Show this message and exit. 
+``` + +## Installation + +* Python 3.6 at minimum +* numpy and cython must be installed prior to installing vtools + * this will get fixed in the very near future + +After both requirements have been met, simply install vtools with + +```bash +python setup.py install +``` + +## License + +MIT diff --git a/cfg/example-filter.json b/cfg/example-filter.json new file mode 100644 index 0000000000000000000000000000000000000000..0130356457567483466c33cc31fbc8593f807fea --- /dev/null +++ b/cfg/example-filter.json @@ -0,0 +1,9 @@ +{ + "canonical_chromosomes": true, + "gq_pass": 7, + "index_called": true, + "low_gnomad_af": 0.05, + "low_gonl_af": null, + "gnomad_vcf": "/path/to/vcf", + "gonl_vcf": null +} \ No newline at end of file diff --git a/setup.py b/setup.py index 7e18b07d2161d9d82b1ae727d79059b39f25efc8..4892ea274afbf40c28a2c01674b12164ec24ecca 100644 --- a/setup.py +++ b/setup.py @@ -7,20 +7,41 @@ setup.py :license: MIT """ from os.path import abspath, dirname, join +import sys +import pkg_resources +import subprocess from setuptools import setup, find_packages -try: - from Cython.Build import cythonize -except ImportError: - raise NotImplementedError("Installing cython on the fly not yet supported") +# Temporarily install dependencies required by setup.py before trying to +# import them. 
From https://bitbucket.org/dholth/setup-requires +sys.path[0:0] = ['setup-requires'] +pkg_resources.working_set.add_entry('setup-requires') -try: - import numpy as np -except ImportError: - raise NotImplementedError("Installing numpy on the fly not yet supported") +def missing_requirements(specifiers): + for specifier in specifiers: + try: + pkg_resources.require(specifier) + except pkg_resources.DistributionNotFound: + yield specifier + + +def install_requirements(specifiers): + to_install = list(specifiers) + if to_install: + cmd = [sys.executable, "-m", "pip", "install", + "-t", "setup-requires"] + to_install + subprocess.check_call(cmd)  # abort setup if pip fails + + +requires = ['cython', 'numpy'] +install_requirements(missing_requirements(requires)) + + +from Cython.Build import cythonize +import numpy as np readme_file = join(abspath(dirname(__file__)), "README.md") with open(readme_file) as desc_handle: @@ -32,13 +53,18 @@ for ext in cython_extensions: ext.include_dirs.append(np.get_include()) setup( - name="vtools", - version="0.0.1", + name="v-tools", + version="1.0.0", description="Various tools operating over VCF files", + long_description=long_desc, author="Sander Bollen", author_email="a.h.b.bollen@lumc.nl", + url="https://git.lumc.nl/klinische-genetica/capture-lumc/vtools", license="MIT", packages=find_packages(), + python_requires=">=3.6", + zip_safe=False, + include_package_data=True, install_requires=[ "click", "cyvcf2", @@ -55,6 +81,10 @@ setup( ] }, classifiers=[ + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Scientific/Engineering :: Bio-Informatics" ], ext_modules=cython_extensions diff --git a/vtools/__init__.py b/vtools/__init__.py index 3943b8bfe3980137859697c2aaecac96c527cf6d..fcd826dfa0dfee50e5362517fbb9f8d2a01e4ab5 100644 --- a/vtools/__init__.py +++ b/vtools/__init__.py @@ -7,4 +7,4 @@ vtools :license: MIT """ -__version__ = 
'0.0.1' +