Commit 2fb3b9f0 authored by van den Berg

Remove unneeded scripts, rules and containers

parent cfd0cdac
Pipeline #3608 failed with stages in 16 seconds
@@ -59,8 +59,7 @@ set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
set_default("collect_metrics", "src/collect_metrics.py")
set_default("merge_metrics", "src/merge_metrics.py")
set_default("cutadapt_summary", "src/cutadapt_summary.py")
containers = {
"bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
@@ -75,8 +74,6 @@ containers = {
"picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
"python3": "docker://python:3.6-slim",
"samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
"sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
"tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
"vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
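For context, rules select their execution image from this mapping via the singularity directive, as the rules further down in this diff do; a minimal sketch of the pattern, with a hypothetical rule name and output path:

    rule container_example:
        """ Hypothetical rule; only the containers lookup is the real pattern """
        output: "example/python_version.txt"
        singularity: containers["python3"]
        shell: "python3 --version > {output} 2>&1"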
@@ -120,9 +117,6 @@ rule all:
fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
sample_metrics = expand("{sample}/metrics.json", sample=settings['samples']),
metrics_json = "metrics.json",
metrics_tsv = "metrics.tsv",
coverage_stats = coverage_stats,
@@ -378,8 +372,7 @@ rule fastqc_postqc:
shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
## coverages
## coverage
rule covstats:
"""Calculate coverage statistics on bam file"""
input:
@@ -411,7 +404,7 @@ rule vtools_coverage:
shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"
# collection
## collection
if "bedfile" in settings:
rule collectstats:
"""Collect all stats for a particular sample with beds"""
@@ -421,7 +414,7 @@ if "bedfile" in settings:
unum="{sample}/bams/{sample}.unique.num",
ubnum="{sample}/bams/{sample}.usable.basenum",
cov="{sample}/coverage/covstats.json",
cutadapt = "{sample}/metrics.json",
cutadapt = "{sample}/cutadapt.json",
colpy=settings["collect_stats"]
params:
sample_name="{sample}",
@@ -443,7 +436,7 @@ else:
mbnum = "{sample}/bams/{sample}.mapped.basenum",
unum = "{sample}/bams/{sample}.unique.num",
ubnum = "{sample}/bams/{sample}.usable.basenum",
cutadapt = "{sample}/metrics.json",
cutadapt = "{sample}/cutadapt.json",
colpy = settings["collect_stats"]
params:
sample_name = "{sample}",
@@ -458,29 +451,18 @@ else:
"--cutadapt {input.cutadapt} "
"> {output}"
rule collect_metrics:
""" Colect metrics for each sample """
rule collect_cutadapt_summary:
""" Colect cutadapt summary from each readgroup per sample """
input:
cutadapt = lambda wildcards:
("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
read_group=read_group) for read_group in get_readgroup(wildcards)),
collect_metrics = settings["collect_metrics"]
output: "{sample}/metrics.json"
cutadapt_summary = settings["cutadapt_summary"]
output: "{sample}/cutadapt.json"
singularity: containers["python3"]
shell: "python {input.collect_metrics} --sample {wildcards.sample} "
shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
"--cutadapt-summary {input.cutadapt} > {output}"
rule merge_metrics:
""" Merge per sample metrics files """
input:
metrics = expand("{sample}/metrics.json", sample=settings["samples"]),
merge_metrics = settings["merge_metrics"]
output:
json = "metrics.json",
tsv = "metrics.tsv"
singularity: containers["python3"]
shell: "python {input.merge_metrics} --metrics {input.metrics} "
"--json {output.json} --tsv {output.tsv}"
rule merge_stats:
"""Merge all stats of all samples"""
@@ -511,11 +493,11 @@ rule multiqc:
Depends on stats.tsv to force this rule to run at the end of the pipeline
"""
input:
stats=expand("{sample}/metrics.json", sample=settings["samples"])
stats="stats.tsv"
params:
odir=".",
rdir="multiqc_report"
output:
"multiqc_report/multiqc_report.html"
singularity: containers["multiqc"]
shell: "multiqc -f -o {params.rdir} {params.odir} --ignore metrics.tsv || touch {output}"
shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"
# hutspot - a DNAseq variant calling pipeline
# Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
fastq-count.py
Pythonic equivalent to https://github.com/sndrtj/fastq-count
:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""
import click
import gzip
import json
from collections import namedtuple
class SimpleFastqRecord(namedtuple("SimpleFastqRecord",
["read_id", "sequence", "qualities"])):
def __str__(self):
return "{0}{1}+\n{2}".format(self.read_id.decode(),
self.sequence.decode(),
self.qualities.decode())
def __bytes__(self):
return self.read_id + self.sequence + b"+\n" + self.qualities
@property
def id(self):
try:
return self._id
except AttributeError:
self._id = self.read_id.strip().split()[0][1:]
return self._id
@property
def seq(self):
return self.sequence.strip().decode()
class SimpleFastqParser(object):
"""
An iterator returning SimpleFastqRecord objects
:arg handle: Any iterator returning lines of bytestrings,
preferably an open file handle in rb mode.
:return SimpleFastqRecord objects
"""
def __init__(self, handle):
self.__handle = handle
self.__bucket = [None, None, None]
def __iter__(self):
return self
def __next__(self):
i = 0
while i < 3:
line = next(self.__handle)
if line == b"+\n":
continue
self.__bucket[i] = line
i += 1
read = SimpleFastqRecord(self.__bucket[0], self.__bucket[1],
self.__bucket[2])
self.__bucket = [None, None, None]
return read
def next(self): # python 2 compatibility
return self.__next__()
def close(self):
self.__handle.close()
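# Example usage (sketch; the file name is assumed):
#
#     with gzip.open("reads_R1.fastq.gz", "rb") as handle:
#         for record in SimpleFastqParser(handle):
#             print(record.id, len(record.seq))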
def count(handle_r1, handle_r2):
"""
    Count reads and bases for two fastq file handles opened in rb mode
    :param handle_r1: handle for the forward (R1) fastq file
    :param handle_r2: handle for the reverse (R2) fastq file
    :return: dict with combined "reads" and "bases" counts
"""
p = SimpleFastqParser(handle_r1)
reads = 0
bases = 0
for r in p:
reads += 1
bases += len(r.seq)
p2 = SimpleFastqParser(handle_r2)
for r2 in p2:
reads += 1
bases += len(r2.seq)
return {"reads": reads, "bases": bases}
@click.command()
@click.argument("r1",
type=click.Path(dir_okay=False, readable=True, exists=True),
required=True)
@click.argument("r2",
type=click.Path(dir_okay=False, readable=True, exists=True),
required=True)
def main(r1, r2):
if r1.endswith(".gz"):
r1_handle = gzip.open(r1, mode="rb")
else:
r1_handle = open(r1, mode="rb")
if r2.endswith(".gz"):
r2_handle = gzip.open(r2, mode="rb")
else:
r2_handle = open(r2, mode="rb")
d = count(r1_handle, r2_handle)
print(json.dumps(d))
if __name__ == "__main__":
main()
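# Example invocation (sketch; file names assumed, counts are illustrative
# and span both files):
#
#     $ python fastq-count.py micro_R1.fastq.gz micro_R2.fastq.gz
#     {"reads": 15440, "bases": 2276743}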
# hutspot - a DNAseq variant calling pipeline
# Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
fastqc_stats.py
~~~~~~~~~~~~~~~
:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""
import argparse
import json
from pathlib import Path
from typing import List, Union
from copy import deepcopy
from warnings import warn
import zipfile
DO_NOT_TWO_COLUMN = ["Sequence_Length_Distribution",
"Per_sequence_quality_scores"]
def try_numeric(val: str) -> Union[str, int, float]:
try:
value = int(val)
except ValueError:
try:
value = float(val)
except ValueError:
value = val
return value
class FastqcModule(object):
"""Helper class for Fastqc Modules"""
def __init__(self, name):
self.name = name
self.header = None
self.rows = []
def add_header(self, headerstring: str):
self.header = headerstring.strip().split("\t")
def add_row(self, rowstring: str):
self.rows.append(rowstring.strip().split("\t"))
def row_as_dict(self, row) -> dict:
return {k.replace(" ", "_"): try_numeric(row[i])
for i, k in enumerate(self.header)}
def to_dict_list(self) -> Union[List, List[dict]]:
# The semantics herein _ONLY_ work for fastqc version 0.11.5
# If the version in the pipeline changes, we MUST revisit this section
if self.header is None:
return []
if (all([len(x) == 2 for x in self.rows])
and len(self.header) == 2
and self.name not in DO_NOT_TWO_COLUMN):
# two-column data is returned as a single list
# second column is assumed to contain all the data
return [try_numeric(x[1]) for x in self.rows]
return [self.row_as_dict(x) for x in self.rows]
def extract_data_txt(zip_path: Path, encoding='utf-8') -> str:
"""
Extract text of data.txt from fastqc zip file
Returns an empty string when the file is completely empty
"""
if zip_path.stat().st_size == 0:
warn(f"File {str(zip_path)} is empty")
return ""
with zipfile.ZipFile(zip_path) as zp:
iflist = zp.infolist()
data_info = next(x for x in iflist
if x.filename.endswith("fastqc_data.txt"))
with zp.open(data_info.filename) as data_handle:
return data_handle.read().decode(encoding)
def make_data_modules(data: str) -> List[FastqcModule]:
"""Make fastqc modules from data"""
modules = []
in_module = False
cur_module = None
for line in data.split("\n"):
if not in_module and line.startswith(">>"):
name = line.split(">>")[1].split("\t")[0].replace(" ", "_")
cur_module = FastqcModule(name)
in_module = True
elif in_module and line.startswith(">>END_MODULE"):
modules.append(deepcopy(cur_module))
cur_module = None
in_module = False
elif in_module and line.startswith("#"):
cur_module.add_header(line.split("#")[1])
elif in_module:
cur_module.add_row(line)
return modules
def data_to_dict(data: str, exclusions: List[str]) -> dict:
"""Create dictionary from data"""
modules = filter(lambda x: x.name not in exclusions,
make_data_modules(data))
return {m.name: m.to_dict_list() for m in modules}
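# Example output shape for data_to_dict (sketch; Per_base_sequence_quality
# is a real fastqc 0.11.5 module name, the values are illustrative):
#
#     {"Per_base_sequence_quality": [
#         {"Base": 1, "Mean": 32.5, "Median": 33.0},
#         ...
#     ]}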
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--preqc-r1",
type=Path,
required=True,
help="Fastqc zip file for forward strand of "
"merged fastq file before pre-processing")
parser.add_argument("--preqc-r2",
type=Path,
required=True,
help="Fastqc zip file for reverse strand of "
"merged fastq file before pre-processing")
parser.add_argument("--postqc-r1",
type=Path,
required=True,
help="Fastqc zip file for forward strand of "
"fastq file after pre-processing")
parser.add_argument("--postqc-r2",
type=Path,
required=True,
help="Fastqc zip for file reverse strand of "
"fastq file after pre-processing")
parser.add_argument("--exclude-modules",
action="append",
default=["Per_tile_sequence_quality",
"Basic_Statistics"],
help="Fastqc modules to exclude")
args = parser.parse_args()
excl = args.exclude_modules
data_prr1 = extract_data_txt(args.preqc_r1)
data_prr2 = extract_data_txt(args.preqc_r2)
data_por1 = extract_data_txt(args.postqc_r1)
data_por2 = extract_data_txt(args.postqc_r2)
d = {
"preqc_r1": data_to_dict(data_prr1, excl),
"preqc_r2": data_to_dict(data_prr2, excl),
"postqc_r1": data_to_dict(data_por1, excl),
"postqc_r2": data_to_dict(data_por2, excl)
}
print(json.dumps(d))
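# Example invocation (sketch; the zip paths are assumed):
#
#     python fastqc_stats.py \
#         --preqc-r1 raw_R1_fastqc.zip --preqc-r2 raw_R2_fastqc.zip \
#         --postqc-r1 trimmed_R1_fastqc.zip --postqc-r2 trimmed_R2_fastqc.zip \
#         > fastqc_stats.json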
#!/usr/bin/env python3
import argparse
import json
def print_tsv(data, filename):
""" Flatten data and print in tsv format """
    # Determine the header from the first sample; all samples are
    # assumed to share the same metrics fields
    for sample in data:
        header = sorted(data[sample].keys())
        break
# Print the header, and then for each sample the data
with open(filename, 'w') as fout:
print('sample_name', *header, sep='\t', file=fout)
for sample in data:
print(sample, *(data[sample][field] for field in header), sep='\t',
file=fout)
def main(args):
data = dict()
for filename in args.metrics:
with open(filename, 'rt') as fin:
metrics = json.load(fin)
name = metrics.pop('sample_name')
data[name] = metrics
if args.json:
with open(args.json, 'wt') as fout:
json.dump(data, fout, indent=2)
if args.tsv:
print_tsv(data, args.tsv)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
    parser.add_argument('--metrics',
                        required=True,
                        help='Metrics json files',
                        nargs='+')
parser.add_argument('--json', required=False)
parser.add_argument('--tsv', required=False)
args = parser.parse_args()
main(args)
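# Example invocation, mirroring the removed merge_metrics rule above
# (this script is presumably src/merge_metrics.py):
#
#     python src/merge_metrics.py --metrics micro/metrics.json \
#         --json metrics.json --tsv metrics.tsv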
#!/usr/bin/env bash
# hutspot - a DNAseq variant calling pipeline
# Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
set -ex
set -o pipefail
count_json=${1}
input_fastq=${2}
output_fastq=${3}
max_bases=${4}
full_input=$(readlink -f ${input_fastq})
# Symlink the input when no base limit is set (max_bases empty or 'None');
# the original string comparison used -eq, which is for integers
if [[ -z ${max_bases} || ${max_bases} == 'None' ]]; then
    ln -s ${full_input} ${output_fastq}
    exit 0
fi
bases=$(jq '.bases' $count_json)
# Fraction of bases to keep; jq can emit scientific notation with a
# lowercase 'e', which is uppercased here, presumably for the tools downstream
frac=$(jq -n "$max_bases / $bases" | sed -e "s:e:E:g")
echo $frac
frac_higher_than_one=$(echo "${frac} > 1" | bc )
if [[ ${frac_higher_than_one} -eq 1 ]]; then
ln -s ${full_input} ${output_fastq}
else
seqtk sample -s100 ${input_fastq} ${frac} | gzip -c > ${output_fastq}
fi
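# Example invocation (sketch; the script and file names are assumed):
#
#     bash downsample.sh counts.json sample_R1.fastq.gz sampled_R1.fastq.gz 1000000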
@@ -185,10 +185,10 @@
must_not_contain:
- "rror"
files:
- path: "metrics.tsv"
- path: "stats.tsv"
contains:
- "sample_name\tpostqc_bases\tpostqc_reads\tpreqc_bases\tpreqc_reads"
- "micro\t2274413\t15440\t2276743\t15440"
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\tcovstats.json_median_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t133.0"
- name: test-integration-two-known-sites
tags:
@@ -241,14 +241,7 @@
- path: "micro/pre_process/raw-micro-lib_02/micro_rg2_R2_fastqc.zip"
- path: "micro/pre_process/micro-lib_01.txt"
- path: "micro/pre_process/micro-lib_02.txt"
- path: "metrics.tsv"
- path: "metrics.json"
- path: "micro/metrics.json"
contains:
- "sample_name\": \"micro"
- "preqc_reads\": 15440"
- "postqc_reads\": 15440"
- path: "stats.tsv"
- path: "micro/pre_process/micro-lib_01.txt"
contains:
- "status\tin_reads\tin_bp\ttoo_short\ttoo_long\ttoo_many_n\tout_reads\tw/adapters\tqualtrim_bp\tout_bp\tw/adapters2\tqualtrim2_bp\tout2_bp"
@@ -287,8 +280,8 @@
- path: "micro2/vcf/micro2.g.vcf.gz"
- path: "micro1/bams/micro1.markdup.bam"
- path: "micro2/bams/micro2.markdup.bam"
- path: "metrics.tsv"
- path: "stats.tsv"
contains:
- "sample_name\tpostqc_bases\tpostqc_reads\tpreqc_bases\tpreqc_reads"
- "micro1"
- "micro2\t1137997\t7720\t1139177\t7720"
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases"
- "micro1\t7720\t1137566\t7720\t1136416"
- "micro2\t7720\t1139177\t7720\t1137997"