Commit 583aed74 authored by van den Berg
Browse files

Consolidate collectstats into a single rule

The implementation is a bit hacky, since snakemake does not allow for
optional input files. As a workaround, "." is passed when the bedfile is
not defined, and the collect_stats.py script has been made aware of the
special meaning of "."

Additionally, Click has been removed as a dependency for collect stats,
and the structure of the stats.json file has been updated to only allow
for a single entry of coverage stats instead of a list. This has been
done to match an earlier change in Hutspot where support for multiple
bed files has been dropped.
parent e64950fe
......@@ -416,48 +416,26 @@ rule collect_cutadapt_summary:
shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
"--cutadapt-summary {input.cutadapt} > {output}"
## collection
# collectstats is defined in two variants; which one is registered depends on
# whether a "bedfile" entry is present in the config.
if "bedfile" in config:
rule collectstats:
"""Collect all stats for a particular sample with beds"""
input:
mnum = rules.mapped_reads_bases.output.reads,
mbnum = rules.mapped_reads_bases.output.bases,
unum = rules.unique_reads_bases.output.reads,
ubnum = rules.unique_reads_bases.output.bases,
# Coverage json produced by the covstats rule; only this variant has it
cov = rules.covstats.output.covj,
cutadapt = rules.collect_cutadapt_summary.output,
colpy = config["collect_stats"]
params:
fthresh = config["female_threshold"]
output: "{sample}/{sample}.stats.json"
container: containers["vtools"]
# The script prints to stdout; the redirect writes it to the output file.
# {input.cov} is passed as the trailing positional argument.
shell: "python {input.colpy} --sample-name {wildcards.sample} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"{input.cov} > {output}"
else:
rule collectstats:
"""Collect all stats for a particular sample without beds"""
input:
mnum = rules.mapped_reads_bases.output.reads,
mbnum = rules.mapped_reads_bases.output.bases,
unum = rules.unique_reads_bases.output.reads,
ubnum = rules.unique_reads_bases.output.bases,
cutadapt = rules.collect_cutadapt_summary.output,
colpy = config["collect_stats"]
params:
fthresh = config["female_threshold"]
output: "{sample}/{sample}.stats.json"
container: containers["vtools"]
# Same command as the variant above, minus the coverage argument
shell: "python {input.colpy} --sample-name {wildcards.sample} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"> {output}"
# Single collectstats rule covering both the bedfile and no-bedfile setups.
rule collectstats:
"""Collect all stats for a particular sample"""
input:
mnum = rules.mapped_reads_bases.output.reads,
mbnum = rules.mapped_reads_bases.output.bases,
unum = rules.unique_reads_bases.output.reads,
ubnum = rules.unique_reads_bases.output.bases,
# snakemake has no optional inputs, so when no bedfile is configured "." is
# passed as a placeholder; collect_stats.py treats "." as "no coverage file".
cov = rules.covstats.output.covj if "bedfile" in config else ".",
cutadapt = rules.collect_cutadapt_summary.output,
colpy = config["collect_stats"]
params:
fthresh = config["female_threshold"]
output: "{sample}/{sample}.stats.json"
container: containers["python3"]
# The script prints the json to stdout; the redirect writes it to the output.
# {input.cov} fills the positional covstats argument (a path, or ".").
shell: "python {input.colpy} --sample-name {wildcards.sample} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"{input.cov} > {output}"
rule merge_stats:
"""Merge all stats of all samples"""
......
......@@ -21,7 +21,7 @@ collect_stats.py
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""
import click
import argparse
import json
from os.path import basename
......@@ -61,59 +61,16 @@ def determine_gender(covstat, fthresh):
return "male"
@click.command()
@click.option("--sample-name",
type=click.STRING,
required=True,
help="Sample name")
@click.option("--mapped-num",
type=click.Path(dir_okay=False, exists=True, readable=True),
required=True,
help="Mapped num file")
@click.option("--mapped-basenum",
type=click.Path(dir_okay=False, exists=True, readable=True),
required=True,
help="Mapped basenum file")
@click.option("--unique-num",
type=click.Path(dir_okay=False, exists=True, readable=True),
required=True,
help="Unique num file")
@click.option("--usable-basenum",
type=click.Path(dir_okay=False, exists=True, readable=True),
required=True,
help="Usable basenum")
@click.option("--female-threshold",
type=click.FLOAT,
default=0.6,
help="Female threshold of X/all cov")
@click.option("--cutadapt",
type=click.Path(dir_okay=False, exists=True, readable=True),
help="Cutadapt summary output")
@click.argument("covstats",
type=click.Path(dir_okay=False, exists=True, readable=True),
nargs=-1)
def main(sample_name, mapped_num, mapped_basenum,
unique_num, usable_basenum, female_threshold, covstats, cutadapt):
mpnum = parse_num_file(mapped_num)
mpbnum = parse_num_file(mapped_basenum)
unum = parse_num_file(unique_num)
ubnum = parse_num_file(usable_basenum)
cutadapt = parse_json_file(cutadapt)
covl = []
for c in covstats:
cd = parse_json_file(c)
cdd = {
"name": basename(c),
"gender": determine_gender(cd, female_threshold),
"covstats": cd
}
covl.append(cdd)
def main(args):
mpnum = parse_num_file(args.mapped_num)
mpbnum = parse_num_file(args.mapped_basenum)
unum = parse_num_file(args.unique_num)
ubnum = parse_num_file(args.usable_basenum)
cutadapt = parse_json_file(args.cutadapt)
d = {
"sample_name": sample_name,
"sample_name": args.sample_name,
"preqc_reads": cutadapt["preqc_reads"],
"preqc_bases": cutadapt["preqc_bases"],
"postqc_reads": cutadapt["postqc_reads"],
......@@ -121,12 +78,46 @@ def main(sample_name, mapped_num, mapped_basenum,
"n_mapped_reads": mpnum,
"n_mapped_bases": mpbnum,
"n_usable_reads": unum,
"n_usable_bases": ubnum,
"covstats": covl
"n_usable_bases": ubnum
}
# "." is used to pass an 'empty' file from snakemake, since all snakemake
# inputs must be files or folders which exist
if args.covstats != ".":
# Read the json file
covstats = parse_json_file(args.covstats)
# Format the coverage data and determine the gender
cov_data = {
"name": basename(args.covstats),
"gender": determine_gender(covstats, args.female_threshold),
"covstats": covstats
}
# Add the coverage data
d["covstats"] = cov_data
print(json.dumps(d))
def build_parser():
    """Build the command-line parser for collect_stats.py.

    Options:
        --sample-name       Sample name (required).
        --mapped-num        Mapped num file (required).
        --mapped-basenum    Mapped basenum file (required).
        --unique-num        Unique num file (required).
        --usable-basenum    Usable basenum file (required).
        --female-threshold  Female threshold of X/all coverage, parsed as a
                            float; defaults to 0.6.
        --cutadapt          Cutadapt summary output (required).

    Positional:
        covstats            Coverage statistics json, or "." when there is
                            no coverage file to read.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample-name", required=True,
                        help="Sample name")
    parser.add_argument("--mapped-num", required=True,
                        help="Mapped num file")
    parser.add_argument("--mapped-basenum", required=True,
                        help="Mapped basenum file")
    parser.add_argument("--unique-num", required=True,
                        help="Unique num file")
    parser.add_argument("--usable-basenum", required=True,
                        help="Usable basenum")
    # argparse leaves command-line values as str unless a type is given; the
    # threshold is handed to determine_gender as a number (the click version
    # declared it FLOAT), so convert it here instead of passing a string on.
    parser.add_argument("--female-threshold", type=float, default=0.6,
                        help="Female threshold of X/all cov")
    parser.add_argument("--cutadapt", required=True,
                        help="Cutadapt summary output")
    parser.add_argument("covstats",
                        help="Coverage statistics")
    return parser


if __name__ == "__main__":
    # main() takes the parsed namespace; it must not be called without it.
    main(build_parser().parse_args())
......@@ -86,8 +86,7 @@ if __name__ == "__main__":
"usable_bases": sample['n_usable_bases'],
})
if "covstats" in sample:
for cov_d in sample['covstats']:
sample_dict.update(get_covstats(cov_d))
sample_dict.update(get_covstats(sample['covstats']))
sdicts.append(sample_dict)
lens = [len(list(x.keys())) for x in sdicts]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment