Commit a191f581 authored by Sander Bollen's avatar Sander Bollen
Browse files

write seqtk as bash script

parent 9256dcbf
......@@ -26,6 +26,7 @@ def fsrc_dir(*args):
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
seq = fsrc_dir("src", "seqtk.sh")
if FASTQ_COUNT is None:
fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
......@@ -144,26 +145,28 @@ rule seqtk_r1:
"""Either subsample or link forward fastq file"""
input:
stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
seqtk=seq
params:
max_bases=MAX_BASES
output:
fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
conda: "envs/seqtk.yml"
script: "src/seqtk.py"
shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
rule seqtk_r2:
"""Either subsample or link reverse fastq file"""
input:
stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
seqtk=seq
params:
max_bases = MAX_BASES
output:
fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
conda: "envs/seqtk.yml"
script: "src/seqtk.py"
shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
# contains original merged fastq files as input to prevent them from being prematurely deleted
......
......@@ -6,4 +6,6 @@ channels:
- r
dependencies:
- seqtk=1.2=0
- zlib=1.2.11=0
- bc=1.06=0
- sed=4.4=1
- zlib=1.2.11=0
\ No newline at end of file
"""
Little script from running seqtk with conda
Conda directives can't be used with a run directive,
so must be combined with script directive in stead.
This script assumes the following:
- a `snakemake` object exists,
- this object has the following attributes:
- input: a list of two items:
1. output of fastq-count as path to json file
2. a fastq file to be sub-sampled
- output: a list of one item containing path to output file
- params: a list of one item containing the max number of bases
- a `shell` function exists
This will _not_ work outside of a snakemake context.
"""
import json
from snakemake import shell
def subsample(json_path, fastq_path, opath, max_bases):
with open(json_path) as handle:
bases = json.load(handle)['bases']
if max_bases == "" or max_bases is None:
frac = 100
else:
frac = int(max_bases) / float(bases)
if frac >= 1:
cmd = "ln -s {0} {1}".format(fastq_path, opath)
else:
cmd = "seqtk sample -s100 {0} {1} | gzip -c > {2}".format(fastq_path,
frac,
opath)
print("executing")
print(cmd)
shell(cmd)
subsample(snakemake.input[0], snakemake.input[1],
snakemake.output[0], snakemake.params[0])
#!/usr/bin/env bash
count_json=${1}
input_fastq=${2}
output_fastq=${3}
max_bases=${4}
bases=$(jq '.bases' $count_json)
frac=$(jq -n "$max_bases / $bases" | sed -e "s:e:E:g")
echo $frac
if (( $(echo "$frac > 1" | bc -l) )); then
ln -s $input_fastq $output_fastq
else
seqtk sample -s100 $frac $input_fastq | gzip -c > $output_fastq
fi
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment