Skip to content
Snippets Groups Projects
Commit a191f581 authored by Sander Bollen's avatar Sander Bollen
Browse files

write seqtk as bash script

parent 9256dcbf
No related branches found
No related tags found
1 merge request!2Review comments
......@@ -26,6 +26,7 @@ def fsrc_dir(*args):
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
seq = fsrc_dir("src", "seqtk.sh")
if FASTQ_COUNT is None:
fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
......@@ -144,26 +145,28 @@ rule seqtk_r1:
"""Either subsample or link forward fastq file"""
input:
stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
seqtk=seq
params:
max_bases=MAX_BASES
output:
fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
conda: "envs/seqtk.yml"
script: "src/seqtk.py"
shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
rule seqtk_r2:
"""Either subsample or link reverse fastq file"""
input:
stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
seqtk=seq
params:
max_bases = MAX_BASES
output:
fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
conda: "envs/seqtk.yml"
script: "src/seqtk.py"
shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
# contains original merged fastq files as input to prevent them from being prematurely deleted
......
......@@ -6,4 +6,6 @@ channels:
- r
dependencies:
- seqtk=1.2=0
- zlib=1.2.11=0
- bc=1.06=0
- sed=4.4=1
- zlib=1.2.11=0
\ No newline at end of file
"""
Little script from running seqtk with conda
Conda directives can't be used with a run directive,
so must be combined with script directive in stead.
This script assumes the following:
- a `snakemake` object exists,
- this object has the following attributes:
- input: a list of two items:
1. output of fastq-count as path to json file
2. a fastq file to be sub-sampled
- output: a list of one item containing path to output file
- params: a list of one item containing the max number of bases
- a `shell` function exists
This will _not_ work outside of a snakemake context.
"""
import json
from snakemake import shell
def subsample(json_path, fastq_path, opath, max_bases):
with open(json_path) as handle:
bases = json.load(handle)['bases']
if max_bases == "" or max_bases is None:
frac = 100
else:
frac = int(max_bases) / float(bases)
if frac >= 1:
cmd = "ln -s {0} {1}".format(fastq_path, opath)
else:
cmd = "seqtk sample -s100 {0} {1} | gzip -c > {2}".format(fastq_path,
frac,
opath)
print("executing")
print(cmd)
shell(cmd)
subsample(snakemake.input[0], snakemake.input[1],
snakemake.output[0], snakemake.params[0])
#!/usr/bin/env bash
count_json=${1}
input_fastq=${2}
output_fastq=${3}
max_bases=${4}
bases=$(jq '.bases' $count_json)
frac=$(jq -n "$max_bases / $bases" | sed -e "s:e:E:g")
echo $frac
if (( $(echo "$frac > 1" | bc -l) )); then
ln -s $input_fastq $output_fastq
else
seqtk sample -s100 $frac $input_fastq | gzip -c > $output_fastq
fi
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment