write seqtk as bash script

a191f581 · Sander Bollen · 9256dcbf · a191f581 · a191f581 · 9256dcbf
Commit a191f581 authored 7 years ago by Sander Bollen
--- a/Snakefile
+++ b/Snakefile
@@ -26,6 +26,7 @@ def fsrc_dir(*args):
 covpy = fsrc_dir("src", "covstats.py")
 colpy = fsrc_dir("src", "collect_stats.py")
 mpy = fsrc_dir("src", "merge_stats.py")
+seq = fsrc_dir("src", "seqtk.sh")

 if FASTQ_COUNT is None:
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
@@ -144,26 +145,28 @@ rule seqtk_r1:
    """Either subsample or link forward fastq file"""
    input:
        stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
-        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
+        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
+        seqtk=seq
    params:
        max_bases=MAX_BASES
    output:
        fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
    conda: "envs/seqtk.yml"
-    script: "src/seqtk.py"
+    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"


 rule seqtk_r2:
    """Either subsample or link reverse fastq file"""
    input:
        stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
-        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
+        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
+        seqtk=seq
    params:
        max_bases = MAX_BASES
    output:
        fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
    conda: "envs/seqtk.yml"
-    script: "src/seqtk.py"
+    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"


 # contains original merged fastq files as input to prevent them from being prematurely deleted

--- a/envs/seqtk.yml
+++ b/envs/seqtk.yml
@@ -6,4 +6,6 @@ channels:
 - r
 dependencies:
 - seqtk=1.2=0
- zlib=1.2.11=0
+- bc=1.06=0
+- sed=4.4=1
+- zlib=1.2.11=0
\ No newline at end of file
--- a/src/seqtk.py
+++ b/src/seqtk.py
-"""
-Little script from running seqtk with conda
-
-Conda directives can't be used with a run directive,
-so must be combined with script directive in stead.
-
-This script assumes the following:
- - a `snakemake` object exists,
- - this object has the following attributes:
-    - input: a list of two items:
-        1. output of fastq-count as path to json file
-        2. a fastq file to be sub-sampled
-    - output: a list of one item containing path to output file
-    - params: a list of one item containing the max number of bases
- - a `shell` function exists
-
-This will _not_ work outside of a snakemake context.
-"""
-import json
-from snakemake import shell
-
-
-def subsample(json_path, fastq_path, opath, max_bases):
-    with open(json_path) as handle:
-        bases = json.load(handle)['bases']
-    if max_bases == "" or max_bases is None:
-        frac = 100
-    else:
-        frac = int(max_bases) / float(bases)
-
-    if frac >= 1:
-        cmd = "ln -s {0} {1}".format(fastq_path, opath)
-    else:
-        cmd = "seqtk sample -s100 {0} {1} | gzip -c > {2}".format(fastq_path,
-                                                                  frac,
-                                                                  opath)
-    print("executing")
-    print(cmd)
-    shell(cmd)
-
-
-subsample(snakemake.input[0], snakemake.input[1],
-          snakemake.output[0], snakemake.params[0])
-
-
-
--- a/src/seqtk.sh
+++ b/src/seqtk.sh
+#!/usr/bin/env bash
+
+count_json=${1}
+input_fastq=${2}
+output_fastq=${3}
+max_bases=${4}
+
+
+bases=$(jq '.bases' $count_json)
+frac=$(jq -n "$max_bases / $bases" | sed -e "s:e:E:g")
+echo $frac
+if (( $(echo "$frac > 1" | bc -l) )); then
+    ln -s $input_fastq $output_fastq
+else
+    seqtk sample -s100 $frac $input_fastq | gzip -c > $output_fastq
+fi