snakefile 4.59 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
################## Import libraries ##################

import pandas as pd
import os
import sys
from subprocess import call
import itertools
from snakemake.utils import R


################## Configuration file and PATHS##################

configfile: "config.yaml"

WORKING_DIR         = config["working_dir"]
RESULT_DIR          = config["results_dir"]
DATA_DIR            = config["data_dir"]
GENOME_FASTA_URL    = config["genome_fasta_url"]

#units = pd.read_table(config["units"], dtype=str).set_index(["bed"], drop=False)

#BED = units.index.get_level_values('bed').unique().tolist()
WJH58's avatar
WJH58 committed
23
units = pd.read_table(config["units"], dtype=str).set_index(["sample"], drop=False)
24
25
26
27
28
29
30

SAMPLES = units.index.get_level_values('sample').unique().tolist()

###############
# Helper Functions
###############
def get_fastq(wildcards):
WJH58's avatar
WJH58 committed
31
    return units.loc[(wildcards.samples), ["fq1", "fq2"]].dropna()
32
33


WJH58's avatar
WJH58 committed
34
35
36
37
38
39
40
41
##############
# Wildcards
##############
wildcard_constraints:
    sample = "[A-Za-z0-9]+"

wildcard_constraints:
    unit = "L[0-9]+"
42
43


WJH58's avatar
WJH58 committed
44
45
46
47
48
################## DESIRED OUTPUT ##################

FASTQC          = expand(RESULT_DIR + "fastqc/{samples}_{R}_2000reads_fastqc.html", samples = SAMPLES, R=['R1', 'R2']),
FORWARD_READS   = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
REVERSE_READS   = expand(WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz", samples = SAMPLES),
49
50
51
52
53
54
55

rule all:
    input:
        FASTQC,
        WORKING_DIR + "reference",
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
WJH58's avatar
WJH58 committed
56
57
58
        WORKING_DIR + "genome.rev.2.bt2",
        FORWARD_READS,
        REVERSE_READS
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
    message : "Analysis is complete!"
    shell:""

################## INCLUDE RULES ##################
rule download_genome:
    output:
        WORKING_DIR + "reference",
    shell:
        "wget -O {output} {GENOME_FASTA_URL}"

rule index_genome:
    input:
        WORKING_DIR + "reference",
    output:
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
        WORKING_DIR + "genome.rev.2.bt2"
    message:"Indexing Reference genome"
    params:
        WORKING_DIR + "genome"
    conda:
        "envs/samtools_bowtie.yaml"
    shell:
        "bowtie2-build --threads 10 {input} {params}"

rule trimmomatic:
    input:
WJH58's avatar
WJH58 committed
86
87
88
89
90
91
        reads = get_fastq,
        adapters = config['trimmomatic']["adapters"]
    output:
        forward_reads   = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads   = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
        forwardUnpaired = temp(WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz"),
WJH58's avatar
WJH58 committed
92
        reverseUnpaired = temp(WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz"),
WJH58's avatar
WJH58 committed
93
    log:
WJH58's avatar
WJH58 committed
94
        RESULT_DIR + "logs/trimmomatic/{samples}.log"
WJH58's avatar
WJH58 committed
95
96
97
98
99
100
101
102
103
104
105
106
    params:
        seedMisMatches =            str(config['trimmomatic']['seedMisMatches']),
        palindromeClipTreshold =    str(config['trimmomatic']['palindromeClipTreshold']),
        simpleClipThreshhold =      str(config['trimmomatic']['simpleClipThreshold']),
        LeadMinTrimQual =           str(config['trimmomatic']['LeadMinTrimQual']),
        TrailMinTrimQual =          str(config['trimmomatic']['TrailMinTrimQual']),
        windowSize =                str(config['trimmomatic']['windowSize']),
        avgMinQual =                str(config['trimmomatic']['avgMinQual']),
        minReadLen =                str(config['trimmomatic']['minReadLength']),
        phred = 		            str(config["trimmomatic"]["phred"])
    threads: 10
    conda:
WJH58's avatar
WJH58 committed
107
        "envs/trimmomatic.yaml"
WJH58's avatar
WJH58 committed
108
    shell:
WJH58's avatar
WJH58 committed
109
110
111
112
113
114
115
116
117
118
119
120
        """
        java -jar trimmomatic-0.32.jar PE \
        -threads 10 \
        -phred33 \
         {input.reads} \
         {output.forward_reads} {output.forwardUnpaired} {output.reverse_reads} {output.reverseUnpaired} \
         ILLUMINACLIP:{input.adapters}:{params.seedMisMatches}:{params.palindromeClipTreshold}:{params.simpleClipThreshhold} \
         LEADING:3 \
         TRAILING:3 \
         SLIDINGWINDOW:4:15 \
         MINLEN:40 &>{log}
        """
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

rule fastqc:
    input:
        fwd = expand(DATA_DIR + "{samples}_R1_2000reads.fq.gz", samples = SAMPLES),
        rev = expand(DATA_DIR + "{samples}_R2_2000reads.fq.gz", samples = SAMPLES)
    output:
        expand(RESULT_DIR + "fastqc/{samples}_{R}_2000reads_fastqc.html", samples = SAMPLES, R=['R1', 'R2'])
    log:
        expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
    params:
        RESULT_DIR + "fastqc/"
    conda:
        "envs/fastqc.yaml"
    shell:
        "fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"