snakefile 9.04 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
################## Import libraries ##################

import pandas as pd
import os
import sys
from subprocess import call
import itertools
from snakemake.utils import R


################## Configuration file and PATHS##################

configfile: "config.yaml"

WORKING_DIR         = config["working_dir"]
RESULT_DIR          = config["results_dir"]
DATA_DIR            = config["data_dir"]
GENOME_FASTA_URL    = config["genome_fasta_url"]

WJH58's avatar
WJH58 committed
20
#units = pd.read_table(config["units"], dtype=str).set_index(["bed"], drop=False)
21

WJH58's avatar
WJH58 committed
22
#BED = units.index.get_level_values('bed').unique().tolist()
WJH58's avatar
WJH58 committed
23
units = pd.read_table(config["units"], dtype=str).set_index(["sample"], drop=False)
24
25
26
27
28
29
30

SAMPLES = units.index.get_level_values('sample').unique().tolist()

###############
# Helper Functions
###############
def get_fastq(wildcards):
WJH58's avatar
WJH58 committed
31
    return units.loc[(wildcards.samples), ["fq1", "fq2"]].dropna()
32
33


WJH58's avatar
WJH58 committed
34
35
36
37
38
39
40
41
##############
# Wildcards
##############
wildcard_constraints:
    sample = "[A-Za-z0-9]+"

wildcard_constraints:
    unit = "L[0-9]+"
42
43


WJH58's avatar
WJH58 committed
44
45
################## DESIRED OUTPUT ##################

46
FASTQC          = expand(RESULT_DIR + "fastqc/{samples}_{R}_fastqc.html", samples = SAMPLES, R=['R1', 'R2']),
WJH58's avatar
WJH58 committed
47
48
FORWARD_READS   = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
REVERSE_READS   = expand(WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz", samples = SAMPLES),
49
TRIMMED_FASTQC  = expand(RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html", samples = SAMPLES, direction=['forward', 'reverse']),
WJH58's avatar
WJH58 committed
50
51
MAPPED = expand(WORKING_DIR + "mapped/{samples}.bam", samples = SAMPLES),
UNMAPPED = expand([WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)], samples = SAMPLES),
WJH58's avatar
WJH58 committed
52
MAP_SORTED = expand(WORKING_DIR + "sort/{samples}.sorted.bam", samples = SAMPLES),
WJH58's avatar
WJH58 committed
53
54
SORTED_INDEXED = expand(WORKING_DIR + "index/{samples}.sorted.bam.bai", samples = SAMPLES),
GENOME_INFO = expand(WORKING_DIR + "genome_info/{samples}.genome.info", samples  = SAMPLES),
55
56
GAPPED_PEAK = expand(RESULT_DIR + "peaks/{samples}_peaks.gappedPeak", samples = SAMPLES),
NAME_LOG = expand(RESULT_DIR + "peaks/{samples}.log", samples = SAMPLES)
57
58
59
60
61
62
63

rule all:
    input:
        FASTQC,
        WORKING_DIR + "reference",
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
WJH58's avatar
WJH58 committed
64
65
        WORKING_DIR + "genome.rev.2.bt2",
        FORWARD_READS,
WJH58's avatar
WJH58 committed
66
        REVERSE_READS,
WJH58's avatar
WJH58 committed
67
68
        TRIMMED_FASTQC,
        MAPPED,
WJH58's avatar
WJH58 committed
69
70
        UNMAPPED,
        MAP_SORTED,
WJH58's avatar
WJH58 committed
71
        SORTED_INDEXED,
72
73
74
75
        #GENOME_INFO,
        #GAPPED_PEAK,
        #NAME_LOG

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
    message : "Analysis is complete!"
    shell:""

################## INCLUDE RULES ##################
rule download_genome:
    output:
        WORKING_DIR + "reference",
    shell:
        "wget -O {output} {GENOME_FASTA_URL}"

rule index_genome:
    input:
        WORKING_DIR + "reference",
    output:
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
        WORKING_DIR + "genome.rev.2.bt2"
    message:"Indexing Reference genome"
    params:
        WORKING_DIR + "genome"
    conda:
        "envs/samtools_bowtie.yaml"
    shell:
        "bowtie2-build --threads 10 {input} {params}"

rule trimmomatic:
    input:
WJH58's avatar
WJH58 committed
103
104
105
106
107
108
        reads = get_fastq,
        adapters = config['trimmomatic']["adapters"]
    output:
        forward_reads   = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads   = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
        forwardUnpaired = temp(WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz"),
WJH58's avatar
WJH58 committed
109
        reverseUnpaired = temp(WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz"),
WJH58's avatar
WJH58 committed
110
    log:
WJH58's avatar
WJH58 committed
111
        RESULT_DIR + "logs/trimmomatic/{samples}.log"
WJH58's avatar
WJH58 committed
112
113
114
115
116
117
118
119
120
121
122
123
    params:
        seedMisMatches =            str(config['trimmomatic']['seedMisMatches']),
        palindromeClipTreshold =    str(config['trimmomatic']['palindromeClipTreshold']),
        simpleClipThreshhold =      str(config['trimmomatic']['simpleClipThreshold']),
        LeadMinTrimQual =           str(config['trimmomatic']['LeadMinTrimQual']),
        TrailMinTrimQual =          str(config['trimmomatic']['TrailMinTrimQual']),
        windowSize =                str(config['trimmomatic']['windowSize']),
        avgMinQual =                str(config['trimmomatic']['avgMinQual']),
        minReadLen =                str(config['trimmomatic']['minReadLength']),
        phred = 		            str(config["trimmomatic"]["phred"])
    threads: 10
    conda:
WJH58's avatar
WJH58 committed
124
        "envs/trimmomatic.yaml"
WJH58's avatar
WJH58 committed
125
    shell:
WJH58's avatar
WJH58 committed
126
        """
WJH58's avatar
WJH58 committed
127
        trimmomatic PE \
WJH58's avatar
WJH58 committed
128
129
130
131
132
133
134
135
136
137
        -threads 10 \
        -phred33 \
         {input.reads} \
         {output.forward_reads} {output.forwardUnpaired} {output.reverse_reads} {output.reverseUnpaired} \
         ILLUMINACLIP:{input.adapters}:{params.seedMisMatches}:{params.palindromeClipTreshold}:{params.simpleClipThreshhold} \
         LEADING:3 \
         TRAILING:3 \
         SLIDINGWINDOW:4:15 \
         MINLEN:40 &>{log}
        """
WJH58's avatar
WJH58 committed
138

WJH58's avatar
WJH58 committed
139
140
rule trimmed_fastqc:
    input:
141
142
        forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz"
WJH58's avatar
WJH58 committed
143
    output:
144
        RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html"
WJH58's avatar
WJH58 committed
145
    params:
WJH58's avatar
WJH58 committed
146
        RESULT_DIR + "trimmed_fastqc/"
WJH58's avatar
WJH58 committed
147
148
    conda:
        "envs/fastqc.yaml"
WJH58's avatar
WJH58 committed
149
    log:
150
        RESULT_DIR + "logs/trimmed_fastqc/{samples}_{direction}.fastqc.log"
WJH58's avatar
WJH58 committed
151
    shell:
152
        "fastqc --outdir={params} {input.forward_reads} {input.reverse_reads} &>{log}"
WJH58's avatar
WJH58 committed
153

154
155
rule fastqc:
    input:
156
157
        fwd = expand(DATA_DIR + "sample_{numbers}_R1.fastq.gz", numbers = ['8','12','4_3','4_1']),
        rev = expand(DATA_DIR + "sample_{numbers}_R2.fastq.gz", numbers = ['8','12','4_3','4_1'])
158
    output:
159
        expand(RESULT_DIR + "fastqc/{samples}_{R}_fastqc.html", samples = SAMPLES, R=['R1', 'R2'])
160
161
162
163
164
165
166
167
    log:
        expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
    params:
        RESULT_DIR + "fastqc/"
    conda:
        "envs/fastqc.yaml"
    shell:
        "fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"
WJH58's avatar
WJH58 committed
168

WJH58's avatar
WJH58 committed
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
rule mapping:
    input:
        forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
        forwardUnpaired = WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz",
        reverseUnpaired = WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz",
        index = [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)]
    output:
        mapped = WORKING_DIR + "mapped/{samples}.bam",
        unmapped = [WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)],
    params:
        bowtie          = " ".join(config["bowtie2"]["params"].values()), #take argument separated as a list separated with a space
        index           = WORKING_DIR + "genome",
        unmapped        = WORKING_DIR + "unmapped/{samples}.fq.gz"
    threads: 10
    conda:
WJH58's avatar
WJH58 committed
185
        "envs/samtools_bowtie.yaml"
WJH58's avatar
WJH58 committed
186
187
188
189
    log:
        RESULT_DIR + "logs/bowtie/{samples}.log"
    shell:
        """
190
        bowtie2 {params.bowtie} --threads {threads} -x {params.index} -1 {input.forward_reads} -2 {input.reverse_reads} -U {input.forwardUnpaired},{input.reverseUnpaired} --un-conc-gz {params.unmapped} | samtools view -Sb - > {output.mapped} >&{log}
WJH58's avatar
WJH58 committed
191
        """
WJH58's avatar
WJH58 committed
192
193
194
195
196

rule sort:
    input:
        mapped = WORKING_DIR + "mapped/{samples}.bam"
    output:
197
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
WJH58's avatar
WJH58 committed
198
199
200
201
202
    conda:
        "envs/samtools_bowtie.yaml"
    log:
        RESULT_DIR + "logs/sort/{samples}.log"
    shell:
WJH58's avatar
WJH58 committed
203
        "samtools sort {input} -o {output} &>{log}"
WJH58's avatar
WJH58 committed
204
205
206

rule index:
    input:
207
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
WJH58's avatar
WJH58 committed
208
    output:
209
        map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
WJH58's avatar
WJH58 committed
210
211
212
213
214
    conda:
        "envs/samtools_bowtie.yaml"
    log:
        RESULT_DIR + "logs/index/{samples}.log"
    shell:
WJH58's avatar
WJH58 committed
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
        "samtools index {input} {output} &>{log}"

rule get_genome_info:
    input:
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
    output:
        genome_info = WORKING_DIR + "genome_info/{samples}.genome.info"
    conda:
        "envs/perl.yaml"
    shell:
        """
        samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2 | cut -f 1 > tmp
        samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2,3 | cut -d ':' -f 2 > tmp2
        paste tmp tmp2 > {output}
        rm tmp tmp2
        """

rule peak_calling:
    input:
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
235
        map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
WJH58's avatar
WJH58 committed
236
    output:
237
238
        gapped_peak = RESULT_DIR + "peaks/{samples}_peaks.gappedPeak",
        Name_log = RESULT_DIR + "peaks/{samples}.log"
WJH58's avatar
WJH58 committed
239
240
241
242
243
244
245
246
247
    params:
        genome_info = str(config['hmmratac']['genome.info']),
        output_preflix = "{samples}"
    log:
        RESULT_DIR + "logs/peak_calling/{samples}.log"
    conda:
        "envs/hmmratac.yaml"
    shell:
        """
248
        HMMRATAC -b {input.map_sorted} -i {input.map_sorted_indexed} -g {params.genome_info} -o {params.output_preflix}
WJH58's avatar
WJH58 committed
249
        """