snakefile 9.93 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
################## Import libraries ##################

import pandas as pd
import os
import sys
from subprocess import call
import itertools
from snakemake.utils import R


################## Configuration file and PATHS##################

configfile: "config.yaml"

WORKING_DIR         = config["working_dir"]
RESULT_DIR          = config["results_dir"]
DATA_DIR            = config["data_dir"]
GENOME_FASTA_URL    = config["genome_fasta_url"]

WJH58's avatar
WJH58 committed
20
#units = pd.read_table(config["units"], dtype=str).set_index(["bed"], drop=False)
21

WJH58's avatar
WJH58 committed
22
#BED = units.index.get_level_values('bed').unique().tolist()
WJH58's avatar
WJH58 committed
23
units = pd.read_table(config["units"], dtype=str).set_index(["sample"], drop=False)
24
25
26
27
28
29
30

SAMPLES = units.index.get_level_values('sample').unique().tolist()

###############
# Helper Functions
###############
def get_fastq(wildcards):
WJH58's avatar
WJH58 committed
31
    return units.loc[(wildcards.samples), ["fq1", "fq2"]].dropna()
32
33


WJH58's avatar
WJH58 committed
34
35
36
37
38
39
40
41
##############
# Wildcards
##############
wildcard_constraints:
    sample = "[A-Za-z0-9]+"

wildcard_constraints:
    unit = "L[0-9]+"
42
43


WJH58's avatar
WJH58 committed
44
45
################## DESIRED OUTPUT ##################

WJH58's avatar
WJH58 committed
46
FASTQC          = expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html",numbers = ['8','12','4_3','4_1'], R=['R1', 'R2']),
WJH58's avatar
WJH58 committed
47
48
FORWARD_READS   = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
REVERSE_READS   = expand(WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz", samples = SAMPLES),
49
TRIMMED_FASTQC  = expand(RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html", samples = SAMPLES, direction=['forward', 'reverse']),
WJH58's avatar
WJH58 committed
50
51
MAPPED = expand(WORKING_DIR + "mapped/{samples}.bam", samples = SAMPLES),
UNMAPPED = expand([WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)], samples = SAMPLES),
WJH58's avatar
WJH58 committed
52
MAP_SORTED = expand(WORKING_DIR + "sort/{samples}.sorted.bam", samples = SAMPLES),
WJH58's avatar
WJH58 committed
53
54
SORTED_INDEXED = expand(WORKING_DIR + "index/{samples}.sorted.bam.bai", samples = SAMPLES),
GENOME_INFO = expand(WORKING_DIR + "genome_info/{samples}.genome.info", samples  = SAMPLES),
55
GAPPED_PEAK = expand(RESULT_DIR + "peaks/{samples}_peaks.gappedPeak", samples = SAMPLES),
WJH58's avatar
WJH58 committed
56
57
NAME_LOG = expand(RESULT_DIR + "peaks/{samples}.log", samples = SAMPLES),
NARROWPEAK = expand(RESULT_DIR + "macs3/{samples}_peaks.narrowPeak", samples = SAMPLES)
58
59
60
61
62
63
64

rule all:
    input:
        FASTQC,
        WORKING_DIR + "reference",
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
WJH58's avatar
WJH58 committed
65
66
        WORKING_DIR + "genome.rev.2.bt2",
        FORWARD_READS,
WJH58's avatar
WJH58 committed
67
        REVERSE_READS,
WJH58's avatar
WJH58 committed
68
69
        TRIMMED_FASTQC,
        MAPPED,
WJH58's avatar
WJH58 committed
70
71
        UNMAPPED,
        MAP_SORTED,
WJH58's avatar
WJH58 committed
72
        SORTED_INDEXED,
WJH58's avatar
WJH58 committed
73
        NARROWPEAK,
74
75
76
77
        #GENOME_INFO,
        #GAPPED_PEAK,
        #NAME_LOG

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
    message : "Analysis is complete!"
    shell:""

################## INCLUDE RULES ##################
rule download_genome:
    output:
        WORKING_DIR + "reference",
    shell:
        "wget -O {output} {GENOME_FASTA_URL}"

rule index_genome:
    input:
        WORKING_DIR + "reference",
    output:
        [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
        WORKING_DIR + "genome.rev.1.bt2",
        WORKING_DIR + "genome.rev.2.bt2"
    message:"Indexing Reference genome"
    params:
        WORKING_DIR + "genome"
    conda:
        "envs/samtools_bowtie.yaml"
    shell:
        "bowtie2-build --threads 10 {input} {params}"

rule trimmomatic:
    input:
WJH58's avatar
WJH58 committed
105
106
107
108
109
110
        reads = get_fastq,
        adapters = config['trimmomatic']["adapters"]
    output:
        forward_reads   = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads   = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
        forwardUnpaired = temp(WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz"),
WJH58's avatar
WJH58 committed
111
        reverseUnpaired = temp(WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz")
WJH58's avatar
WJH58 committed
112
    log:
WJH58's avatar
WJH58 committed
113
        RESULT_DIR + "logs/trimmomatic/{samples}.log"
WJH58's avatar
WJH58 committed
114
115
116
117
118
119
120
121
122
123
124
125
    params:
        seedMisMatches =            str(config['trimmomatic']['seedMisMatches']),
        palindromeClipTreshold =    str(config['trimmomatic']['palindromeClipTreshold']),
        simpleClipThreshhold =      str(config['trimmomatic']['simpleClipThreshold']),
        LeadMinTrimQual =           str(config['trimmomatic']['LeadMinTrimQual']),
        TrailMinTrimQual =          str(config['trimmomatic']['TrailMinTrimQual']),
        windowSize =                str(config['trimmomatic']['windowSize']),
        avgMinQual =                str(config['trimmomatic']['avgMinQual']),
        minReadLen =                str(config['trimmomatic']['minReadLength']),
        phred = 		            str(config["trimmomatic"]["phred"])
    threads: 10
    conda:
WJH58's avatar
WJH58 committed
126
        "envs/trimmomatic.yaml"
WJH58's avatar
WJH58 committed
127
    shell:
WJH58's avatar
WJH58 committed
128
        """
WJH58's avatar
WJH58 committed
129
        trimmomatic PE \
WJH58's avatar
WJH58 committed
130
131
132
133
134
135
136
137
138
139
        -threads 10 \
        -phred33 \
         {input.reads} \
         {output.forward_reads} {output.forwardUnpaired} {output.reverse_reads} {output.reverseUnpaired} \
         ILLUMINACLIP:{input.adapters}:{params.seedMisMatches}:{params.palindromeClipTreshold}:{params.simpleClipThreshhold} \
         LEADING:3 \
         TRAILING:3 \
         SLIDINGWINDOW:4:15 \
         MINLEN:40 &>{log}
        """
WJH58's avatar
WJH58 committed
140

WJH58's avatar
WJH58 committed
141
142
rule trimmed_fastqc:
    input:
143
144
        forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz"
WJH58's avatar
WJH58 committed
145
    output:
146
        RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html"
WJH58's avatar
WJH58 committed
147
    params:
WJH58's avatar
WJH58 committed
148
        RESULT_DIR + "trimmed_fastqc/"
WJH58's avatar
WJH58 committed
149
150
    conda:
        "envs/fastqc.yaml"
WJH58's avatar
WJH58 committed
151
    log:
152
        RESULT_DIR + "logs/trimmed_fastqc/{samples}_{direction}.fastqc.log"
WJH58's avatar
WJH58 committed
153
    shell:
154
        "fastqc --outdir={params} {input.forward_reads} {input.reverse_reads} &>{log}"
WJH58's avatar
WJH58 committed
155

156
157
rule fastqc:
    input:
158
159
        fwd = expand(DATA_DIR + "sample_{numbers}_R1.fastq.gz", numbers = ['8','12','4_3','4_1']),
        rev = expand(DATA_DIR + "sample_{numbers}_R2.fastq.gz", numbers = ['8','12','4_3','4_1'])
160
    output:
WJH58's avatar
WJH58 committed
161
        expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html", numbers = ['8','12','4_3','4_1'], R=['R1', 'R2'])
162
163
164
165
166
167
168
169
    log:
        expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
    params:
        RESULT_DIR + "fastqc/"
    conda:
        "envs/fastqc.yaml"
    shell:
        "fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"
WJH58's avatar
WJH58 committed
170

WJH58's avatar
WJH58 committed
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
rule mapping:
    input:
        forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
        reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
        forwardUnpaired = WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz",
        reverseUnpaired = WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz",
        index = [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)]
    output:
        mapped = WORKING_DIR + "mapped/{samples}.bam",
        unmapped = [WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)],
    params:
        bowtie          = " ".join(config["bowtie2"]["params"].values()), #take argument separated as a list separated with a space
        index           = WORKING_DIR + "genome",
        unmapped        = WORKING_DIR + "unmapped/{samples}.fq.gz"
    threads: 10
    conda:
WJH58's avatar
WJH58 committed
187
        "envs/samtools_bowtie.yaml"
WJH58's avatar
WJH58 committed
188
189
190
191
    log:
        RESULT_DIR + "logs/bowtie/{samples}.log"
    shell:
        """
WJH58's avatar
WJH58 committed
192
        bowtie2 {params.bowtie} --threads {threads} -x {params.index} -1 {input.forward_reads} -2 {input.reverse_reads} -U {input.forwardUnpaired},{input.reverseUnpaired} --un-conc-gz {params.unmapped} | samtools view -Sb - > {output.mapped} 2>{log}
WJH58's avatar
WJH58 committed
193
        """
WJH58's avatar
WJH58 committed
194
195
196
197
198

rule sort:
    input:
        mapped = WORKING_DIR + "mapped/{samples}.bam"
    output:
199
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
WJH58's avatar
WJH58 committed
200
201
202
203
204
    conda:
        "envs/samtools_bowtie.yaml"
    log:
        RESULT_DIR + "logs/sort/{samples}.log"
    shell:
WJH58's avatar
WJH58 committed
205
        "samtools sort {input} -o {output} &>{log}"
WJH58's avatar
WJH58 committed
206
207
208

rule index:
    input:
209
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
WJH58's avatar
WJH58 committed
210
    output:
WJH58's avatar
WJH58 committed
211
        map_sorted_indexed = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
WJH58's avatar
WJH58 committed
212
213
214
215
216
    conda:
        "envs/samtools_bowtie.yaml"
    log:
        RESULT_DIR + "logs/index/{samples}.log"
    shell:
WJH58's avatar
WJH58 committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
        "samtools index {input} {output} &>{log}"

rule get_genome_info:
    input:
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
    output:
        genome_info = WORKING_DIR + "genome_info/{samples}.genome.info"
    conda:
        "envs/perl.yaml"
    shell:
        """
        samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2 | cut -f 1 > tmp
        samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2,3 | cut -d ':' -f 2 > tmp2
        paste tmp tmp2 > {output}
        rm tmp tmp2
        """

rule peak_calling:
    input:
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
237
        map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
WJH58's avatar
WJH58 committed
238
    output:
239
240
        gapped_peak = RESULT_DIR + "peaks/{samples}_peaks.gappedPeak",
        Name_log = RESULT_DIR + "peaks/{samples}.log"
WJH58's avatar
WJH58 committed
241
242
243
244
245
246
247
248
249
    params:
        genome_info = str(config['hmmratac']['genome.info']),
        output_preflix = "{samples}"
    log:
        RESULT_DIR + "logs/peak_calling/{samples}.log"
    conda:
        "envs/hmmratac.yaml"
    shell:
        """
250
        HMMRATAC -b {input.map_sorted} -i {input.map_sorted_indexed} -g {params.genome_info} -o {params.output_preflix}
WJH58's avatar
WJH58 committed
251
        """
WJH58's avatar
WJH58 committed
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

rule call_narrow_peaks:
    input:
        map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
    output:
        narrowPeak = RESULT_DIR + "macs3/{samples}_peaks.narrowPeak"
    params:
        name = "{samples}",
        format = str(config['macs2']['format']),
        genomesize = str(config['macs2']['genomesize']),
        outdir = str(config['macs2']['outdir'])
    log:
        RESULT_DIR + "logs/macs3/{samples}_peaks.narrowPeak.log"
    conda:
        "envs/macs2.yaml"
    shell:
        """
        macs3 callpeak -t {input} {params.format} {params.genomesize} --name {params.name} --nomodel --bdg -q 0.05 --outdir {params.outdir}/ 2>{log}
        """
#bdg: create bedgraph output files
#format=BAMPE: only use properly-paired read alignments