Commit 351a5144 authored by JihedC

clean the Snakefile by including rules in rules folder

parent 8ca11f4f
@@ -77,13 +77,13 @@ rule all:
MAPPED,
UNMAPPED,
MAP_SORTED,
DEDUP,
STATS,
COVERAGE_TRACK,
BIGWIGSUMMARY,
PCAPLOT,
SORTED_INDEXED,
NARROWPEAK,
#DEDUP,
#STATS,
#COVERAGE_TRACK,
#BIGWIGSUMMARY,
#PCAPLOT,
#SORTED_INDEXED,
#NARROWPEAK,
#GENOME_INFO,
#GAPPED_PEAK,
#NAME_LOG
@@ -92,257 +92,6 @@ rule all:
shell:""
################## INCLUDE RULES ##################
rule download_genome:
output:
WORKING_DIR + "reference",
shell:
"wget -O {output} {GENOME_FASTA_URL}"
rule index_genome:
input:
WORKING_DIR + "reference",
output:
[WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
WORKING_DIR + "genome.rev.1.bt2",
WORKING_DIR + "genome.rev.2.bt2"
message:"Indexing Reference genome"
params:
WORKING_DIR + "genome"
conda:
"envs/samtools_bowtie.yaml"
shell:
"bowtie2-build --threads 10 {input} {params}"
rule trimmomatic:
input:
reads = get_fastq,
adapters = config['trimmomatic']["adapters"]
output:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
forwardUnpaired = temp(WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz"),
reverseUnpaired = temp(WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz")
log:
RESULT_DIR + "logs/trimmomatic/{samples}.log"
params:
seedMisMatches = str(config['trimmomatic']['seedMisMatches']),
palindromeClipTreshold = str(config['trimmomatic']['palindromeClipTreshold']),
simpleClipThreshhold = str(config['trimmomatic']['simpleClipThreshold']),
LeadMinTrimQual = str(config['trimmomatic']['LeadMinTrimQual']),
TrailMinTrimQual = str(config['trimmomatic']['TrailMinTrimQual']),
windowSize = str(config['trimmomatic']['windowSize']),
avgMinQual = str(config['trimmomatic']['avgMinQual']),
minReadLen = str(config['trimmomatic']['minReadLength']),
phred = str(config["trimmomatic"]["phred"])
threads: 10
conda:
"envs/trimmomatic.yaml"
shell:
"""
trimmomatic PE \
-threads 10 \
-phred33 \
{input.reads} \
{output.forward_reads} {output.forwardUnpaired} {output.reverse_reads} {output.reverseUnpaired} \
ILLUMINACLIP:{input.adapters}:{params.seedMisMatches}:{params.palindromeClipTreshold}:{params.simpleClipThreshhold} \
LEADING:3 \
TRAILING:3 \
SLIDINGWINDOW:4:15 \
MINLEN:40 &>{log}
"""
rule trimmed_fastqc:
input:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz"
output:
RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html"
params:
RESULT_DIR + "trimmed_fastqc/"
conda:
"envs/fastqc.yaml"
log:
RESULT_DIR + "logs/trimmed_fastqc/{samples}_{direction}.fastqc.log"
shell:
"fastqc --outdir={params} {input.forward_reads} {input.reverse_reads} &>{log}"
rule fastqc:
input:
fwd = expand(DATA_DIR + "sample_{numbers}_R1.fastq.gz", numbers = ['8','12','4_3','4_1']),
rev = expand(DATA_DIR + "sample_{numbers}_R2.fastq.gz", numbers = ['8','12','4_3','4_1'])
output:
expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html", numbers = ['8','12','4_3','4_1'], R=['R1', 'R2'])
log:
expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
params:
RESULT_DIR + "fastqc/"
conda:
"envs/fastqc.yaml"
shell:
"fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"
rule mapping:
input:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
forwardUnpaired = WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz",
reverseUnpaired = WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz",
index = [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)]
output:
mapped = WORKING_DIR + "mapped/{samples}.bam",
unmapped = [WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)],
params:
bowtie = " ".join(config["bowtie2"]["params"].values()), # join the bowtie2 arguments from the config into a single space-separated string
index = WORKING_DIR + "genome",
unmapped = WORKING_DIR + "unmapped/{samples}.fq.gz"
threads: 10
conda:
"envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/bowtie/{samples}.log"
shell:
"""
bowtie2 {params.bowtie} --threads {threads} -x {params.index} -1 {input.forward_reads} -2 {input.reverse_reads} -U {input.forwardUnpaired},{input.reverseUnpaired} --un-conc-gz {params.unmapped} | samtools view -Sb - > {output.mapped} 2>{log}
"""
rule sort:
input:
mapped = WORKING_DIR + "mapped/{samples}.bam"
output:
sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
bai = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
conda:
"envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/sort/{samples}.log"
shell:
"""
samtools sort {input} -o {output.sorted} &>{log}
samtools index {output.sorted}
"""
rule deduplication:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
dedup = WORKING_DIR + "dedup/{samples}.dedup.bam",
stats = WORKING_DIR + "dedup/{samples}.dedup.stats"
conda:
"envs/picard.yaml"
log:
RESULT_DIR + "logs/dedup/{samples}.log"
shell:
"""
picard MarkDuplicates -I {input} -O {output.dedup} -M {output.stats} 2>{log}
"""
# rule index:
# input:
# map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
# output:
# map_sorted_indexed = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
# conda:
# "envs/samtools_bowtie.yaml"
# log:
# RESULT_DIR + "logs/index/{samples}.log"
# shell:
# "samtools index {input} {output} &>{log}"
rule get_genome_info:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
genome_info = WORKING_DIR + "genome_info/{samples}.genome.info"
conda:
"envs/perl.yaml"
shell:
"""
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2 | cut -f 1 > tmp
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2,3 | cut -d ':' -f 2 > tmp2
paste tmp tmp2 > {output}
rm tmp tmp2
"""
rule peak_calling:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
output:
gapped_peak = RESULT_DIR + "peaks/{samples}_peaks.gappedPeak",
Name_log = RESULT_DIR + "peaks/{samples}.log"
params:
genome_info = str(config['hmmratac']['genome.info']),
output_preflix = "{samples}"
log:
RESULT_DIR + "logs/peak_calling/{samples}.log"
conda:
"envs/hmmratac.yaml"
shell:
"""
HMMRATAC -b {input.map_sorted} -i {input.map_sorted_indexed} -g {params.genome_info} -o {params.output_preflix}
"""
rule call_narrow_peaks:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
narrowPeak = RESULT_DIR + "macs2/{samples}_peaks.narrowPeak"
params:
name = "{samples}",
format = str(config['macs2']['format']),
genomesize = str(config['macs2']['genomesize']),
outdir = str(config['macs2']['outdir'])
log:
RESULT_DIR + "logs/macs2/{samples}_peaks.narrowPeak.log"
conda:
"envs/macs2.yaml"
shell:
"""
macs2 callpeak -t {input} {params.format} {params.genomesize} --name {params.name} --nomodel --bdg -q 0.05 --outdir {params.outdir}/ 2>{log}
"""
rule bamCoverage:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
coverage_track = RESULT_DIR + "bamCoverage/{samples}.bw"
params:
effectiveGenomeSize = str(config['bamCoverage']['effectiveGenomeSize']),
normalization = str(config['bamCoverage']['normalization']),
binSize = str(config['bamCoverage']['binSize'])
log:
RESULT_DIR + "logs/bamCoverage/{samples}.bw.log"
conda:
"envs/deeptools.yaml"
shell:
"bamCoverage -b {input} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalization} --ignoreDuplicates -o {output} --binSize {params.binSize}"
#bdg: create bedgraph output files
#format=BAMPE: only use properly-paired read alignments
# rule multibigwigSummary needs to be expanded to use more than one bigwig file
# Right now it only uses one file, which is not useful for comparison
rule multibigwigSummary:
input:
lambda wildcards: expand(RESULT_DIR + "bamCoverage/{sample}.bw", sample = SAMPLES)
output:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
log:
RESULT_DIR + "logs/bigwigsummary/multiBigwigSummary.log"
conda:
"envs/deeptools.yaml"
shell:
"multiBigwigSummary bins -b {input} -o {output} "
#2>{log}
# PlotPCA does not work with a bigwigsummary made of only one sample
rule PCA:
input:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
output:
PCAplot = RESULT_DIR + "PCA/PCA_PLOT.pdf"
log:
RESULT_DIR + "logs/bigwigsummary/PCA_PLOT.log"
conda:
"envs/deeptools.yaml"
shell:
"plotPCA -in {input} -o {output} 2>{log}"
include: "rules/external_data.smk"
include: "rules/pre-processing.smk"
include: "rules/macs2.smk"
chr1 195471971
chr2 182113224
chr3 160039680
chr4 156508116
chr5 151834684
chr6 149736546
chr7 145441459
chr8 129401213
chr9 124595110
chr10 130694993
chr11 122082543
chr12 120129022
chr13 120421639
chr14 124902244
chr15 104043685
chr16 98207768
chr17 94987271
chr18 90702639
chr19 61431566
chrX 171031299
chrY 91744698
chrM 16299
GL456210.1 169725
GL456211.1 241735
GL456212.1 153618
GL456213.1 39340
GL456216.1 66673
GL456219.1 175968
GL456221.1 206961
GL456233.1 336933
GL456239.1 40056
GL456350.1 227966
GL456354.1 195993
GL456359.1 22974
GL456360.1 31704
GL456366.1 47073
GL456367.1 42057
GL456368.1 20208
GL456370.1 26764
GL456372.1 28664
GL456378.1 31602
GL456379.1 72385
GL456381.1 25871
GL456382.1 23158
GL456383.1 38659
GL456385.1 35240
GL456387.1 24685
GL456389.1 28772
GL456390.1 24668
GL456392.1 23629
GL456393.1 55711
GL456394.1 24323
GL456396.1 21240
JH584292.1 14945
JH584293.1 207968
JH584294.1 191905
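# The two-column table above lists contig names and lengths for the mouse GRCm38/mm10 assembly;
# it is presumably the genome.info file referenced by config['hmmratac']['genome.info'] and
# matches the output format of rule get_genome_info.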
rule bamCoverage:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
coverage_track = RESULT_DIR + "bamCoverage/{samples}.bw"
params:
effectiveGenomeSize = str(config['bamCoverage']['effectiveGenomeSize']),
normalization = str(config['bamCoverage']['normalization']),
binSize = str(config['bamCoverage']['binSize'])
log:
RESULT_DIR + "logs/bamCoverage/{samples}.bw.log"
conda:
"../envs/deeptools.yaml"
shell:
"bamCoverage -b {input} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalization} --ignoreDuplicates -o {output} --binSize {params.binSize}"
#bdg: create bedgraph output files
#format=BAMPE: only use properly-paired read alignments
# rule multibigwigSummary needs to be expanded to use more than one bigwig file
# Right now it only uses one file, which is not useful for comparison
rule multibigwigSummary:
input:
lambda wildcards: expand(RESULT_DIR + "bamCoverage/{sample}.bw", sample = SAMPLES)
output:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
log:
RESULT_DIR + "logs/bigwigsummary/multiBigwigSummary.log"
conda:
"../envs/deeptools.yaml"
shell:
"multiBigwigSummary bins -b {input} -o {output} "
#2>{log}
# PlotPCA does not work with a bigwigsummary made of only one sample
rule PCA:
input:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
output:
PCAplot = RESULT_DIR + "PCA/PCA_PLOT.pdf"
log:
RESULT_DIR + "logs/bigwigsummary/PCA_PLOT.log"
conda:
"../envs/deeptools.yaml"
shell:
"plotPCA -in {input} -o {output} 2>{log}"
rule download_genome:
output:
WORKING_DIR + "reference",
shell:
"wget -O {output} {GENOME_FASTA_URL}"
rule index_genome:
input:
WORKING_DIR + "reference",
output:
[WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
WORKING_DIR + "genome.rev.1.bt2",
WORKING_DIR + "genome.rev.2.bt2"
message:"Indexing Reference genome"
params:
WORKING_DIR + "genome"
conda:
"../envs/samtools_bowtie.yaml"
shell:
"bowtie2-build --threads 10 {input} {params}"
rule get_genome_info:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
genome_info = WORKING_DIR + "genome_info/{samples}.genome.info"
conda:
"../envs/perl.yaml"
shell:
"""
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2 | cut -f 1 > tmp
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2,3 | cut -d ':' -f 2 > tmp2
paste tmp tmp2 > {output}
rm tmp tmp2
"""
rule peak_calling:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
output:
gapped_peak = RESULT_DIR + "peaks/{samples}_peaks.gappedPeak",
Name_log = RESULT_DIR + "peaks/{samples}.log"
params:
genome_info = str(config['hmmratac']['genome.info']),
output_preflix = "{samples}"
log:
RESULT_DIR + "logs/peak_calling/{samples}.log"
conda:
"../envs/hmmratac.yaml"
shell:
"""
HMMRATAC -b {input.map_sorted} -i {input.map_sorted_indexed} -g {params.genome_info} -o {params.output_preflix}
"""
rule call_narrow_peaks:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
narrowPeak = RESULT_DIR + "macs2/{samples}_peaks.narrowPeak"
params:
name = "{samples}",
format = str(config['macs2']['format']),
genomesize = str(config['macs2']['genomesize']),
outdir = str(config['macs2']['outdir'])
log:
RESULT_DIR + "logs/macs2/{samples}_peaks.narrowPeak.log"
conda:
"../envs/macs2.yaml"
shell:
"""
macs2 callpeak -t {input} {params.format} {params.genomesize} --name {params.name} --nomodel --bdg -q 0.05 --outdir {params.outdir}/ 2>{log}
"""
rule trimmomatic:
input:
reads = get_fastq,
adapters = config['trimmomatic']["adapters"]
output:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
forwardUnpaired = temp(WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz"),
reverseUnpaired = temp(WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz")
log:
RESULT_DIR + "logs/trimmomatic/{samples}.log"
params:
seedMisMatches = str(config['trimmomatic']['seedMisMatches']),
palindromeClipTreshold = str(config['trimmomatic']['palindromeClipTreshold']),
simpleClipThreshhold = str(config['trimmomatic']['simpleClipThreshold']),
LeadMinTrimQual = str(config['trimmomatic']['LeadMinTrimQual']),
TrailMinTrimQual = str(config['trimmomatic']['TrailMinTrimQual']),
windowSize = str(config['trimmomatic']['windowSize']),
avgMinQual = str(config['trimmomatic']['avgMinQual']),
minReadLen = str(config['trimmomatic']['minReadLength']),
phred = str(config["trimmomatic"]["phred"])
threads: 10
conda:
"../envs/trimmomatic.yaml"
shell:
"""
trimmomatic PE \
-threads 10 \
-phred33 \
{input.reads} \
{output.forward_reads} {output.forwardUnpaired} {output.reverse_reads} {output.reverseUnpaired} \
ILLUMINACLIP:{input.adapters}:{params.seedMisMatches}:{params.palindromeClipTreshold}:{params.simpleClipThreshhold} \
LEADING:3 \
TRAILING:3 \
SLIDINGWINDOW:4:15 \
MINLEN:40 &>{log}
"""
rule trimmed_fastqc:
input:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz"
output:
RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html"
params:
RESULT_DIR + "trimmed_fastqc/"
conda:
"../envs/fastqc.yaml"
log:
RESULT_DIR + "logs/trimmed_fastqc/{samples}_{direction}.fastqc.log"
shell:
"fastqc --outdir={params} {input.forward_reads} {input.reverse_reads} &>{log}"
rule fastqc:
input:
fwd = expand(DATA_DIR + "sample_{numbers}_R1.fastq.gz", numbers = ['8','12','4_3','4_1']),
rev = expand(DATA_DIR + "sample_{numbers}_R2.fastq.gz", numbers = ['8','12','4_3','4_1'])
output:
expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html", numbers = ['8','12','4_3','4_1'], R=['R1', 'R2'])
log:
expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
params:
RESULT_DIR + "fastqc/"
conda:
"../envs/fastqc.yaml"
shell:
"fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"
rule mapping:
input:
forward_reads = WORKING_DIR + "trimmed/{samples}_forward.fastq.gz",
reverse_reads = WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz",
forwardUnpaired = WORKING_DIR + "trimmed/{samples}_forward_unpaired.fastq.gz",
reverseUnpaired = WORKING_DIR + "trimmed/{samples}_reverse_unpaired.fastq.gz",
index = [WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)]
output:
mapped = WORKING_DIR + "mapped/{samples}.bam",
unmapped = [WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)],
params:
bowtie = " ".join(config["bowtie2"]["params"].values()), # join the bowtie2 arguments from the config into a single space-separated string
index = WORKING_DIR + "genome",
unmapped = WORKING_DIR + "unmapped/{samples}.fq.gz"
threads: 10
conda:
"../envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/bowtie/{samples}.log"
shell:
"""
bowtie2 {params.bowtie} --threads {threads} \
-x {params.index} -1 {input.forward_reads} \
-2 {input.reverse_reads} \
-U {input.forwardUnpaired},{input.reverseUnpaired} \
--un-conc-gz {params.unmapped} --no-mixed | samtools view -Sb - > {output.mapped} 2>{log}
"""
rule sort:
input:
mapped = WORKING_DIR + "mapped/{samples}.bam"
output:
sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
bai = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
conda:
"../envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/sort/{samples}.log"
shell:
"""
samtools sort {input} -o {output.sorted} &>{log}
samtools index {output.sorted}
"""
rule deduplication:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
dedup = WORKING_DIR + "dedup/{samples}.dedup.bam",
stats = WORKING_DIR + "dedup/{samples}.dedup.stats"
conda:
"../envs/picard.yaml"
log:
RESULT_DIR + "logs/dedup/{samples}.log"
shell:
"""
picard MarkDuplicates -I {input} -O {output.dedup} -M {output.stats} 2>{log}
"""