Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Chouaref
Snakemake_ATAC_2020
Commits
bab1a3b4
Unverified
Commit
bab1a3b4
authored
Dec 23, 2020
by
Jihed Chouaref
Committed by
GitHub
Dec 23, 2020
Browse files
Update and rename snakefile to Snakefile
parent
d3011c41
Changes
1
Hide whitespace changes
Inline
Side-by-side
ATAC-seq_pipeline/
s
nakefile
→
ATAC-seq_pipeline/
S
nakefile
View file @
bab1a3b4
...
...
@@ -42,24 +42,27 @@ wildcard_constraints:
################## DESIRED OUTPUT ##################
FASTQC = expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html",numbers = ['8','12','4_3','4_1'], R=['R1', 'R2']),
FORWARD_READS = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
REVERSE_READS = expand(WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz", samples = SAMPLES),
TRIMMED_FASTQC = expand(RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html", samples = SAMPLES, direction=['forward', 'reverse']),
MAPPED = expand(WORKING_DIR + "mapped/{samples}.bam", samples = SAMPLES),
UNMAPPED = expand([WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)], samples = SAMPLES),
MAP_SORTED = expand(WORKING_DIR + "sort/{samples}.sorted.bam", samples = SAMPLES),
DEDUP = expand(WORKING_DIR + "dedup/{samples}.dedup.bam", samples = SAMPLES),
STATS = expand(WORKING_DIR + "dedup/{samples}.dedup.stats", samples = SAMPLES),
SORTED_INDEXED = expand(WORKING_DIR + "index/{samples}.sorted.bam.bai", samples = SAMPLES),
GENOME_INFO = expand(WORKING_DIR + "genome_info/{samples}.genome.info", samples = SAMPLES),
GAPPED_PEAK = expand(RESULT_DIR + "peaks/{samples}_peaks.gappedPeak", samples = SAMPLES),
NAME_LOG = expand(RESULT_DIR + "peaks/{samples}.log", samples = SAMPLES),
NARROWPEAK = expand(RESULT_DIR + "macs2/{samples}_peaks.narrowPeak", samples = SAMPLES),
COVERAGE_TRACK = expand(WORKING_DIR + "bamCoverage/{samples}.bw", samples = SAMPLES),
BIGWIGSUMMARY = expand(WORKING_DIR + "bigwigsummary/{samples}.npz", samples = SAMPLES),
PCAPLOT = expand(RESULT_DIR + "PCA/{samples}_pca.png", samples = SAMPLES)
# Here we define the outputs of rules we want the pipeline to produce.
# The varialble defined here is then used in the `rule all`.
# Fastqc rule need to be modified to accept any sample
FASTQC = expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html",numbers = ['8','12','4_3','4_1'], R=['R1', 'R2']),
FORWARD_READS = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
REVERSE_READS = expand(WORKING_DIR + "trimmed/{samples}_reverse.fastq.gz", samples = SAMPLES),
TRIMMED_FASTQC = expand(RESULT_DIR + "trimmed_fastqc/{samples}_{direction}_fastqc.html", samples = SAMPLES, direction=['forward', 'reverse']),
MAPPED = expand(WORKING_DIR + "mapped/{samples}.bam", samples = SAMPLES),
UNMAPPED = expand([WORKING_DIR + "unmapped/{samples}.fq." + str(i) +".gz" for i in range(1,2)], samples = SAMPLES),
MAP_SORTED = expand(WORKING_DIR + "sort/{samples}.sorted.bam", samples = SAMPLES),
DEDUP = expand(WORKING_DIR + "dedup/{samples}.dedup.bam", samples = SAMPLES),
STATS = expand(WORKING_DIR + "dedup/{samples}.dedup.stats", samples = SAMPLES),
SORTED_INDEXED = expand(WORKING_DIR + "sort/{samples}.sorted.bam.bai", samples = SAMPLES),
GENOME_INFO = expand(WORKING_DIR + "genome_info/{samples}.genome.info", samples = SAMPLES),
GAPPED_PEAK = expand(RESULT_DIR + "peaks/{samples}_peaks.gappedPeak", samples = SAMPLES),
NAME_LOG = expand(RESULT_DIR + "peaks/{samples}.log", samples = SAMPLES),
NARROWPEAK = expand(RESULT_DIR + "macs2/{samples}_peaks.narrowPeak", samples = SAMPLES),
COVERAGE_TRACK = expand(RESULT_DIR + "bamCoverage/{samples}.bw", samples = SAMPLES),
BIGWIGSUMMARY = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz",
PCAPLOT = RESULT_DIR + "PCA/PCA_PLOT.pdf"
rule all:
input:
...
...
@@ -79,8 +82,8 @@ rule all:
COVERAGE_TRACK,
BIGWIGSUMMARY,
PCAPLOT,
#
SORTED_INDEXED,
#
NARROWPEAK,
SORTED_INDEXED,
NARROWPEAK,
#GENOME_INFO,
#GAPPED_PEAK,
#NAME_LOG
...
...
@@ -204,15 +207,19 @@ rule mapping:
rule sort:
input:
mapped = WORKING_DIR + "mapped/{samples}.bam"
mapped
= WORKING_DIR + "mapped/{samples}.bam"
output:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
bai = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
conda:
"envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/sort/{samples}.log"
shell:
"samtools sort {input} -o {output} &>{log}"
"""
samtools sort {input} -o {output.sorted} &>{log}
samtools index {output.sorted}
"""
rule deduplication:
input:
...
...
@@ -229,17 +236,17 @@ rule deduplication:
picard MarkDuplicates -I {input} -O {output.dedup} -M {output.stats} 2>{log}
"""
rule index:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
map_sorted_indexed = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
conda:
"envs/samtools_bowtie.yaml"
log:
RESULT_DIR + "logs/index/{samples}.log"
shell:
"samtools index {input} {output} &>{log}"
# #
rule index:
#
#
input:
#
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
#
output:
#
map_sorted_indexed = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
#
conda:
#
"envs/samtools_bowtie.yaml"
#
log:
#
RESULT_DIR + "logs/index/{samples}.log"
#
shell:
#
"samtools index {input} {output} &>{log}"
rule get_genome_info:
input:
...
...
@@ -298,39 +305,44 @@ rule bamCoverage:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
coverage_track =
WORKING
_DIR + "bamCoverage/{samples}.bw"
coverage_track =
RESULT
_DIR + "bamCoverage/{samples}.bw"
params:
effectiveGenomeSize = str(config['bamCoverage']['effectiveGenomeSize']),
normalization = str(config['bamCoverage']['normalization'])
effectiveGenomeSize = str(config['bamCoverage']['effectiveGenomeSize']),
normalization = str(config['bamCoverage']['normalization']),
binSize = str(config['bamCoverage']['binSize'])
log:
RESULT_DIR + "logs/bamCoverage/{samples}.bw.log"
conda:
"envs/deeptools.yaml"
shell:
"bamCoverage -b {input} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalization} --ignoreDuplicates -o {output}"
"bamCoverage -b {input} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalization} --ignoreDuplicates -o {output}
--binSize {params.binSize}
"
#bdg: create bedgraph output files
#format=BAMPE: only use properly-paired read alignments
# rule multibigwigSummary need to be expanded to use more than 1 bigwig file
# Right now the file only use 1 file, which is not useful for a comparison
rule multibigwigSummary:
input:
coverage_track = WORKING
_DIR + "bamCoverage/{sample
s
}.bw"
lambda wildcards: expand(RESULT
_DIR + "bamCoverage/{sample}.bw"
, sample = SAMPLES)
output:
bigwigsummary =
WORKING
_DIR + "bigwigsummary/
{samples}
.npz"
bigwigsummary =
RESULT
_DIR + "bigwigsummary/
multiBigwigSummary
.npz"
log:
RESULT_DIR + "logs/bigwigsummary/
{samples}.bw
.log"
RESULT_DIR + "logs/bigwigsummary/
multiBigwigSummary
.log"
conda:
"envs/deeptools.yaml"
shell:
"multiBigwigSummary bins -b {input} -o {output}"
"multiBigwigSummary bins -b {input} -o {output} "
#2>{log}
# PlotPCA does not work with a bigwigsummary made of only one sample
rule PCA:
input:
bigwigsummary =
WORKING
_DIR + "bigwigsummary/
{samples}
.npz"
bigwigsummary =
RESULT
_DIR + "bigwigsummary/
multiBigwigSummary
.npz"
output:
PCAplot = RESULT_DIR + "PCA/
{samples}_pca.png
"
PCAplot = RESULT_DIR + "PCA/
PCA_PLOT.pdf
"
log:
RESULT_DIR + "logs/bigwigsummary/
{samples}_pca.png
.log"
RESULT_DIR + "logs/bigwigsummary/
PCA_PLOT
.log"
conda:
"envs/deeptools.yaml"
shell:
"plotPCA -in {input} -o {output}"
"plotPCA -in {input} -o {output}
2>{log}
"
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment