Commit 76b562b4 authored by JihedC's avatar JihedC
Browse files

add bamcoverage rule

parent 2d67f3da
......@@ -90,22 +90,9 @@ wildcard_constraints:
##############
FASTP = expand(WORKING_DIR + "trimmed/" + "{sample}_{read}_trimmed.fq.gz", sample=SAMPLES, read={"R1", "R2"})
BOWTIE2 = expand(WORKING_DIR + "mapped/{sample}.bam", sample= SAMPLES)
FASTQC = expand(RESULT_DIR + "fastqc/{sample}.fastqc.html", sample = SAMPLES)
FASTQC_REPORTS = expand(RESULT_DIR + "fastqc/{sample}.fastqc.zip", sample=SAMPLES)
BAM_INDEX = expand(RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam.bai", sample=SAMPLES)
BIGWIG = expand(RESULT_DIR + "bigwig/{sample}.bw", sample=SAMPLES)
BED_NARROW = expand(RESULT_DIR + "bed/{sample}_peaks.narrowPeak", sample=SAMPLES)
MULTIBAMSUMMARY = RESULT_DIR + "multiBamSummary/MATRIX.npz"
PLOTCORRELATION = RESULT_DIR + "plotCorrelation/MATRIX.png"
COMPUTEMATRIX = expand(RESULT_DIR + "computematrix/{sample}.{type}.gz", sample=SAMPLES, type={"TSS", "scale-regions"})
HEATMAP = expand(RESULT_DIR + "heatmap/{sample}.{type}.pdf", sample=SAMPLES, type={"TSS", "scale-regions"})
PLOTFINGERPRINT = RESULT_DIR + "plotFingerprint/Fingerplot.pdf"
PLOTPROFILE_PDF = expand(RESULT_DIR + "plotProfile/{sample}.{type}.pdf", sample=SAMPLES, type={"TSS", "scale-regions"})
PLOTPROFILE_BED = expand(RESULT_DIR + "plotProfile/{sample}.{type}.bed", sample=SAMPLES, type={"TSS", "scale-regions"})
MULTIQC = "qc/multiqc.html"
FRAGMENTSIZE = RESULT_DIR + "bamPEFragmentSize/fragmentSize.png"
PLOTCOVERAGE = RESULT_DIR + "plotCoverage/Coverage.png"
FASTQC = expand(RESULT_DIR + "fastqc/{sample}.fastqc.html", sample=SAMPLES)
BIGWIG = expand(RESULT_DIR + "bigwig/{sample}_rpkm.bw", sample=SAMPLES)
###############
# Final output
......@@ -114,7 +101,8 @@ rule all:
input:
FASTP,
FASTQC,
BOWTIE2
BOWTIE2,
BIGWIG
message: "ChIP-seq SE pipeline succesfully run." #finger crossed to see this message!
shell:"#rm -rf {WORKING_DIR}"
......@@ -125,19 +113,6 @@ rule all:
include : "rules/external_data.smk"
include : 'rules/pre_processing.smk'
include : "rules/macs2_peak_calling.smk"
#include : "rules/macs2_peak_calling.smk"
include : "rules/deeptools_post_processing.smk"
rule multiqc:
input:
expand(RESULT_DIR + "fastqc/{sample}.fastqc.zip", sample= SAMPLES),
expand(RESULT_DIR + "bed/{treatment}_vs_{control}_peaks.xls", zip, treatment = CASES, control = CONTROLS)
output:
"qc/multiqc.html"
params:
"" # Optional: extra parameters for multiqc.
log:
"logs/multiqc.log"
wrapper:
"0.27.1/bio/multiqc"
......@@ -38,69 +38,10 @@ bowtie2:
sensitivity: "--very-sensitive-local"
verbose: "-q"
#Parameters for bamCoverage:
#BAMCOVERAGE
# bamCoverage --bam {input} -o {output} --effectiveGenomeSize
# --effectiveGenomeSize The effective genome size is the portion of the genome that is mappable. Large fractions of the genome are stretches of NNNN that should be discarded. Also, if repetitive regions were not included in the mapping of
# reads, the effective genome size needs to be adjusted accordingly.
bamCoverage:
params:
EFFECTIVEGENOMESIZE: '820000000' #source = http://plant-plasticity.github.io/resources/3_ATAC-seq%20data%20processing.pdf #option is --effectiveGenomeSize
EXTENDREADS: '200' # extend each reads with a 200bp to match the average fragment size of the ChIP experiment
binSize: "1000"
ignoreForNormalization: "SL3.0ch00" #list here space-delimited chromosomes that should be ignored for normalization, sex chromosomes usually.
smoothLength: "40"
normalizeUsing: "RPKM"
bamcompare:
binSize: "1000"
normalizeUsing: "RPKM" #others choices are CPM, BPM, RPGC, None more documentation:https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html?highlight=bamcompare
EFFECTIVEGENOMESIZE: '820000000'
operation : "log2" #others choices are ratio, subtract, add, mean, reciprocal_ratio, first, second more documentation:https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html?highlight=bamcompare
smoothLength: "40"
scaleFactorsMethod: "None" #others choices are readCount, ,SES
ignoreForNormalization: "SL3.0ch00" #list here space-delimited chromosomes that should be ignored for normalization, sex chromosomes usually.
# macs2 Parameters:
# for information over macs2, refer to https://github.com/taoliu/MACS
# regular peak calling : macs2 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01
# broad peak calling : macs2 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1
macs2:
genomesize: "--gsize mm" #here I used 'mm' because it's the closest to tomato, for human change to 'hs'
format: "--format BAM" #Use BAMPE to activate the paired end data, MACS will use the actual insert size of pairs of reads to build the fragemnt pileup.
qvalue: "0.05" #default is 0.05
outdir : "results/bed/"
bandwidth: "--bw 350" #the bandwidth is used to scan the genome for model building. To be set to the expected sonication fragment size.
multiBamSummary:
binSize: "10"
extendReads: "500" #this parameter is necessary when dealing with SE data
computeMatrix:
binSize : "10"
upstream : "3000"
downstream: "3000"
plotCorrelation:
corMethod : "pearson" # Can be replaced by spearman
whatToPlot: "heatmap" # Can be replaced by scatterplot
color : "PuBuGn" # see [here](https://matplotlib.org/examples/color/colormaps_reference.html) for alternative colors
plotHeatmap:
kmeans : "1"
color : "PuBuGn"
plot : "plot, heatmap and colorbar" # Others options are : “plot and heatmap”, “heatmap only” and “heatmap and colorbar”
plotFingerprint:
EXTENDREADS: '200'
binSize: "10"
plotProfile:
kmeans : "1" # choose the number of kmeans to compute
startLabel : "TSS" # default is TSS but could be anything, like "peak start"
endLabel : "TES" # TES is default but can be changed like for startLabel
bamcoverage:
binsize: '10'
normalizeUsing: 'RPKM'
effectiveGenomeSize: '2652783500' #https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html
smoothLength: '50'
rule bamCoverage:
rule bamcoverage:
input:
RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam"
bam = RESULT_DIR + "mapped/{sample}.sorted.bam",
bai = RESULT_DIR + "mapped/{sample}.sorted.bam.bai"
output:
RESULT_DIR + "bigwig/{sample}.bw"
bigwig = RESULT_DIR + "bigwig/{sample}_rpkm.bw"
message:
"Converting {wildcards.sample} bam into bigwig file"
"Create genome coverage tracks"
benchmark:
RESULT_DIR + "benchmark/bamcoverage_{sample}.benchmark.txt"
params:
binsize = config["bamcoverage"]["binsize"],
normalizeUsing = config["bamcoverage"]["normalizeUsing"],
effectiveGenomeSize = config["bamcoverage"]["effectiveGenomeSize"],
smoothLength = config["bamcoverage"]["smoothLength"]
log:
RESULT_DIR + "logs/deeptools/{sample}_bamtobigwig.log"
params:
EFFECTIVEGENOMESIZE = str(config["bamCoverage"]["params"]["EFFECTIVEGENOMESIZE"]), #take argument separated as a list separated with a space
EXTENDREADS = str(config["bamCoverage"]["params"]["EXTENDREADS"]),
binSize = str(config['bamCoverage']["params"]['binSize']),
normalizeUsing = str(config['bamCoverage']["params"]['normalizeUsing']),
ignoreForNormalization = str(config['bamCoverage']["params"]['ignoreForNormalization']),
smoothLength = str(config['bamCoverage']["params"]['smoothLength'])
conda:
"../envs/deeptools.yaml"
shell:
"bamCoverage --bam {input} \
-o {output} \
--effectiveGenomeSize {params.EFFECTIVEGENOMESIZE} \
--extendReads {params.EXTENDREADS} \
--binSize {params.binSize} \
--smoothLength {params.smoothLength} \
--ignoreForNormalization {params.ignoreForNormalization} \
&>{log}"
# rule bamcompare:
# input:
# treatment = RESULT_DIR + "mapped/{treatment}.sorted.rmdup.bam", #input requires an indexed bam file
# control = RESULT_DIR + "mapped/{control}.sorted.rmdup.bam" #input requires an indexed bam file
# output:
# bigwig = RESULT_DIR + "bamcompare/log2_{treatment}_{control}.bamcompare.bw"
# message:
# "Running bamCompare for {wildcards.treatment} and {wildcards.control}"
# log:
# RESULT_DIR + "logs/deeptools/log2_{treatment}_{control}.bamcompare.bw.log"
# conda:
# "../envs/deeptools.yaml"
# params:
# binSize = str(config['bamcompare']['binSize']),
# normalizeUsing = str(config['bamcompare']['normalizeUsing']),
# EFFECTIVEGENOMESIZE = str(config["bamcompare"]["EFFECTIVEGENOMESIZE"]),
# operation = str(config['bamcompare']['operation']),
# smoothLength = str(config['bamcompare']['smoothLength']),
# ignoreForNormalization = str(config['bamcompare']['ignoreForNormalization']),
# scaleFactorsMethod = str(config['bamcompare']['scaleFactorsMethod'])
# shell:
# "bamCompare -b1 {input.treatment} \
# -b2 {input.control} \
# --binSize {params.binSize} \
# -o {output.bigwig} \
# --normalizeUsing {params.normalizeUsing} \
# --operation {params.operation} \
# --smoothLength {params.smoothLength} \
# --ignoreForNormalization {params.ignoreForNormalization} \
# --scaleFactorsMethod {params.scaleFactorsMethod} \
# &>{log}"
# Aggregate read coverage of every sample's deduplicated BAM into a single
# numpy archive (deepTools multiBamSummary); MATRIX.npz feeds plotCorrelation.
rule multiBamSummary:
    input:
        # Aggregates over ALL samples, so the wildcards argument is unused.
        lambda wildcards: expand(RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam", sample = SAMPLES)
    output:
        RESULT_DIR + "multiBamSummary/MATRIX.npz"
    message:
        "Computing the read coverage into a numpy array "
    threads: 10
    params:
        # bin width (bp) used when computing coverage
        binSize = str(config['multiBamSummary']['binSize']),
        # extend reads to the expected fragment length (bp); needed for SE data
        extendReads = str(config['multiBamSummary']['extendReads'])
    log:
        RESULT_DIR + "logs/deeptools/multibamsummary/MATRIX.log"
    # conda directive added for consistency with the sibling deepTools rules
    conda:
        "../envs/deeptools.yaml"
    shell:
        # FIX: a space was missing between "{threads}" and the "\" continuation
        "multiBamSummary bins \
        --bamfiles {input} \
        --numberOfProcessors {threads} \
        --binSize {params.binSize} \
        --centerReads \
        --extendReads {params.extendReads} \
        -o {output} \
        2> {log}"
# Plot pairwise correlations between all samples (deepTools plotCorrelation)
# from the coverage matrix produced by the multiBamSummary rule.
rule plotCorrelation:
    input:
        RESULT_DIR + "multiBamSummary/MATRIX.npz"
    output:
        RESULT_DIR + "plotCorrelation/MATRIX.png"
    log:
        RESULT_DIR + "logs/deeptools/plotcorrelation/MATRIX.log"
    params:
        corMethod = str(config['plotCorrelation']['corMethod']),    # "pearson" or "spearman"
        whatToPlot = str(config['plotCorrelation']['whatToPlot']),  # "heatmap" or "scatterplot"
        color = str(config['plotCorrelation']['color'])             # matplotlib colormap name
    conda:
        "../envs/deeptools.yaml"
    message:
        "Preparing the correlation plot between all samples"
    shell:
        "plotCorrelation \
        --corData {input} \
        --corMethod {params.corMethod} \
        --whatToPlot {params.whatToPlot} \
        --skipZeros \
        --colorMap {params.color} \
        --plotFile {output} \
        --plotNumbers \
        2> {log}"
    #--plotTitle 'Pearson Correlation of {params.title} coverage' \
# Score bigWig coverage around gene TSSs (deepTools computeMatrix
# reference-point mode); the matrix feeds plotHeatmap / plotProfile.
rule computeMatrix:
    input:
        bigwig = RESULT_DIR + "bigwig/{sample}.bw",
        bed = WORKING_DIR + "gene_model.gtf"
    output:
        RESULT_DIR + "computematrix/{sample}.TSS.gz"
    threads: 10
    params:
        binSize = str(config['computeMatrix']['binSize']),
        upstream = str(config['computeMatrix']['upstream']),      # bp BEFORE the TSS
        downstream = str(config['computeMatrix']['downstream'])   # bp AFTER the TSS
    conda:
        "../envs/deeptools.yaml"
    log:
        RESULT_DIR + "logs/deeptools/computematrix/{sample}.log"
    message:
        "Computing matrix for {input.bigwig} with {params.binSize} bp windows and {params.upstream} bp around TSS"
    # BUGFIX: upstream maps to --beforeRegionStartLength (-b) and downstream
    # to --afterRegionStartLength (-a); the two flags were swapped. Invisible
    # so far only because the config sets both to 3000 bp.
    shell:
        "computeMatrix \
        reference-point \
        --referencePoint TSS \
        -S {input.bigwig} \
        -R {input.bed} \
        --beforeRegionStartLength {params.upstream} \
        --afterRegionStartLength {params.downstream} \
        --numberOfProcessors {threads} \
        --binSize {params.binSize} \
        -o {output} \
        2> {log}"
# Score bigWig coverage over whole gene bodies (deepTools computeMatrix
# scale-regions mode); the matrix feeds plotHeatmap / plotProfile.
rule computeMatrix_scale:
    input:
        bigwig = RESULT_DIR + "bigwig/{sample}.bw",
        bed = WORKING_DIR + "gene_model.gtf"
    output:
        RESULT_DIR + "computematrix/{sample}.scale-regions.gz"
    threads: 10
    params:
        binSize = str(config['computeMatrix']['binSize']),
        upstream = str(config['computeMatrix']['upstream']),      # bp BEFORE the region start
        downstream = str(config['computeMatrix']['downstream'])   # bp AFTER the region end
    conda:
        "../envs/deeptools.yaml"
    log:
        # ".scale-regions" suffix added so this rule no longer writes to the
        # same log file as the reference-point computeMatrix rule
        RESULT_DIR + "logs/deeptools/computematrix/{sample}.scale-regions.log"
    # BUGFIX: upstream maps to --beforeRegionStartLength (-b) and downstream
    # to --afterRegionStartLength (-a); the two flags were swapped.
    shell:
        "computeMatrix \
        scale-regions \
        -S {input.bigwig} \
        -R {input.bed} \
        --beforeRegionStartLength {params.upstream} \
        --afterRegionStartLength {params.downstream} \
        --numberOfProcessors {threads} \
        --binSize {params.binSize} \
        -o {output} \
        2> {log}"
# Render a coverage heatmap (deepTools plotHeatmap) from a computeMatrix
# archive; {type} is "TSS" or "scale-regions".
rule plotHeatmap:
    input:
        RESULT_DIR + "computematrix/{sample}.{type}.gz"
    output:
        RESULT_DIR + "heatmap/{sample}.{type}.pdf"
    params:
        kmeans = str(config['plotHeatmap']['kmeans']),  # number of k-means clusters
        color = str(config['plotHeatmap']['color']),    # matplotlib colormap name
        plot = str(config['plotHeatmap']['plot']),      # NOTE(review): declared but unused in the shell command
        # BUGFIX: {type} added so the TSS and scale-regions jobs for a sample
        # do not overwrite each other's sorted-regions BED file
        cluster = RESULT_DIR + "heatmap/{sample}.{type}.bed"
    conda:
        "../envs/deeptools.yaml"
    log:
        RESULT_DIR + "logs/deeptools/plotHeatmap/{sample}.{type}.log"
    message:
        "Preparing Heatmaps..."
    # BUGFIX: the rule declared a log file but never redirected to it
    shell:
        "plotHeatmap \
        --matrixFile {input} \
        --outFileName {output} \
        --kmeans {params.kmeans} \
        --colorMap {params.color} \
        --legendLocation best \
        --outFileSortedRegions {params.cluster} \
        2> {log}"
# Assess ChIP enrichment strength across all samples (deepTools
# plotFingerprint) from the deduplicated BAM files.
rule plotFingerprint:
    input:
        expand(RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam", sample = SAMPLES)
    output:
        pdf = RESULT_DIR + "plotFingerprint/Fingerplot.pdf"
    params:
        # NOTE(review): these values are read from the bamCoverage config
        # section (as in the original rule) — confirm this is intended.
        EXTENDREADS = str(config["bamCoverage"]["params"]["EXTENDREADS"]),
        binSize = str(config['bamCoverage']["params"]['binSize'])
    conda:
        "../envs/deeptools.yaml"
    # log directive added for consistency with the sibling deepTools rules,
    # which all capture stderr; this rule previously discarded it
    log:
        RESULT_DIR + "logs/deeptools/plotFingerprint/Fingerplot.log"
    message:
        "Preparing deeptools plotFingerprint"
    shell:
        "plotFingerprint \
        -b {input} \
        --extendReads {params.EXTENDREADS} \
        --binSize {params.binSize} \
        --plotFile {output} \
        2> {log}"
# Plot average signal profiles (deepTools plotProfile) from a computeMatrix
# archive; {type} is "TSS" or "scale-regions". Also writes the sorted/
# clustered regions as a BED file.
rule plotProfile:
    input:
        RESULT_DIR + "computematrix/{sample}.{type}.gz"
    output:
        pdf = RESULT_DIR + "plotProfile/{sample}.{type}.pdf",
        bed = RESULT_DIR + "plotProfile/{sample}.{type}.bed"
    params:
        kmeans = str(config['plotProfile']['kmeans']),          # number of k-means clusters
        startLabel = str(config['plotProfile']['startLabel']),  # x-axis label for region start
        endLabel = str(config['plotProfile']['endLabel'])       # x-axis label for region end
    conda:
        "../envs/deeptools.yaml"
    log:
        RESULT_DIR + "logs/deeptools/plotProfile/{sample}.{type}.log"
    message:
        "Preparing deeptools plotProfile"
        # NOTE(review): the line below looks like a stray fragment from the
        # bamcoverage rule's log directive, interleaved by the diff view —
        # verify against the repository before relying on this rule.
        RESULT_DIR + "log/bamcoverage/{sample}.log"
    shell:
        "plotProfile \
        --matrixFile {input} \
        --outFileName {output.pdf} \
        --outFileSortedRegions {output.bed} \
        --kmeans {params.kmeans} \
        --startLabel {params.startLabel} \
        --endLabel {params.endLabel}"
"bamCoverage -b {input.bam} --binSize {params.binsize} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalizeUsing} --smoothLength {params.smoothLength} -o {output.bigwig} 2>{log}"
################## Rules used for QC ##################
rule fastp:
input:
get_fastq
......@@ -81,30 +79,23 @@ rule sort:
input:
WORKING_DIR + "mapped/{sample}.bam"
output:
temp(RESULT_DIR + "mapped/{sample}.sorted.bam")
bam = RESULT_DIR + "mapped/{sample}.sorted.bam"
message:"Sorting {wildcards.sample} bam file"
threads: 10
log:
RESULT_DIR + "logs/samtools/{sample}.sort.log"
conda:
"../envs/samtools.yaml"
shell:
"samtools sort -@ {threads} -o {output} {input} &>{log}"
"""
samtools sort -@ {threads} -o {output.bam} {input} &>{log}
samtools index {output.bam}
"""
rule rmdup:
rule index_bam:
input:
RESULT_DIR + "mapped/{sample}.sorted.bam"
output:
bam = RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam",
bai = RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam.bai" #bai files required for the bigwig and bamCompare rules
message: "Removing duplicate from file {wildcards.sample}"
output:
RESULT_DIR + "mapped/{sample}.sorted.bam.bai"
log:
RESULT_DIR + "logs/samtools/{sample}.sorted.rmdup.bam.log"
conda:
"../envs/samtools.yaml"
RESULT_DIR + "log/sort/{sample}.log"
shell:
"""
samtools rmdup -s {input} {output.bam} &>{log}
samtools index {output.bam}
"""
#samtools manual says "This command is obsolete. Use markdup instead."
"samtools index {input} 2>{log}"
......@@ -6,7 +6,9 @@
#SBATCH -t 24:00:00
#SBATCH --mem=15000
module purge
module load genomics/ngs/samtools/1.11/gcc-8.3.1
module load genomics/ngs/aligners/bowtie2/2.4.2/gcc-8.3.1
echo Start time : `date`
snakemake -p \
--snakefile Snakefile \
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment