Commit 77741dda authored by JihedC's avatar JihedC
Browse files

Include working HMMRATAC and cleaned rules

parent 351a5144
......@@ -45,6 +45,7 @@ wildcard_constraints:
# Here we define the outputs of rules we want the pipeline to produce.
# The varialble defined here is then used in the `rule all`.
# Fastqc rule need to be modified to accept any sample
# HMMRATAC will not work with the test dataset.
FASTQC = expand(RESULT_DIR + "fastqc/sample_{numbers}_{R}_fastqc.html",numbers = ['8','12','4_3','4_1'], R=['R1', 'R2']),
FORWARD_READS = expand(WORKING_DIR + "trimmed/{samples}_forward.fastq.gz", samples = SAMPLES),
......@@ -57,7 +58,7 @@ DEDUP = expand(WORKING_DIR + "dedup/{samples}.dedup.bam",
STATS = expand(WORKING_DIR + "dedup/{samples}.dedup.stats", samples = SAMPLES),
SORTED_INDEXED = expand(WORKING_DIR + "sort/{samples}.sorted.bam.bai", samples = SAMPLES),
GENOME_INFO = expand(WORKING_DIR + "genome_info/{samples}.genome.info", samples = SAMPLES),
GAPPED_PEAK = expand(RESULT_DIR + "peaks/{samples}_peaks.gappedPeak", samples = SAMPLES),
HMMRATAC = expand(RESULT_DIR + "hmmratac/{samples}_peaks.gappedPeak", samples = SAMPLES),
NAME_LOG = expand(RESULT_DIR + "peaks/{samples}.log", samples = SAMPLES),
NARROWPEAK = expand(RESULT_DIR + "macs2/{samples}_peaks.narrowPeak", samples = SAMPLES),
COVERAGE_TRACK = expand(RESULT_DIR + "bamCoverage/{samples}.bw", samples = SAMPLES),
......@@ -84,9 +85,7 @@ rule all:
#PCAPLOT,
#SORTED_INDEXED,
#NARROWPEAK,
#GENOME_INFO,
#GAPPED_PEAK,
#NAME_LOG
HMMRATAC,
message : "Analysis is complete!"
shell:""
......@@ -95,3 +94,4 @@ rule all:
include: "rules/external_data.smk"
include: "rules/pre-processing.smk"
include: "rules/macs2.smk"
include: "rules/hmmratac.smk"
......@@ -11,7 +11,7 @@ units : "units.tsv"
## genome reference ##
genome_fasta_url : "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M17/GRCm38.primary_assembly.genome.fa.gz"
HMMRATAC: "https://github.com/LiuLabUB/HMMRATAC/releases/download/1.2.10/HMMRATAC_V1.2.10_exe.jar"
## trimmomatic ##
......
#########################
#Sets of rules for deeptools postprocessing. All parameters can be modified in the configuration file.
#########################
rule bamCoverage:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
coverage_track = RESULT_DIR + "bamCoverage/{samples}.bw"
coverage_track = RESULT_DIR + "bamCoverage/{samples}.bw"
params:
effectiveGenomeSize = str(config['bamCoverage']['effectiveGenomeSize']),
normalization = str(config['bamCoverage']['normalization']),
......@@ -12,7 +16,14 @@ rule bamCoverage:
conda:
"../envs/deeptools.yaml"
shell:
"bamCoverage -b {input} --effectiveGenomeSize {params.effectiveGenomeSize} --normalizeUsing {params.normalization} --ignoreDuplicates -o {output} --binSize {params.binSize}"
"""
bamCoverage -b {input} \
--effectiveGenomeSize {params.effectiveGenomeSize} \
--normalizeUsing {params.normalization} \
--ignoreDuplicates \
-o {output} \
--binSize {params.binSize}
"""
#bdg: create bedgraph output files
#format=BAMPE: only use properly-paired read alignments
......@@ -23,23 +34,30 @@ rule multibigwigSummary:
input:
lambda wildcards: expand(RESULT_DIR + "bamCoverage/{sample}.bw", sample = SAMPLES)
output:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
log:
RESULT_DIR + "logs/bigwigsummary/multiBigwigSummary.log"
conda:
"../envs/deeptools.yaml"
shell:
"multiBigwigSummary bins -b {input} -o {output} "
"""
multiBigwigSummary bins -b {input} \
-o {output}
"""
#2>{log}
# PlotPCA does not work with a bigwigsummary made of only one sample
rule PCA:
input:
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
bigwigsummary = RESULT_DIR + "bigwigsummary/multiBigwigSummary.npz"
output:
PCAplot = RESULT_DIR + "PCA/PCA_PLOT.pdf"
PCAplot = RESULT_DIR + "PCA/PCA_PLOT.pdf"
log:
RESULT_DIR + "logs/bigwigsummary/PCA_PLOT.log"
conda:
"../envs/deeptools.yaml"
shell:
"plotPCA -in {input} -o {output} 2>{log}"
"""
plotPCA -in {input} \
-o {output} \
2>{log}
"""
#########################
#Sets of rules for trimming, mapping, sorting and indexing of ATAC-seq reads.
#########################
rule download_genome:
output:
WORKING_DIR + "reference",
......
rule get_genome_info:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
#########################
#Sets of rules for HMMRATAC
#Will only work with Paired-end ATAC-data
#--no-mixed argument is required in bowtie2 rules otherwise HMMRATAC will throw an error message.
#HMMRATAC will not work with the test dataset. The sample size is too small.
#########################
rule download_hmmratac:
output:
genome_info = WORKING_DIR + "genome_info/{samples}.genome.info"
conda:
"../envs/perl.yaml"
params:
hmmratac = config["HMMRATAC"]
log:
RESULT_DIR + "logs/hmmratac/download_hmmratac_github.log"
shell:
"""
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2 | cut -f 1 > tmp
samtools view -H {input} | grep SQ | cut -f 2-3 | cut -d ':' -f 2,3 | cut -d ':' -f 2 > tmp2
paste tmp tmp2 > {output}
rm tmp tmp2
"""
"wget {params.hmmratac}"
rule peak_calling:
rule hmmratac:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
map_sorted_indexed = WORKING_DIR + "index/{samples}.sorted.bam.bai"
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam",
map_sorted_indexed = WORKING_DIR + "sort/{samples}.sorted.bam.bai"
output:
gapped_peak = RESULT_DIR + "peaks/{samples}_peaks.gappedPeak",
Name_log = RESULT_DIR + "peaks/{samples}.log"
gapped_peak = RESULT_DIR + "hmmratac/{samples}_peaks.gappedPeak"
params:
genome_info = str(config['hmmratac']['genome.info']),
output_preflix = "{samples}"
genome_info = str(config['hmmratac']['genome.info']),
output_preflix = "{samples}"
log:
RESULT_DIR + "logs/peak_calling/{samples}.log"
conda:
"../envs/hmmratac.yaml"
RESULT_DIR + "logs/hmmratac/{samples}_hmmratac.log"
shell:
"""
HMMRATAC -b {input.map_sorted} -i {input.map_sorted_indexed} -g {params.genome_info} -o {params.output_preflix}
java -jar HMMRATAC_V1.2.10_exe.jar -b {input.map_sorted} \
-i {input.map_sorted_indexed} \
-g {params.genome_info} \
-o {params.output_preflix} \
2>{log}
"""
#########################
#Sets of rules for MACS2 peak calling
#########################
rule call_narrow_peaks:
input:
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
map_sorted = WORKING_DIR + "sort/{samples}.sorted.bam"
output:
narrowPeak = RESULT_DIR + "macs2/{samples}_peaks.narrowPeak"
narrowPeak = RESULT_DIR + "macs2/{samples}_peaks.narrowPeak"
params:
name = "{samples}",
format = str(config['macs2']['format']),
genomesize = str(config['macs2']['genomesize']),
outdir = str(config['macs2']['outdir'])
name = "{samples}",
format = str(config['macs2']['format']),
genomesize = str(config['macs2']['genomesize']),
outdir = str(config['macs2']['outdir'])
log:
RESULT_DIR + "logs/macs2/{samples}_peaks.narrowPeak.log"
conda:
"../envs/macs2.yaml"
shell:
"""
macs2 callpeak -t {input} {params.format} {params.genomesize} --name {params.name} --nomodel --bdg -q 0.05 --outdir {params.outdir}/ 2>{log}
macs2 callpeak -t {input} {params.format} {params.genomesize} \
--name {params.name} --nomodel --bdg -q 0.05 \
--outdir {params.outdir}/ 2>{log}
"""
#########################
#Sets of rules for trimming, mapping, sorting and indexing of ATAC-seq reads.
#########################
rule trimmomatic:
input:
reads = get_fastq,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment