Commit 2d67f3da authored by JihedC

initiate commit

parent c0eedc0e
# .gitignore
temp/*
results/
temp/trimmed/
.snakemake/
# Snakemake pipeline for the analysis of ChIP-seq PE and SE data

###############
# Libraries
###############
import os
import sys
import itertools
from subprocess import call

import pandas as pd
from snakemake.utils import validate, min_version, R

#############################################
# Configuration and sample sheets
#############################################
configfile: "configs/config.yaml"

WORKING_DIR = config["working_dir"]  # where intermediate files are stored (this directory is cleaned up at the end)
RESULT_DIR  = config["result_dir"]   # what you want to keep
annotation  = config["annotation"]

GENOME_FASTA_URL  = config["refs"]["genome_url"]
GENOME_FASTA_FILE = os.path.basename(config["refs"]["genome_url"])
GFF_URL           = config["refs"]["gff_url"]
GFF_FILE          = os.path.basename(config["refs"]["gff_url"])

TOTALCORES = 16  # check this via 'grep -c processor /proc/cpuinfo'
###############
# Helper Functions
###############
# Read the tab-separated sample sheet with columns: sample, fq1, fq2 and condition.
units = pd.read_table(config["units"], dtype=str).set_index(["sample"], drop=False)
samples = pd.read_csv(config["units"], dtype=str, index_col=0, sep="\t")
samplefile = config["units"]

SAMPLES = units.index.get_level_values("sample").unique().tolist()
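# A minimal sketch of what units.tsv is assumed to look like (sample names and
# paths below are hypothetical; fq2 is left empty for single-end samples):
#
#   sample   fq1                    fq2                    condition
#   ChIP1    fastq/ChIP1_R1.fq.gz   fastq/ChIP1_R2.fq.gz   treatment
#   ChIP2    fastq/ChIP2_R1.fq.gz                          control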
##############
# Samples and conditions
##############
def sample_is_single_end(sample):
    """Return True when the sample has no fq2 entry in units.tsv (i.e. is single-end)."""
    if "fq2" not in samples.columns:
        return True
    else:
        return pd.isnull(samples.loc[(sample), "fq2"])
def get_fastq(wildcards):
    """Check whether the sample is paired-end or single-end and return one or two fastq file names."""
    if sample_is_single_end(wildcards.sample):
        return samples.loc[(wildcards.sample), ["fq1"]].dropna()
    else:
        return samples.loc[(wildcards.sample), ["fq1", "fq2"]].dropna()
def get_trim_names(wildcards):
    """
    This function:
      1. Checks if the sample is paired-end or single-end.
      2. Returns the correct input and output trimmed file names.
    """
    if sample_is_single_end(wildcards.sample):
        inFile = samples.loc[(wildcards.sample), ["fq1"]].dropna()
        return "--in1 " + inFile[0] + " --out1 " + WORKING_DIR + "trimmed/" + wildcards.sample + "_R1_trimmed.fq.gz"
    else:
        inFile = samples.loc[(wildcards.sample), ["fq1", "fq2"]].dropna()
        return "--in1 " + inFile[0] + " --in2 " + inFile[1] + " --out1 " + WORKING_DIR + "trimmed/" + wildcards.sample + "_R1_trimmed.fq.gz --out2 " + WORKING_DIR + "trimmed/" + wildcards.sample + "_R2_trimmed.fq.gz"
def get_star_names(wildcards):
    """
    This function:
      1. Checks if the sample is paired-end or single-end.
      2. Returns the correct input file names for the STAR mapping step.
    """
    if sample_is_single_end(wildcards.sample):
        return WORKING_DIR + "trimmed/" + wildcards.sample + "_R1_trimmed.fq.gz"
    else:
        return WORKING_DIR + "trimmed/" + wildcards.sample + "_R1_trimmed.fq.gz " + WORKING_DIR + "trimmed/" + wildcards.sample + "_R2_trimmed.fq.gz"
def get_samples_per_treatment(input_df="units.tsv", colsamples="sample", coltreatment="condition", treatment="control"):
    """Return the list of samples that correspond to the same experimental condition."""
    df = pd.read_table(input_df)
    df = df.loc[df[coltreatment] == treatment]
    filtered_samples = df[colsamples].tolist()
    return filtered_samples
CASES = get_samples_per_treatment(treatment="treatment")
CONTROLS = get_samples_per_treatment(treatment="control")
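# With the hypothetical units.tsv sketched above, CASES would be ["ChIP1"] and
# CONTROLS would be ["ChIP2"].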
GROUPS = {
    "group1": ["ChIP1", "ChIP3"],
    "group2": ["ChIP2", "ChIP4"]
}  # this dictionary defines the groups of samples used in multiBamSummary; it could be improved a lot

##############
# Wildcards
##############
wildcard_constraints:
    sample = "[A-Za-z0-9]+",
    unit = "[A-Za-z0-9]+"
##############
# Desired output
##############
FASTP = expand(WORKING_DIR + "trimmed/" + "{sample}_{read}_trimmed.fq.gz", sample=SAMPLES, read=["R1", "R2"])
BOWTIE2 = expand(WORKING_DIR + "mapped/{sample}.bam", sample=SAMPLES)
FASTQC = expand(RESULT_DIR + "fastqc/{sample}.fastqc.html", sample=SAMPLES)
FASTQC_REPORTS = expand(RESULT_DIR + "fastqc/{sample}.fastqc.zip", sample=SAMPLES)
BAM_INDEX = expand(RESULT_DIR + "mapped/{sample}.sorted.rmdup.bam.bai", sample=SAMPLES)
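# With the hypothetical SAMPLES ["ChIP1", "ChIP2"] and working_dir "temp/",
# FASTP would expand to: temp/trimmed/ChIP1_R1_trimmed.fq.gz,
# temp/trimmed/ChIP1_R2_trimmed.fq.gz, temp/trimmed/ChIP2_R1_trimmed.fq.gz, ...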
# ... (further desired-output variables are collapsed in the diff view)
PLOTCOVERAGE = RESULT_DIR + "plotCoverage/Coverage.png"
################
rule all:
    input:
        FASTP,
        FASTQC,
        BOWTIE2
    message: "ChIP-seq SE pipeline successfully run."  # fingers crossed to see this message!
    shell: "#rm -rf {WORKING_DIR}"
# configs/config.yaml
samples: sample.tsv
units: units.tsv
# files and directories
#fastq_dir: "fastq/"
working_dir: "temp/"
result_dir: "results/"
annotation: "annotation/"
#Parameters for the rules
GENOME_ZIP_FASTA_URL: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M20/GRCm38.primary_assembly.genome.fa.gz
GENOME_ZIP_GTF_URL: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M20/gencode.vM20.annotation.gtf.gz
#ENCODE repeat_masker
REPEAT_GTF_URL: http://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/GRCm38_Ensembl_rmsk_TE.gtf.gz
REPEAT_LOCIND: http://labshare.cshl.edu/shares/mhammelllab/www-data/TElocal/prebuilt_indices/mm10_rmsk_TE.gtf.locInd.gz # other prebuilt indices can be found at: http://labshare.cshl.edu/shares/mhammelllab/www-data/TElocal/prebuilt_indices/
# trimmomatic parameters
trimmomatic:
  adapters: "adapters.fasta"
  seedMisMatches: '2'
  palindromeClipTreshold: '30'
  simpleClipThreshold: '10'
  LeadMinTrimQual: '3'
  TrailMinTrimQual: '3'
  windowSize: '4'
  avgMinQual: '15'
  minReadLength: '40'
  phred: "-phred33"  # for Illumina >1.8 the quality scores are phred33-encoded

# read quality trimming
fastp:
  qualified_quality_phred: 30  # Phred+33 score (> 15 for Proton Ion, > 30 or more for Illumina)
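# A hedged sketch of how the fastp value above would typically surface on the
# command line (file names are hypothetical; the exact rule may differ):
#   fastp --qualified_quality_phred 30 --in1 ChIP1_R1.fq.gz --out1 ChIP1_R1_trimmed.fq.gz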
## Genomic references, annotations and aligner indexes
refs:
  # ... (reference entries are collapsed in the diff view)

bowtie2:
  params:
    mode: "--local"
    sensitivity: "--very-sensitive-local"
    max_fragment_len: "--maxins 500"  # maximum fragment length for valid paired-end alignments
    min_fragment_len: "--minins 80"   # minimum fragment length for valid paired-end alignments
    verbose: "-q"
# Parameters for bamCoverage:
# ... (remaining entries are collapsed in the diff view)