Commit 3f384cd7 authored by WJH58's avatar WJH58
Browse files

working pipeline for fastqc and index_genome

parent 4eb7c33d
.DS_Store
ATAC-seq_pipeline/temp/*
## directories ##
working_dir: "temp/"
results_dir: "results/"
data_dir: "../data/"
sample : "units.tsv"
## genome reference ##
genome_fasta_url : "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M17/GRCm38.primary_assembly.genome.fa.gz"
## trimmomatic ##
adapters: "adapters.fasta"
channels:
- bioconda
dependencies:
- fastqc=0.11.9
channels:
- bioconda
dependencies:
- bowtie2=2.4.2
- samtools=1.11
#######Configuration files#######
GENOME_FASTA_URL = config[‘genome_fasta_url’]
WORKING_DIR = config[working_dir]
RESULT_DIR = config[results_dir]
#######Rules###########
rule download_genome:
output:
WORKING_DIR + “reference.fa”
shell:
“wget -o {output} {GENOME_FASTA_URL}”
# download the file from GENOME_FASTA_URL link and save the file to {output}
rule fastqc:
input:
fwd = WORKING_DIR + “{samples}_fwd.fq”
rev = WORKING_DIR + “{samples}_rev.fq”
output:
R1 = RESULT_DIR + “fastqc/{samples}_fwd.fq.gz”
R2 = RESULT_DIR + “fastqc/{samples}_rev.fq.gz”
shell:
“fastqc -o {output} -t 10 {input}”
# fastqc [-o output dir] [--(no)extract] [-f fastq|bam|sam] [-c contaminant file] seqfile1 .. seqfileN
rule trimmomatic:
input:
R1 = RESULT_DIR + “fastqc/{samples}_fwd.fq.gz”
R2 = RESULT_DIR + “fastqc/{samples}_rev.fq.gz”
output:
forward = WORKING_DIR + “trimmed/{samples}_forward.fq.gz”
reverse = WORKING_DIR + “trimmed/{samples}_reverse.fq.gz”
shell:
“java -jar trimmomatic-0.35.jar PE -phred33 {input} {output}”
rule fastqc2:
input:
forward = WORKING_DIR + “trimmed/{samples}_forward.fq.gz”
reverse = WORKING_DIR + “trimmed/{samples}_reverse.fq.gz”output:
RESULT_DIR + “fastqc/{samples}_forward_posttrim.fq.gz”
RESULT_DIR + “fastqc/{samples}_reverse_posttrim.fq.gz”
shell:
“fastqc -o {output} -t 6 {input}”
rule index:
input:
WORKING_DIR + “reference.fa”
output:
WORKING_DIR + “genome.rev.1.bt2”
WORKING_DIR + “genome.rev.2.bt2”
message:
“index reference genome”
params:
WORKING_DIR + “genome_index”
shell:
“bowtie2-build –threads 10 {input}{params}”
rule align:
input:
index = WORKING_DIR + “genome_index”
forward = WORKING_DIR + “trimmed/{samples}_forward.fq.gz”
reverse = WORKING_DIR + “trimmed/{samples}_reverse.fq.gz”
params:
index = WORKING_DIR + “genome_index”
output:
WORKING_DIR + “mapped/{sample}.bam”
message:
“mapping samples to reference genome”
shell:
“bowtie2 -t 10 {params.index} -1 {input.forward} -2{input.reverse} | samtools view -Sb - > {output}”
################## Import libraries ##################
import pandas as pd
import os
import sys
from subprocess import call
import itertools
from snakemake.utils import R
################## Configuration file and PATHS##################
configfile: "config.yaml"
WORKING_DIR = config["working_dir"]
RESULT_DIR = config["results_dir"]
DATA_DIR = config["data_dir"]
GENOME_FASTA_URL = config["genome_fasta_url"]
#units = pd.read_table(config["units"], dtype=str).set_index(["bed"], drop=False)
#BED = units.index.get_level_values('bed').unique().tolist()
units = pd.read_table(config["sample"], dtype=str).set_index(["sample"], drop=False)
SAMPLES = units.index.get_level_values('sample').unique().tolist()
###############
# Helper Functions
###############
def get_fastq(wildcards):
return units.loc[(sample), ["fq1", "fq2"]].dropna()
################## DESIRED OUTPUT ##################
FASTQC = expand(RESULT_DIR + "fastqc/{samples}_{R}_2000reads_fastqc.html", samples = SAMPLES, R=['R1', 'R2'])
rule all:
input:
FASTQC,
WORKING_DIR + "reference",
[WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
WORKING_DIR + "genome.rev.1.bt2",
WORKING_DIR + "genome.rev.2.bt2"
message : "Analysis is complete!"
shell:""
################## INCLUDE RULES ##################
rule download_genome:
output:
WORKING_DIR + "reference",
shell:
"wget -O {output} {GENOME_FASTA_URL}"
rule index_genome:
input:
WORKING_DIR + "reference",
output:
[WORKING_DIR + "genome." + str(i) + ".bt2" for i in range(1,4)],
WORKING_DIR + "genome.rev.1.bt2",
WORKING_DIR + "genome.rev.2.bt2"
message:"Indexing Reference genome"
params:
WORKING_DIR + "genome"
conda:
"envs/samtools_bowtie.yaml"
shell:
"bowtie2-build --threads 10 {input} {params}"
rule trimmomatic:
input:
rule fastqc:
input:
fwd = expand(DATA_DIR + "{samples}_R1_2000reads.fq.gz", samples = SAMPLES),
rev = expand(DATA_DIR + "{samples}_R2_2000reads.fq.gz", samples = SAMPLES)
output:
expand(RESULT_DIR + "fastqc/{samples}_{R}_2000reads_fastqc.html", samples = SAMPLES, R=['R1', 'R2'])
log:
expand(RESULT_DIR + "logs/fastqc/{samples}.fastqc.log", samples = SAMPLES)
params:
RESULT_DIR + "fastqc/"
conda:
"envs/fastqc.yaml"
shell:
"fastqc --outdir={params} {input.fwd} {input.rev} &>{log}"
sample fq1 fq2
WT8 ../data/WT8_R1_2000reads.fq.gz ../data/WT8_R2_2000reads.fq.gz
KO12 ../data/KO12_R1_10reads.fq.gz ../data/KO12_R2_10reads.fq.gz
This diff is collapsed.
File added
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment