Commit acc6235f authored by BeatriceTan

Updated conda requirements and file locations.

parent fafa8d94
......@@ -28,11 +28,10 @@ onstart:
print("- Output directory: " + config["workdir"] + "\n\n")
onsuccess:
shell("echo \n\nPipeline completed.")
shell("echo Output directory: " + config["workdir"])
print("\n\nPipeline completed.\nOutput directory: " + config["workdir"])
onerror:
print("\n\nPipeline failed. Possible reasons:\n- Wrong input files")
print("\n\nPipeline failed. Possible reasons:\n- Wrong input files\n- Missing arguments in config file\n- Wrong conda environment\n")
rule all:
......
#Directories to be specified
workdir: /home/beatrice/Documents/SASC
#/home/beatrice/CNA_analysis #directory to write output to
gisticdir: /home/beatrice/CNA_analysis/run_gistic2 #directory to install GISTIC2 to
workdir: /home/bftan/CNA_results #directory to write output
gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
#Input details to download from firehose
cancer_type: STES
cancer_type: SKCM
date_data: "2016_07_15"
#Or provide input file
......@@ -12,11 +11,11 @@ inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data
#Data for running and benchmarking tools.
reference: hg19 #or possible to provide ref file?
markerfile: /home/beatrice/Documents/SASC/Input_files/markers.tsv
prev_found_genes: /home/beatrice/Documents/SASC/Input_files/intogen-CM-drivers-data.tsv #/home/beatrice/Documents/SASC/Input_files/SKCM_genes.txt
census_genes: /home/beatrice/Documents/SASC/Input_files/Census_genes.txt
biomart_genes: /home/beatrice/Documents/SASC/Input_files/biomart_human_genes.tsv
reference: hg19
markerfile: input_files/markers.tsv
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
......
channels:
- conda-forge
dependencies:
- matplotlib-venn =0.11.5
......@@ -7,7 +7,7 @@ rule firehose:
date=config["date_data"],
dateshort=str(config["date_data"]).replace("_", "")
conda:
"workflow.basedir/envs/firehose.yaml"
workflow.basedir + "/envs/firehose.yaml"
shell:
"echo yes | firehose_get -tasks Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg stddata {params.date} {params.cancer_type} && \
DATA_DIR=stddata__{params.date}/{params.cancer_type}/{params.dateshort} && \
......
from Rubic import get_seg_rubic
from Reports_new import ReportTools, ReportSegmentation
import os.path
rule install_gistic:
"""Install GISTIC2 to a directory of choice."""
......@@ -14,7 +15,7 @@ rule run_gistic:
gistic_directory=config["gisticdir"],
seg="Input/Segments_tumor.txt"
output:
"GISTIC_results/new"
"GISTIC/"
params:
cnv="",
ref=config["reference"],
......@@ -41,7 +42,7 @@ rule run_rubic:
seg="RUBIC/Segmentation_file.txt",
markers="RUBIC/Marker_file.txt"
output:
"RUBIC_results"
"RUBIC/"
params:
fdr="0.25",
genefile="" #config["biomart_genes"]
......@@ -53,8 +54,8 @@ rule run_rubic:
rule report_tools:
"""Report the differences in calls between GISTIC2 and RUBIC."""
input:
gistic="GISTIC_results/new",
rubic="RUBIC_results"
gistic="GISTIC/",
rubic="RUBIC/"
output:
tools="Reports/Tools.txt",
genes_both="Reports/Genes_both.txt",
......@@ -64,13 +65,13 @@ rule report_tools:
venn="Reports/Venn_overlap_genes.png",
histogram="Reports/Histogram_sizes.png",
swarmplot="Reports/Swarmplot_sizes.png"
params:
census=config["census_genes"],
known=config["prev_found_genes"],
gene_info=config["biomart_genes"]
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
gene_info=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
run:
ReportTools(input.gistic, input.rubic, params.census, params.known, params.gene_info, output.tools, \
output.venn, output.genes_both, output.genes_gistic, output.genes_rubic, output.overlap, output.histogram, output.swarmplot)
ReportTools(input.gistic, input.rubic, params.census, params.known, params.gene_info,
output.tools,output.venn, output.genes_both, output.genes_gistic, output.genes_rubic, output.overlap, output.histogram, output.swarmplot)
rule report_seg:
"""Report information on the input segmentation file."""
......
#!/bin/bash
####Script to run snakemake
#conda-env create -n prioritization -f envs/plots.yaml
#conda-env update -n prioritization -f envs/plots.yaml
#source activate prioritization
conda-env create -n CNAprioritization -f envs/pipeline.yaml
conda-env update -n CNAprioritization -f envs/pipeline.yaml
source activate CNAprioritization
snakemake -p \
--use-conda
snakemake -p --use-conda
......@@ -150,8 +150,9 @@ class ReportSegmentation():
def make_seg_report(self, segmentation_file, report_file):
"""Write report."""
with open(report_file, 'w') as out:
seg_rows = ["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample", \
"Average number of CNVs per sample", "Average number of focal CNVs per sample", "Average number of deletions per sample", "Average length of CNVs (Mb)"]
seg_rows = (["Number of samples", "Smallest number of CNVs in a sample", "Largest number of CNVs in a sample",
"Average number of CNVs per sample", "Average number of focal CNVs per sample",
"Average number of deletions per sample", "Average length of CNVs (Mb)"])
seg_stats = self.get_segment_stats(segmentation_file)
seg_stats_str = [str(i) for i in seg_stats]
for i in range(len(seg_rows)):
......@@ -206,8 +207,8 @@ class ReportSegmentation():
class ReportTools:
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
def __init__(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools, \
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram, file_swarmplot):
def __init__(self, gistic_results, rubic_results, census_genes, known_genes, gene_file, file_tools,
file_venn, file_genes_both, file_genes_GISTIC, file_genes_RUBIC, file_overlap, file_histogram, file_swarmplot): #double brackets? ((...))
parsed, stats = self.calculate_stats(gistic_results, rubic_results, file_tools, census_genes, known_genes)
self.make_tool_report(file_tools, stats)
......@@ -394,7 +395,8 @@ class ReportSizes:
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = {}
row_names = ["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"]
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for size_file in size_results:
parsed_gistic = parse().gistic_results(size_file)
size, repetition = size_file.split("/")[-1].split("x")
......@@ -423,7 +425,8 @@ class ReportSizes:
nr_census.append(float(stats[7].split(" (")[0]))
nr_known.append(float(stats[8].split(" (")[0]))
plot_data = [nr_regions, avg_size, total_size, nr_genes, nr_census, nr_known]
plot_y_axis = ['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)', 'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes']
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Nr. regions with census genes', 'Nr. regions with known genes'])
for plot_nr in range(len(plot_data)):
self.plot_size_differences(plot_data[plot_nr], sizes, len(reps), plot_y_axis[plot_nr], plot_dir)
......@@ -436,7 +439,8 @@ class ReportSizes:
for size in list_sizes:
sample_label = sample_label + [size] * nr_reps * 2
df['sample_size'] = pd.Series(sample_label)
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5, aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g = sns.factorplot(x="sample_size", y="value_y_axis", col="type", data=df, kind="box", size=5,
aspect=1, palette=sns.cubehelix_palette(8, start=.5, rot=-.75, dark=.2))
g.set_axis_labels("Sample size", y_axis).set_titles("{col_name}").despine(bottom=True)
cnv_types = ["Amplifications", "Deletions"]
cnv_type = "Amplifications"
......@@ -464,7 +468,8 @@ class ReportControl:
"""Write report."""
parsed_tools, stats_tools = [], []
with open(report_file, 'w') as out:
row_names = ["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)", "Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"]
row_names = (["Control used?", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for result in control_results, nocontrol_results:
parsed_results = parse().gistic_results(result)
parsed_tools.append(parsed_results)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment