Commit 1b50acb5 authored by Beatrice Tan

Updated reports.

parent a5de6c79
......@@ -14,7 +14,7 @@ bash Miniconda3-latest-Linux-x86_64.sh
```
3. Create an environment for the pipeline and install the required packages:
```
conda install -c bioconda snakemake=4.4.0
conda install -c bioconda snakemake=4.4.0 circos=0.69.6
conda install -c conda-forge matplotlib-venn=0.11.5
conda install -c edurand pyensembl=1.1.0
```
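If the environment itself has not already been created in a step truncated from this diff, a typical sequence would be the following; the environment name `cnv-pipeline` is only an example:
```
conda create -n cnv-pipeline python=3
conda activate cnv-pipeline   # or `source activate cnv-pipeline` on older conda versions
```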
......
......@@ -37,10 +37,11 @@ onerror:
rule all:
"""Define desired output from pipeline."""
input:
"GO/comparison.txt",
"Reports/Circos/RecurrentRegions.png"
#"GO/comparison.txt",
#"Input/Segments_tumor.txt",
#"Reports/Control.txt",
"Reports/Tools.txt",
#"Reports/Tools.txt",
#"Reports/Results.html",
#"Reports/Control.txt",
......
......@@ -8,7 +8,7 @@ rule go_analysis:
organism="hsapiens", #default is human
ontology="BP" #MF, BP, CC or all
benchmark:
"Benchmarks/topGO." + str(datetime.datetime.now()) + ".txt"
"Benchmarks/topGO." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper:
"file:" + workflow.basedir + "/wrappers/topgo"
......@@ -28,7 +28,7 @@ def compare_go(GO_files, output_file):
go_terms = get_go(GO_files)
overlap = set(go_terms[0]).intersection(set(go_terms[1]))
with open(output_file, 'w') as out:
out.write("Percentage overlapping: " + str(int(len(overlap) / 50 * 100)))
out.write("Percentage overlapping: " + str(int((len(overlap) / 50 * 100))))
out.write("\n".join(list(overlap)))
def get_go(GO_files):
......
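For reference, compare_go divides the overlap by a hard-coded 50, presumably the number of top GO terms requested per tool; a hypothetical helper that derives the denominator from the inputs instead could look like this:
```
def overlap_percentage(terms_a, terms_b):
    """Hypothetical helper: percentage of terms_a also found in terms_b."""
    if not terms_a:
        return 0.0
    overlap = set(terms_a).intersection(terms_b)
    return len(overlap) / len(set(terms_a)) * 100
```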
......@@ -67,7 +67,7 @@ def split_normal_tumor(all_samples, out_tumor, out_normal):
normal.write(header)
for line in old:
sample = line.split("\t")[0]
type_ID = sample.split("-")[3]
type_ID = sample.split("-")[3] #.strip("A")
if type_ID == "06A": #TM: Metastatic
tumor.write(line)
elif type_ID == "01A": #TP: Primary Solid Tumor
......
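The split_normal_tumor logic above keys on the fourth field of the TCGA sample barcode, which holds the sample type code plus a vial letter; a small illustration with a made-up barcode:
```
sample = "TCGA-AB-1234-01A"        # made-up barcode
type_ID = sample.split("-")[3]     # -> "01A": sample type code plus vial letter
# "01A" = primary solid tumor (TP), "06A" = metastatic (TM), "11A" = solid tissue normal
```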
......@@ -2,6 +2,7 @@ from Rubic import MarkerFile
from Reports import ReportTools
import os.path
import datetime
from Circos import InputCircos
rule install_gistic:
"""Install GISTIC2 to a directory of choice."""
......@@ -23,7 +24,7 @@ rule run_gistic:
ref_file="",
extra=""
benchmark:
"Benchmarks/GISTIC2." + str(datetime.datetime.now()) + ".txt" #space between date and time!
"Benchmarks/GISTIC2." + str(datetime.datetime.now()).replace(" ", "_") + ".txt" #space between date and time!
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
......@@ -49,10 +50,38 @@ rule run_rubic:
fdr="0.25",
genefile=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"]
benchmark:
"Benchmarks/RUBIC." + str(datetime.datetime.now()) + ".txt"
"Benchmarks/RUBIC." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper:
"file:" + workflow.basedir +"/wrappers/rubic"
rule circos_input:
"""Make input files for making a circos diagram."""
input:
seg="Reports/Segments.txt",
gistic="GISTIC/",
rubic_gains="RUBIC/gains.txt",
rubic_losses="RUBIC/losses.txt"
output:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
run:
InputCircos(input.seg, input.gistic, input.rubic_gains, input.rubic_losses, output.seg, output.gistic, output.rubic)
rule make_circos:
"""Make circos diagram of recurrent regions in RUBIC and GISTIC2.0"""
input:
seg="Reports/Circos/Segments.txt",
gistic="Reports/Circos/GISTIC_results.txt",
rubic="Reports/Circos/RUBIC_results.txt",
output:
"Reports/Circos/RecurrentRegions.png"
params:
workflow.basedir + "/scripts/circos/circos.conf"
shell:
"circos -conf {params[0]} -outputfile {output[0]} -param cnv_file={input.seg} -param gistic_file={input.gistic} -param rubic_file={input.rubic}"
rule report_tools:
"""Report the differences in calls between GISTIC2 and RUBIC."""
input:
......@@ -72,12 +101,10 @@ rule report_tools:
gene_info=os.path.join(workflow.basedir, config["biomart_genes"]) if config["biomart_genes"].startswith("input_files") else config["biomart_genes"],
ref=config["reference"]
run:
ReportTools(input.gistic, input.rubic, params.census, params.known, params.gene_info, params.ref
ReportTools(input.gistic, input.rubic, params.census, params.known, params.gene_info, params.ref,
output.tools, output.venn, output.genes_both, output.genes_gistic, output.genes_rubic, output.overlap, output.swarmplot)
from snakemake.utils import report
rule report:
"""Write html report on segmentation file."""
input:
......@@ -85,11 +112,16 @@ rule report:
output:
html="Reports/Results.html"
run:
from snakemake.utils import report
with open(input.seg, 'r') as seg:
nr_samples = seg.readline().split("\t")[1].strip()
report("""
=======================
====================================================
Report on the results of the prioritization pipeline
=======================
====================================================
seg_
In total, {nr_samples} samples were present in the raw segmentation file.
See: Table T1_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", T1=input[0])
#**input)
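For context on the T1=input[0] change above: snakemake.utils.report embeds every keyword-argument file in the HTML and exposes it as an RST link target named after the keyword, so T1_ in the text resolves to the first input file. A minimal, self-contained sketch with toy paths (not from the pipeline):
```
from snakemake.utils import report

# 'T1' becomes the RST link target T1_ pointing at the embedded table.txt
report("""
Toy report
==========

See the attached table: T1_
""", "toy.html", T1="table.txt")
```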
import numpy as np
import os.path
import pyensembl
ensembl = pyensembl.EnsemblRelease(75)
from math import log10
from collections import OrderedDict
class parse:
"""Functions to parse the outputs from tools"""
def gistic_results(self, result_dir):
"""Create a dictionary with the recurrent regions called by GISTIC2.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, qval, [gene, gene])}"""
cnv_dict = {}
cnv_dict = OrderedDict()
with open(os.path.join(result_dir, "all_lesions.conf_75.txt"), 'r') as lesions:
lesions.readline()
for lesion in lesions:
......@@ -20,7 +22,7 @@ class parse:
chrom, bp = CNV_id.split(":")
start, end = bp.split("-")
cnv_type = 'amp' if stats[0].startswith("Amplification") else 'del'
qval = log10(float(stats[5])) #Convert q-value to log10(q-value)
qval = log10(float(stats[5])) * -1 #Convert q-value to log10(q-value)
#gene_file = os.path.join(result_dir, cnv_type + "_genes.conf_75.txt")
#list_genes = self.gistic_genes(gene_file, CNV_id)
list_genes = self.genes_locus(chrom, int(start), int(end))
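The * -1 change above stores GISTIC q-values on a -log10 scale, so more significant regions get larger scores; a quick worked check:
```
from math import log10

qval = 0.001
score = log10(qval) * -1    # -log10(0.001) = 3.0; q = 0.05 would give ~1.3
```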
......@@ -54,7 +56,7 @@ class parse:
def rubic_results(self, results_gains, results_losses):
"""Create a dictionary with the recurrent regions called by RUBIC.
Format: {"chr#:#-#" : (chrom, start, end, cnv_type, (qval_left, qval_right), [gene, gene])}"""
cnv_dict = {}
cnv_dict = OrderedDict()
for result_file in results_gains, results_losses:
with open(result_file, 'r') as lesions:
lesions.readline()
......@@ -106,7 +108,7 @@ class parse:
def biomart_gene_info(self, biomart_file):
"""Parse file with all human genes from biomart into dictionary"""
biomart_dict = {}
biomart_dict = OrderedDict()
with open(biomart_file, 'r') as biomart:
biomart.readline()
for line in biomart:
......@@ -165,7 +167,7 @@ class stats:
def get_precision_regions(self, known_regions): #What does this do?
"""Give an indication of the precision of a call based on capturing only the gene as the recurrent region."""
percentages = []
new_dict = {}
new_dict = OrderedDict()
for region in known_regions.keys():
genes = known_regions[region]
ends, starts = [], []
......
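Several hunks above swap plain {} for OrderedDict(); on Python versions before 3.7 a plain dict does not guarantee insertion order, so OrderedDict keeps regions in the order they were parsed. A minimal illustration:
```
from collections import OrderedDict

regions = OrderedDict()
regions["chr1:100-200"] = ("1", 100, 200, "amp")
regions["chr2:50-80"] = ("2", 50, 80, "del")
# iteration order matches insertion order, so downstream reports keep
# the order of the parsed result files
assert list(regions) == ["chr1:100-200", "chr2:50-80"]
```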
......@@ -8,6 +8,7 @@ from scipy.stats import ttest_ind
import os.path
import pyensembl
from ParseResults import parse, stats
from collections import OrderedDict
def install_ensembl(reference_genome):
"""Import ensembl if needed and load correct release based on reference genome."""
......@@ -58,7 +59,7 @@ class ReportSegmentation:
def parse_segments(self, segmentation_file):
"""Parse the input segmentation file and return a dictionary with number of CNVs and total CNV length per sample."""
stat_dict = {}
stat_dict = OrderedDict()
with open(segmentation_file, 'r') as lines:
lines.readline()
for line in lines:
......@@ -92,6 +93,7 @@ class ReportTools:
rubic_dir = rubic_gain_file.split("/gains.txt")[0] #get directory with RUBIC results from the gain file path.
install_ensembl(ref_genome)
parsed, stats = self.calculate_stats(gistic_dir, rubic_dir, file_tools, census_genes, known_genes)
self.overlap_most_sign(parsed)
self.make_tool_report(file_tools, stats)
all_genes = self.make_genefiles(parsed, file_genes_GISTIC, file_genes_RUBIC)
......@@ -113,8 +115,8 @@ class ReportTools:
parsed_tools.append(parsed_results)
stats_results = stats().calculate_stats(parsed_results, census_genes, known_genes, tool)
stats_tools.append(stats_results)
print(parsed_tools)
print(stats_tools)
#print(parsed_tools)
#print(stats_tools)
return parsed_tools, stats_tools
def make_tool_report(self, file_tools, stats_tools):
......@@ -126,6 +128,23 @@ class ReportTools:
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
def overlap_most_sign(self, parsed_results):
"""Examine whether most significant regions (highest log10(qvalue)) are also found by other tool."""
for tool in parsed_results:
tool_qvals = {}
for cnv in tool.keys():
qval = tool[cnv][4]
if type(qval) == tuple: #rubic dict
qval = qval[0]
qval = float(qval)
tool_qvals[qval] = tool[cnv]
sorted_qvals = sorted(tool_qvals, reverse=True)
print(sorted_qvals)
for top in range(0,10):
top_cnv = tool_qvals[sorted_qvals[top]]
print(top_cnv)
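The new overlap_most_sign keys a dict by q-value, which silently drops regions that share the same score and assumes at least ten entries exist; a hypothetical variant that sorts the items directly (same value layout as the parsed dictionaries) avoids both issues:
```
def top_regions(tool_dict, n=10):
    """Hypothetical helper: the n regions with the largest -log10(q) scores."""
    def score(item):
        qval = item[1][4]              # (chrom, start, end, cnv_type, qval, genes)
        if isinstance(qval, tuple):    # RUBIC stores (qval_left, qval_right)
            qval = qval[0]
        return float(qval)
    return sorted(tool_dict.items(), key=score, reverse=True)[:n]
```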
def make_genefiles(self, parsed_results, out_GISTIC, out_RUBIC):
"""Make files with all genes reported by GISTIC2 and RUBIC."""
genes_all = []
......@@ -260,7 +279,7 @@ class ReportSizes:
def make_sizes_report(self, size_results, census_genes, known_genes, reps, report_file, plot_dir):
"""Make a report of the results produced using input files with different sample sizes."""
with open(report_file, 'w') as out:
dict_stats = {}
dict_stats = OrderedDict()
row_names = (["Size", "Type", "Nr. regions", "Avg. size (Kb)", "Total size (Mb)",
"Nr. genes", "Nr. regions with census genes", "Nr. regions with known genes"])
for size_file in size_results:
......