Commit 7f42daed authored by BeatriceTan's avatar BeatriceTan
Browse files

Added P-value comparison.

parent 7d081506
......@@ -43,9 +43,9 @@ rule all:
"""Define desired output from pipeline."""
......@@ -8,7 +8,8 @@ rule report_tools:
gistic="GISTIC/all_lesions.conf_" + config["gistic_precision"] + ".txt",
......@@ -17,7 +18,8 @@ rule report_tools:
params: #select input files from repository or own input files
census=os.path.join(workflow.basedir, config["census_genes"]) if config["census_genes"].startswith("input_files") else config["census_genes"],
known=os.path.join(workflow.basedir, config["prev_found_genes"]) if config["prev_found_genes"].startswith("input_files") else config["prev_found_genes"],
......@@ -27,7 +29,8 @@ rule report_tools:
ReportTools.make_report(input.gistic, input.rubic_gain, input.rubic_loss,
params.census, params.known, params.ref, params.biomart_info,, output.table_regions, output.venn, output.overlap, output.swarmplot,
output.genes_both, output.genes_gistic, output.genes_rubic)
output.genes_both, output.genes_gistic, output.genes_rubic,
input.overlap, output.pvals)
rule bed_intersect:
"""Intersect the recurrent regions detected by RUBIC and GISTIC2.0."""
......@@ -99,7 +99,6 @@ def gene_file_IDs(gene_file, top_nr):
list_genes = list_genes[0:top_nr]
return list_genes
def get_stats(list_regions, used_tool):
"""Calculate stats on the results from GISTIC2 and RUBIC."""
return_info = []
......@@ -117,11 +116,12 @@ def get_stats(list_regions, used_tool):
if alt[6] != []:
census_count += 1
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #avg size in kb
stats = [used_tool, type_name, nr_reg, avg_size, tot_size / 1000000, nr_genes] #tot size in mb
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #average size in kb
tot_size = tot_size / 1000000 #total size in mb
stats = [used_tool, type_name, nr_reg, avg_size, tot_size, nr_genes]
stats = list(map(str, stats))
for count in known_count, census_count:
percent = (count / len(list_regions) * 100) if count != 0 else 0
percent = (count / nr_reg * 100) if count != 0 else 0
stat_str = str(count) + " (" + str(round(percent, 2)) + "%)"
......@@ -41,16 +41,10 @@ def get_segment_stats(segmentation_file):
def parse_segments(segmentation_file):
"""Parse the input segmentation file and return a dictionary with number of CNVs and total CNV length per sample."""
stat_dict = OrderedDict()
minimal_seg = 0
maximal_seg = 0
with open(segmentation_file, 'r') as lines:
for line in lines:
sample, chrom, start, end, num_prob, seg_CN = line.strip().split("\t")
if float(seg_CN) > 0:
maximal_seg = float(seg_CN) if float(seg_CN) > maximal_seg else maximal_seg
if float(seg_CN) < 0:
minimal_seg = float(seg_CN) if float(seg_CN) < minimal_seg else minimal_seg
if sample not in stat_dict:
stat_dict[sample] = [0, 0, 0, 0]
cnv_length = int(end) - int(start)
......@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
import os.path
import pyensembl
from ParseResults import parse, get_stats, install_ensembl
......@@ -13,7 +13,8 @@ from collections import OrderedDict
def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
census_genes, known_genes, ref_genome, biomart_file,
file_tools, file_regions, file_venn, file_overlap, file_swarmplot,
file_genes_both, file_genes_GISTIC, file_genes_RUBIC):
file_genes_both, file_genes_GISTIC, file_genes_RUBIC,
bed_overlap, file_pvalues):
"""Make a report and gene file from the analyses with GISTIC2 and RUBIC"""
parsed, stats = [], []
......@@ -27,7 +28,7 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap)
def make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, file_genes_both,
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap):
file_venn, file_swarmplot, file_regions, biomart_file, file_overlap, bed_overlap, file_pvalues):
table_regions(parsed, file_regions)
make_tool_report(file_tools, stats)
......@@ -37,7 +38,8 @@ def make_files(parsed, stats, file_tools, file_genes_GISTIC, file_genes_RUBIC, f
#compare_known_regions(known_genes, file_overlap, biomart_file,
# gistic_results, rubic_gain_results, rubic_loss_results)
plot_sizes(parsed, file_swarmplot)
compare_overlapping_regions(parsed, biomart_file, file_overlap)
#compare_overlapping_regions(parsed, biomart_file, file_overlap)
overlapping_pvalues(bed_overlap, parsed, file_pvalues)
def make_tool_report(file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
......@@ -136,6 +138,8 @@ def venn_overlap(gene_lists, out_venn): #maybe add known genes?
plt.savefig(out_venn, dpi=300)
def compare_overlapping_regions(parsed_results, biomart_file, overlap_file):
"""Compare the regions containing known driver genes"""
location_dict = {}
......@@ -189,36 +193,30 @@ def get_location_gene(biomart_file, gene_of_interest):
# continue
return location, gene_name
def get_location_gistic(gistic_results, location_gene): #more regions overlap: extract one with longest overlap
"""Get location of the CNV detected by GISTIC2 harboring the gene of interest"""
loc = ['N/A', 'N/A', 'N/A']
chrom_gene, start_gene, end_gene = location_gene
with open(os.path.join(gistic_results, "all_lesions.conf_75.txt"), 'r') as cnvs:
for cnv in cnvs:
cnv_loc = cnv.split("\t")[3]
cnv_loc_stripped = cnv_loc.split("(probes")[0]
chrom, bp = cnv_loc_stripped.split(":")
chrom = chrom.strip("chr")
if chrom == chrom_gene:
start, end = bp.split("-")
if int(start) <= int(end_gene) and int(end) >= int(start_gene):
if loc == ['N/A', 'N/A', 'N/A']:
loc = [chrom, start, end]
#loc = loc, chrom, start, end
return loc
def get_location_rubic(rubic_results, gene):
loc = ['N/A', 'N/A', 'N/A']
for file_type in "gains.txt", "losses.txt":
with open(os.path.join(rubic_results, file_type), 'r') as cnvs:
for cnv in cnvs:
if gene in cnv.strip().split("\t")[5]:
chrom, start, end, pleft, pright, genes = cnv.strip().split("\t")
if loc == ['N/A', 'N/A', 'N/A']:
loc = [chrom, start, end]
#loc = loc, chrom, start, end
return loc
def overlapping_pvalues(bed_file, parsed_results, plot_file):
"""Parse the P-values of overlapping regions between GISTIC2.0 and RUBIC."""
pvals = []
with open(bed_file, 'r') as overlap:
for line in overlap:
g_chr, g_start, g_end, g_nr, r_chr, r_start, r_end, r_nr, bp_overlap = line.strip().split("\t")
for gistic, rubic in parsed_results:
for region in gistic:
if region[0:3] == [g_chr, g_start, g_end]:
g_pval = region[4]
for region in rubic:
if region[0:3] == [r_chr, r_start, r_end]:
r_left_pv, r_right_pv = region[4]
pvals.append((g_pval, r_left_pv))
labels = ["P-value GISTIC2.0", "P-value RUBIC"]
pval_df = pd.DataFrame(pvals, columns=labels)
plot_correlation(pval_df, plot_file)
def plot_correlation(df_pvals, plot_file):
"""Plot correlation between P-values."""
g = sns.jointplot("P-value GISTIC2.0", "P-value RUBIC", data=df_pvals, kind="reg",
xlim=(0, 60), ylim=(0, 12), color="r", size=7)
pr = pearsonr(df_pvals["P-value GISTIC2.0"], df_pvals["P-value RUBIC"])
plt.text(0.9, 0.1, pr)
plt.savefig(plot_file, dpi=300)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment