Commit 6869823f authored by Beatrice Tan
Browse files

Added script to plot precision and recall with different sample sizes.

parent d320be6a
import matplotlib.pyplot as plt
import seaborn as sns
# Module-level side effect: select seaborn's 'whitegrid' style when this module is loaded.
sns.set_style('whitegrid')
def ROC_curve(gistic_subset, gistic_truth, rubic_subset, rubic_truth, plot_file, list_sizes):
    """Plot precision against recall for two tools, per replicate and averaged.

    For each (subset files, truth files) pair -- the ``gistic_*`` pair first,
    then the ``rubic_*`` pair -- the per-size recall and precision lists are
    collected with ``get_list_rates`` and averaged with ``calculate_averages``.
    Two figures are then written with ``plot_ROC``:

    * the individual values, saved to ``plot_file``;
    * the per-size averages, saved to ``plot_file + "_avg.png"``.

    Args:
        gistic_subset: Subset result files of the first tool.
        gistic_truth: Truth files matching ``gistic_subset`` index by index.
        rubic_subset: Subset result files of the second tool.
        rubic_truth: Truth files matching ``rubic_subset`` index by index.
        plot_file: Output path of the per-replicate figure.
        list_sizes: Sample sizes used to group the files.
    """
    tool_inputs = [(gistic_subset, gistic_truth), (rubic_subset, rubic_truth)]

    per_file_recall, per_file_precision = [], []
    mean_recall, mean_precision = [], []

    for subset_files, truth_files in tool_inputs:
        recalls, precisions = get_list_rates(subset_files, truth_files, list_sizes)
        per_file_recall.append(recalls)
        per_file_precision.append(precisions)

        recall_means, precision_means = calculate_averages(recalls, precisions)
        mean_recall.append(recall_means)
        mean_precision.append(precision_means)

    # The order of the entries matters: plot_ROC colours the first tool differently.
    plot_ROC(per_file_recall, per_file_precision, plot_file, list_sizes, False)
    plot_ROC(mean_recall, mean_precision, plot_file + "_avg.png", list_sizes, True)
def get_list_rates(subset_files, truth_files, list_sizes):
    """Group the recall and precision of every file pair by sample size.

    The sample size of a file is the text between ``"Size"`` and ``".Rep"``
    in its path. Each subset file is scored against the truth file at the
    same index with ``calculate_precision_recall``; the scores are stored in
    the bucket(s) of ``list_sizes`` whose string form equals that text.
    Files whose size matches no entry are scored but not stored.

    Args:
        subset_files: Paths of the subset result files.
        truth_files: Paths of the truth files, aligned with ``subset_files``.
        list_sizes: Sample sizes; one output bucket is made per entry.

    Returns:
        A tuple ``(recall_by_size, precision_by_size)``; each is a list with
        one list of values per entry of ``list_sizes``, in the same order.
    """
    n_sizes = len(list_sizes)
    recall_by_size = [[] for _ in range(n_sizes)]
    precision_by_size = [[] for _ in range(n_sizes)]

    for file_idx, subset_path in enumerate(subset_files):
        size_label = subset_path.split("Size")[1].split(".Rep")[0]
        recall, precision = calculate_precision_recall(subset_path, truth_files[file_idx])
        for bucket, size in enumerate(list_sizes):
            if str(size) != size_label:
                continue
            recall_by_size[bucket].append(recall)
            precision_by_size[bucket].append(precision)

    return recall_by_size, precision_by_size
def _count_bed_regions(bed_file):
    """Count the data lines of *bed_file* by the content of their fifth column.

    Lines starting with ``"track"`` (headers) and blank lines are skipped.

    Returns:
        A tuple ``(unmatched, matched)``: the number of lines whose fifth
        tab-separated column is exactly ``'.'`` and the number of all other
        data lines.

    Raises:
        IndexError: If a non-blank data line has fewer than five columns.
    """
    unmatched, matched = 0, 0
    with open(bed_file, 'r') as handle:
        for region in handle:
            if region.startswith("track") or not region.strip():
                continue
            # Drop the line terminator so a '.' in the last column still compares equal.
            fields = region.rstrip("\r\n").split("\t")
            if fields[4] == '.':
                unmatched += 1
            else:
                matched += 1
    return unmatched, matched


def calculate_precision_recall(bed_subset, bed_truth):
    """Compute recall and precision from two annotated BED files.

    NOTE(review): presumably both files are overlap-annotated output in which
    a ``'.'`` in column five marks a region without a counterpart in the
    other file -- confirm against the step that produces them.

    Counting rules:
        * ``bed_subset``: a ``'.'`` line is a false positive, any other data
          line is a true positive.
        * ``bed_truth``: a ``'.'`` line is a false negative; other lines are
          ignored (true positives are taken from ``bed_subset`` only).

    Args:
        bed_subset: Path of the annotated subset result file.
        bed_truth: Path of the annotated truth file.

    Returns:
        A tuple ``(recall, precision)`` of floats. A ratio whose denominator
        is zero (no usable lines) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    FP, TP = _count_bed_regions(bed_subset)
    FN, _ = _count_bed_regions(bed_truth)

    recall = TP / float(TP + FN) if TP + FN else 0.0
    precision = TP / float(TP + FP) if TP + FP else 0.0
    return recall, precision
def calculate_averages(list_recall, list_precision):
    """Average the recall and precision values collected for each sample size.

    Args:
        list_recall: One list of recall values per sample size.
        list_precision: One list of precision values per sample size.

    Returns:
        A tuple ``(avg_recall, avg_precision)`` of lists holding the
        arithmetic mean of each inner list, in input order.

    Raises:
        ZeroDivisionError: If any inner list is empty.
    """
    def mean(values):
        # float() keeps true division on interpreters with integer '/'.
        return sum(values) / float(len(values))

    recall_means = [mean(group) for group in list_recall]
    precision_means = [mean(group) for group in list_precision]
    return recall_means, precision_means
def plot_ROC(list_recall, list_precision, plot_file, list_sizes, avg):
    """Scatter precision against recall and save the figure to *plot_file*.

    Args:
        list_recall: One entry per tool; each entry holds one item per sample
            size. As called from ``ROC_curve`` an item is a list of values
            (``avg`` false) or a single averaged value (``avg`` true).
        list_precision: Precision values with the same layout as
            ``list_recall``.
        plot_file: Path the figure is written to.
        list_sizes: Sample sizes; only their count is used, to size the
            colour palettes.
        avg: If true, use the y-range 0.7-1 and larger points; otherwise the
            y-range 0.5-1.02 and smaller points.

    The first tool is drawn with a customised cubehelix palette and every
    further tool with the default one; within a tool the colour is picked by
    the position of the sample size.
    """
    plt.figure()
    try:
        plt.xlim([0, 1])
        if avg:
            plt.ylim([0.7, 1])
            point_size = 50
        else:
            plt.ylim([0.5, 1.02])
            point_size = 20

        n_sizes = len(list_sizes)
        for i, recall in enumerate(list_recall):
            precision = list_precision[i]
            # Build the palette once per tool instead of once per plotted point.
            if i == 0:
                palette = sns.cubehelix_palette(n_sizes, start=.5, rot=-.75, dark=.2)
            else:
                palette = sns.cubehelix_palette(n_sizes)
            for j in range(len(recall)):
                plt.scatter(recall[j], precision[j], color=palette[j], s=point_size)

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.savefig(plot_file)
    finally:
        # Close the figure even when plotting or saving fails, so it does not stay open.
        plt.close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment