...
 
Commits (3)
......@@ -18,7 +18,6 @@ include: "rules/GenePrioritization.smk"
include: "rules/ComparisonRegions.smk"
include: "rules/Circos.smk"
include: "rules/SampleSizes.smk"
#include: "rules/UseControl.smk"
include: "rules/ComparisonSettings.smk"
#Directory to save all files.
......@@ -45,8 +44,9 @@ rule all:
"""Define desired output from pipeline."""
input:
#"Settings/Report.txt",
"PipelineResults.html",
"ComparisonResults.html"
"Samplesize/Report.txt"
#"PipelineResults.html",
#"GO/comparison.txt",
rule help:
"""Print list of all targets with help."""
......
......@@ -2,7 +2,7 @@
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
workdir: /home/beatrice/CNA_99_genegistic
workdir: /home/beatrice/CNA_analysis #CNA_99_genegistic
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
......@@ -32,5 +32,5 @@ comparison_settings: ["-brlen 0.7 -genegistic 1",
"-brlen 0.7 -genegistic 0"]
#Settings for sample size differences
sizes: [20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
......@@ -4,10 +4,10 @@ rule do_GO_analysis:
gene_list="Reports/Genes_{tool}.txt"
output:
table="GO/Enriched_GOs_{tool}.txt", #Add gene names to table
plot="GO/Enriched_GOs_{tool}.png"
#plot="GO/Enriched_GOs_{tool}.png"
params:
organism="hsapiens", #default is human
ontology="BP", #MF, BP, CC or all
ontology="all", #MF, BP, CC or all
gene_ID_to_GO=os.path.join(workflow.basedir, config["ID_to_GO"]) if config["ID_to_GO"].startswith("input_files") else config["ID_to_GO"]
benchmark:
"Benchmarks/topGO." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
......
......@@ -128,6 +128,7 @@ def get_stats(list_regions, used_tool):
return_info = []
for cnv_type in 'amp', 'del':
tot_size, nr_genes, nr_reg, known_count, census_count, focal_count = 0, 0, 0, 0, 0, 0
known_census_count, known_focal = 0,0
all_genes = []
driver_density = []
for alt in list_regions:
......@@ -135,22 +136,23 @@ def get_stats(list_regions, used_tool):
nr_reg += 1
reg_size = int(alt[2]) - int(alt[1])
tot_size += reg_size
if reg_size < 3000000:
if reg_size < 10000000:
focal_count += 1
if alt[5] != []:
known_focal += 1
all_genes = all_genes + alt[7]
if alt[5] != []:
known_count += 1
if alt[6] != []:
census_count += 1
#print(alt[7])
drivers = 1 / len(alt[7]) if alt[7] != [] else 0
driver_density.append(drivers)
nr_genes = len(list(set(all_genes))) #unique genes only
stats = calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density)
stats = calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density, known_focal)
return_info.append(stats)
return return_info
def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density):
def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density, known_focal):
"""Calculate average & total numbers for report on recurrent regions."""
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #average size in kb
......@@ -158,8 +160,8 @@ def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal
avg_driver_density = round(sum(driver_density) / nr_reg, 2)
stats = [used_tool, type_name, nr_reg, avg_size, round_tot_size, nr_genes, avg_driver_density]
stats = list(map(str, stats))
for count in focal_count, known_count, census_count:
for count in focal_count, known_count, census_count, known_focal:
percent = (count / nr_reg * 100) if count != 0 else 0
stat_str = str(count) + " (" + str(round(percent, 2)) + "%)"
stat_str = str(count) + " (" + str(int(percent)) + "%)"
stats.append(stat_str)
return stats
......@@ -20,11 +20,6 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
for tool in 'GISTIC', 'RUBIC':
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool, ref_genome)
#if tool not in all_results.keys():
# all_results[tool] = {}
# all_results[tool][size] = [parsed_results]
#else:
# all_results[tool][size] = all_results[tool][size] + [parsed_results]
stats_results = get_stats(parsed_results, size)
for stat_list in stats_results[0], stats_results[1]:
converted_stats = [tool] + stat_list[0:2]
......@@ -50,8 +45,10 @@ def overlap_genes(all_results, report_file):
def make_plots(list_stats, reps, sizes, plot_dir):
plot_y_axis = (['Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
'Number of genes', 'Number of known regions', 'Number of census regions'])
plot_y_axis = (["No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions",
"No. census regions", "No. known focal regions"])
# 'Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
# 'Number of genes', 'Number of known regions', 'Number of census regions'])
df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
for plot_name in plot_y_axis:
plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
......@@ -63,7 +60,7 @@ def plot_size_differences(df_stats, value_y_axis, list_sizes, nr_reps, plot_dir)
sns.set_style("whitegrid")
g = sns.factorplot(x="Sample size", y=value_y_axis, col="Type", hue="Tool", data=df_stats, kind="box",
size=5, aspect=1, palette=["#5975A4","#5F9E6E"], order=list(map(str, list_sizes)))
if value_y_axis in ['Average size of regions (Kb)', 'Total size (Mb)', 'Number of genes']:
if value_y_axis in ["Avg. size (Kb)", "Total size (Mb)", "No. genes"]:
label_y_axis = 'Log ' + value_y_axis[0].lower() + value_y_axis[1:]
g.fig.get_axes()[0].set_yscale('log')
else:
......
......@@ -29,14 +29,15 @@ def make_report(gistic_results, rubic_gain_results, rubic_loss_results,
parsed.append(parsed_results)
stats_results = get_stats(parsed_results, tool)
stats.append(stats_results)
make_table_regions(parsed[0:2], file_regions) #make table with all recurrent regions
make_table_regions(parsed[0:2], file_regions, ref_genome) #make table with all recurrent regions
make_tool_report(file_tools, stats) #make tool report
extract_gene_lists(parsed, file_genes_GISTIC, file_genes_RUBIC, file_genes_both, known_genes, file_venn, ref_genome) #make gene files and venn overlap plot
plot_histogram_sizes(parsed, file_swarmplot) #plot histogram with the sizes of the regions
def make_tool_report(file_tools, stats_tools):
"""Make a report of the results from GISTIC and RUBIC."""
row_names = ["Tool", "Type", "No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions", "No. census regions"]
row_names = ["Tool", "Type", "No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions",
"No. census regions", "No. known focal regions"]
with open(file_tools, 'w') as out:
for i in range(len(row_names)):
list_out = [row_names[i]]
......@@ -45,10 +46,10 @@ def make_tool_report(file_tools, stats_tools):
list_out = list(map(str, list_out))
out.write("\t".join(list_out) + "\n")
def make_table_regions(parsed_tools, file_regions):
def make_table_regions(parsed_tools, file_regions, ref_genome):
"""Make a table with all recurent regions detected by both tools."""
with open(file_regions, 'w') as out:
header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "All genes"]
header = ["Method", "Chr", "Start", "End", "Type", "Negative log10 q-value", "Known genes", "Census genes", "Total number of genes"]
out.write("\t".join(header) + "\n")
for tool in parsed_tools:
tool_name = "GISTIC" if tool == parsed_tools[0] else "RUBIC"
......@@ -58,8 +59,12 @@ def make_table_regions(parsed_tools, file_regions):
q_vals = ", ".join(cnv[4]) if type(cnv[4]) == tuple else cnv[4]
begin_row = cnv[0:3] + [cnv_type, str(q_vals)]
out.write("\t".join(begin_row))
for list_genes in range(5,8):
out.write("\t" + ", ".join(cnv[list_genes]))
for list_genes in range(5,7):
out_genes = gene_ID_to_name(cnv[list_genes], ref_genome)
out.write("\t" + ", ".join(out_genes))
out.write("\t" + str(len(cnv[7])))
#for list_genes in range(5,8):
#out.write("\t" + ", ".join(cnv[list_genes]))
out.write("\n")
def extract_gene_lists(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes, out_venn, ref_genome):
......@@ -107,17 +112,16 @@ def venn3_overlap(gene_lists, known_genes_file, out_venn, ref_genome):
plt.subplots(figsize=(7, 7))
venn_sets = [set(gene_lists[0]), set(gene_lists[1]), set(known_genes)]
c = venn3(venn_sets, ('GISTIC', 'RUBIC', ' Known genes'))
#c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E')
#c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F')
#c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B')
#c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1)
#c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1)
#c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1)
#try:
# c.get_patch_by_id('111').set_color('#B55D60')
# c.get_patch_by_id('111').set_alpha(1)
#except:
# pass #too small overlap from three groups to show the overlapping region
try:
c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('100').set_alpha(1)
c.get_patch_by_id('010').set_color('#5F9E6E'), c.get_patch_by_id('010').set_alpha(1)
c.get_patch_by_id('110').set_color('#5C888B'), c.get_patch_by_id('110').set_alpha(1)
c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('001').set_alpha(1)
c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('101').set_alpha(1)
c.get_patch_by_id('011').set_color('#748A8F'), c.get_patch_by_id('011').set_alpha(1)
c.get_patch_by_id('111').set_color('#B55D60'), c.get_patch_by_id('111').set_alpha(1)
except:
pass #too small overlap from three groups to show the overlapping region
plt.savefig(out_venn, dpi=300)
def plot_histogram_sizes(parsed_results, plot_file):
......
......@@ -40,21 +40,20 @@ test.stat <- new("weightCount", testStatistic = GOFisherTest, name = "Fisher tes
resultWeight <- getSigGroups(GOdata, test.stat)
#Make table with results
allRes <- GenTable(GOdata, classic = resultFisher,
#KS = resultKS,
allRes <- GenTable(GOdata, classic = resultFisher, #KS = resultKS,
weight = resultWeight,
orderBy = "weight", ranksOf = "classic") #, topNodes = 50)
orderBy = "weight", ranksOf = "classic", topNodes = 50)
write.table(allRes, file=snakemake@output[[1]], quote=FALSE, sep="\t", eol = "\n")
#Visualize GO term relationships
png(snakemake@output[[1]])
#png(snakemake@output[[2]])
#showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo ='all')
printGraph(GOdata, resultFisher, firstSigNodes = 10, fn.prefix = "tGO", useInfo = "all", pdfSW = TRUE)
dev.off()
#printGraph(GOdata, resultFisher, firstSigNodes = 10, fn.prefix = "tGO", useInfo = "all", pdfSW = TRUE)
#dev.off()
#Print most significant genes
ann.genes <- genesInTerm(GOdata)
str(ann.genes)
fisher.go <- names(score(resultFisher))[1:5]
fisher.ann.genes <- genesInTerm(sampleGOdata, whichGO=fisher.go)
print(fisher.ann.genes)
#ann.genes <- genesInTerm(GOdata)
#str(ann.genes)
#fisher.go <- names(score(resultFisher))[1:5]
#fisher.ann.genes <- genesInTerm(GOdata, whichGO=fisher.go)
#print(fisher.ann.genes)