Commit 397c9d9d authored by Beatrice Tan's avatar Beatrice Tan

Updated GO scripts to use provided input file to map GO terms instead of...

Updated GO scripts to use provided input file to map GO terms instead of frequently failed biomart download.
parent e70aeb29
......@@ -42,7 +42,10 @@ onerror:
rule all:
"""Define desired output from pipeline."""
input:
"Samplesize/Report.txt"
#"Samplesize/Report.txt",
#"Reports/Tools.txt",
"GO/comparison.txt"
# "Samplesize/Report.txt"
rule help:
"""Print list of all targets with help."""
......
......@@ -21,6 +21,7 @@ reference: hg19
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv
ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0
gistic_precision: "99"
......@@ -33,7 +34,13 @@ settings_gistic: "-ta 0.1
-genegistic 1
-conf 0.99"
comparison_settings: ["-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.99",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",]
#Settings for sample size differences
sizes: [20, 30, 40, 60, 70, 80, 90]
sizes: [20, 30, 40, 50, 60, 70, 80, 90]
#sizes: [30, 40]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
This diff is collapsed.
rule gistic_settings:
"""Run GISTIC2 based on different settings."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt"
output:
expand("Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", setting_nr=config["comparison_settings"],
expand("Settings/GISTIC_{setting_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed", setting_nr=config["comparison_settings"]
params:
cnv="",
ref=config["reference"],
ref_file="",
extra={setting_nr}
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule compare_settings:
##
......@@ -6,7 +6,8 @@ rule go_analysis:
go="GO/{tool}.txt"
params:
organism="hsapiens", #default is human
ontology="BP" #MF, BP, CC or all
ontology="BP", #MF, BP, CC or all
gene_ID_to_GO=os.path.join(workflow.basedir, config["ID_to_GO"]) if config["ID_to_GO"].startswith("input_files") else config["ID_to_GO"]
benchmark:
"Benchmarks/topGO." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper:
......@@ -28,7 +29,7 @@ def compare_go(GO_files, output_file):
go_terms = get_go(GO_files)
overlap = set(go_terms[0]).intersection(set(go_terms[1]))
with open(output_file, 'w') as out:
out.write("Percentage overlapping: " + str(int((len(overlap) / 50 * 100))))
out.write("Percentage overlapping: " + str(int((len(overlap) / 50 * 100))) + "\n")
out.write("\n".join(list(overlap)))
def get_go(GO_files):
......@@ -42,4 +43,5 @@ def get_go(GO_files):
go_term = line.split(' ')[1].strip('"')
go_terms_tool += [go_term]
go_terms.append(go_terms_tool)
go_terms.append(go_terms_tool)
return go_terms
......@@ -21,9 +21,10 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
if tool not in all_results.keys():
all_results[tool] = [parsed_results]
all_results[tool] = {}
all_results[tool][size] = [parsed_results]
else:
all_results[tool] = all_results[tool] + [parsed_results]
all_results[tool][size] = all_results[tool][size] + [parsed_results]
stats_results = get_stats(parsed_results, size)
for stat_list in stats_results[0], stats_results[1]:
converted_stats = [tool] + stat_list[0:2]
......@@ -36,9 +37,14 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
make_plots(list_stats, reps, sizes, plot_dir)
def overlap_genes(all_results, report_file):
for tool in all_results.keys():
print(all_results)
known_genes = {}
with open(report_file, 'w') as out:
for tool in all_results.keys():
out.write(tool + '\n')
for size in all_results[tool].keys():
out.write(size + '\n')
for cnv_type in size:
out.write(cnv_type[5])
def make_plots(list_stats, reps, sizes, plot_dir):
......
......@@ -7,6 +7,7 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
import os.path
from ParseResults import parse_regions, get_stats, parse_bed_overlap, parse_gene_file
from ensemblQueries import gene_name_to_ID
from collections import OrderedDict
import math
......@@ -70,15 +71,17 @@ def make_gene_files(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes
genes_all.append(genes_tool)
out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out:
out.write("\n".join(genes_tool))
all_IDs = gene_name_to_ID(genes_tool)
out.write("\n".join(all_IDs))
make_gene_file_overlap(genes_all, out_both)
venn3_overlap(genes_all, known_genes, out_venn)
def make_gene_file_overlap(gene_lists, out_both):
"""Make file with all genes reported by both tools."""
overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
all_IDs = gene_name_to_ID(list(overlap))
with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap)))
overlap_file.write("\n".join(all_IDs))
def venn3_overlap(gene_lists, known_genes, out_venn):
known = parse_gene_file(known_genes, np.inf)
......@@ -88,11 +91,14 @@ def venn3_overlap(gene_lists, known_genes, out_venn):
c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E')
c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F')
c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B')
c.get_patch_by_id('111').set_color('#B55D60')
try:
c.get_patch_by_id('111').set_color('#B55D60')
c.get_patch_by_id('111').set_alpha(1)
except:
pass #too little overlap from three groups to show
c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1)
c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1)
c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1)
c.get_patch_by_id('111').set_alpha(1)
plt.savefig(out_venn, dpi=300)
def plot_histogram_sizes(parsed_results, plot_file):
......
......@@ -30,6 +30,6 @@ def gene_name_to_ID(gene_names):
gene_ID = ensembl.gene_ids_of_gene_name(gene)
list_genes.append(gene_ID[0])
except:
print(gene)
#print(gene)
pass
return list_genes
......@@ -4,15 +4,17 @@
# __license__ = "LUMC"
library(topGO)
library(biomaRt)
# Download table with gene IDs and GO terms from biomart
organism <- if (snakemake@params[["organism"]] == "") "hsapiens" else snakemake@params[["organism"]]
organismDataset <- paste(organism, "_gene_ensembl", sep="")
#Download table with geen IDs and GO terms from biomart or provide own file
if (snakemake@params[["gene_ID_to_GO"]] == "") {
library(biomaRt)
ensembl <- useMart("ensembl")
ensembl <- useDataset(organismDataset, mart=ensembl)
ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl)
EG2GO <- getBM(mart=ensembl, attributes=c('ensembl_gene_id','go_id'))
} else {
EG2GO <- read.table(snakemake@params[["gene_ID_to_GO"]], sep="\t", header=TRUE)
colnames(EG2GO) <- c("ensembl_gene_id","go_id")
}
geneID2GO <- by(EG2GO$go_id,
EG2GO$ensembl_gene_id,
......@@ -48,7 +50,7 @@ allRes <- GenTable(GOdata, classic = resultFisher,
KS = resultKS, weight = resultWeight,
orderBy = "weight", ranksOf = "classic", topNodes = 50)
print(allRes)
write.table(allRes, file=snakemake@output[[1]])
write.table(allRes, file=snakemake@output[[1]], quote=FALSE)
#Visualize GO term relationships
#Rgraphviz nodig!
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment