Commit 397c9d9d authored by Beatrice Tan's avatar Beatrice Tan

Updated GO scripts to use provided input file to map GO terms instead of...

Updated GO scripts to use provided input file to map GO terms instead of frequently failed biomart download.
parent e70aeb29
...@@ -42,7 +42,10 @@ onerror: ...@@ -42,7 +42,10 @@ onerror:
rule all: rule all:
"""Define desired output from pipeline.""" """Define desired output from pipeline."""
input: input:
"Samplesize/Report.txt" #"Samplesize/Report.txt",
#"Reports/Tools.txt",
"GO/comparison.txt"
# "Samplesize/Report.txt"
rule help: rule help:
"""Print list of all targets with help.""" """Print list of all targets with help."""
......
...@@ -21,6 +21,7 @@ reference: hg19 ...@@ -21,6 +21,7 @@ reference: hg19
prev_found_genes: input_files/intogen-CM-drivers-data.tsv prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes.tsv biomart_genes: input_files/biomart_human_genes.tsv
ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0 #Settings GISTIC2.0
gistic_precision: "99" gistic_precision: "99"
...@@ -33,7 +34,13 @@ settings_gistic: "-ta 0.1 ...@@ -33,7 +34,13 @@ settings_gistic: "-ta 0.1
-genegistic 1 -genegistic 1
-conf 0.99" -conf 0.99"
comparison_settings: ["-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.99",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 1 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.98 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",
"-ta 0.1 -td 0.1 -qvt 0.25 -brlen 0.7 -cap 1.5 -rx 1 -genegistic 0 -conf 0.75",]
#Settings for sample size differences #Settings for sample size differences
sizes: [20, 30, 40, 60, 70, 80, 90] sizes: [20, 30, 40, 50, 60, 70, 80, 90]
#sizes: [30, 40] #sizes: [30, 40]
repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] repeats: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
This diff is collapsed.
rule gistic_settings:
"""Run GISTIC2 based on different settings."""
input:
gistic_directory=os.path.join(config["gisticdir"], "gistic2"),
seg="Input/Segments_tumor.txt"
output:
expand("Settings/GISTIC_{setting_nr}/all_lesions.conf_" + config["gistic_precision"] + ".txt", setting_nr=config["comparison_settings"],
expand("Settings/GISTIC_{setting_nr}/regions_track.conf_" + config["gistic_precision"] + ".bed", setting_nr=config["comparison_settings"]
params:
cnv="",
ref=config["reference"],
ref_file="",
extra={setting_nr}
wrapper:
"file:" + workflow.basedir + "/wrappers/GISTIC2"
rule compare_settings:
##
...@@ -6,7 +6,8 @@ rule go_analysis: ...@@ -6,7 +6,8 @@ rule go_analysis:
go="GO/{tool}.txt" go="GO/{tool}.txt"
params: params:
organism="hsapiens", #default is human organism="hsapiens", #default is human
ontology="BP" #MF, BP, CC or all ontology="BP", #MF, BP, CC or all
gene_ID_to_GO=os.path.join(workflow.basedir, config["ID_to_GO"]) if config["ID_to_GO"].startswith("input_files") else config["ID_to_GO"]
benchmark: benchmark:
"Benchmarks/topGO." + str(datetime.datetime.now()).replace(" ", "_") + ".txt" "Benchmarks/topGO." + str(datetime.datetime.now()).replace(" ", "_") + ".txt"
wrapper: wrapper:
...@@ -28,7 +29,7 @@ def compare_go(GO_files, output_file): ...@@ -28,7 +29,7 @@ def compare_go(GO_files, output_file):
go_terms = get_go(GO_files) go_terms = get_go(GO_files)
overlap = set(go_terms[0]).intersection(set(go_terms[1])) overlap = set(go_terms[0]).intersection(set(go_terms[1]))
with open(output_file, 'w') as out: with open(output_file, 'w') as out:
out.write("Percentage overlapping: " + str(int((len(overlap) / 50 * 100)))) out.write("Percentage overlapping: " + str(int((len(overlap) / 50 * 100))) + "\n")
out.write("\n".join(list(overlap))) out.write("\n".join(list(overlap)))
def get_go(GO_files): def get_go(GO_files):
...@@ -42,4 +43,5 @@ def get_go(GO_files): ...@@ -42,4 +43,5 @@ def get_go(GO_files):
go_term = line.split(' ')[1].strip('"') go_term = line.split(' ')[1].strip('"')
go_terms_tool += [go_term] go_terms_tool += [go_term]
go_terms.append(go_terms_tool) go_terms.append(go_terms_tool)
go_terms.append(go_terms_tool)
return go_terms return go_terms
...@@ -21,9 +21,10 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes, ...@@ -21,9 +21,10 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i] size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool) parsed_results = parse_regions(size_file, known_genes, census_genes, tool)
if tool not in all_results.keys(): if tool not in all_results.keys():
all_results[tool] = [parsed_results] all_results[tool] = {}
all_results[tool][size] = [parsed_results]
else: else:
all_results[tool] = all_results[tool] + [parsed_results] all_results[tool][size] = all_results[tool][size] + [parsed_results]
stats_results = get_stats(parsed_results, size) stats_results = get_stats(parsed_results, size)
for stat_list in stats_results[0], stats_results[1]: for stat_list in stats_results[0], stats_results[1]:
converted_stats = [tool] + stat_list[0:2] converted_stats = [tool] + stat_list[0:2]
...@@ -36,9 +37,14 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes, ...@@ -36,9 +37,14 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
make_plots(list_stats, reps, sizes, plot_dir) make_plots(list_stats, reps, sizes, plot_dir)
def overlap_genes(all_results, report_file): def overlap_genes(all_results, report_file):
for tool in all_results.keys(): known_genes = {}
print(all_results) with open(report_file, 'w') as out:
for tool in all_results.keys():
out.write(tool + '\n')
for size in all_results[tool].keys():
out.write(size + '\n')
for cnv_type in size:
out.write(cnv_type[5])
def make_plots(list_stats, reps, sizes, plot_dir): def make_plots(list_stats, reps, sizes, plot_dir):
......
...@@ -7,6 +7,7 @@ import seaborn as sns ...@@ -7,6 +7,7 @@ import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu, pearsonr from scipy.stats import ttest_ind, mannwhitneyu, pearsonr
import os.path import os.path
from ParseResults import parse_regions, get_stats, parse_bed_overlap, parse_gene_file from ParseResults import parse_regions, get_stats, parse_bed_overlap, parse_gene_file
from ensemblQueries import gene_name_to_ID
from collections import OrderedDict from collections import OrderedDict
import math import math
...@@ -70,15 +71,17 @@ def make_gene_files(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes ...@@ -70,15 +71,17 @@ def make_gene_files(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes
genes_all.append(genes_tool) genes_all.append(genes_tool)
out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC out_file = out_GISTIC if tool_results == parsed_results[0] else out_RUBIC
with open(out_file, 'w') as out: with open(out_file, 'w') as out:
out.write("\n".join(genes_tool)) all_IDs = gene_name_to_ID(genes_tool)
out.write("\n".join(all_IDs))
make_gene_file_overlap(genes_all, out_both) make_gene_file_overlap(genes_all, out_both)
venn3_overlap(genes_all, known_genes, out_venn) venn3_overlap(genes_all, known_genes, out_venn)
def make_gene_file_overlap(gene_lists, out_both): def make_gene_file_overlap(gene_lists, out_both):
"""Make file with all genes reported by both tools.""" """Make file with all genes reported by both tools."""
overlap = set(gene_lists[0]).intersection(set(gene_lists[1])) overlap = set(gene_lists[0]).intersection(set(gene_lists[1]))
all_IDs = gene_name_to_ID(list(overlap))
with open(out_both, 'w') as overlap_file: with open(out_both, 'w') as overlap_file:
overlap_file.write("\n".join(list(overlap))) overlap_file.write("\n".join(all_IDs))
def venn3_overlap(gene_lists, known_genes, out_venn): def venn3_overlap(gene_lists, known_genes, out_venn):
known = parse_gene_file(known_genes, np.inf) known = parse_gene_file(known_genes, np.inf)
...@@ -88,11 +91,14 @@ def venn3_overlap(gene_lists, known_genes, out_venn): ...@@ -88,11 +91,14 @@ def venn3_overlap(gene_lists, known_genes, out_venn):
c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E') c.get_patch_by_id('100').set_color('#5975A4'), c.get_patch_by_id('010').set_color('#5F9E6E')
c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F') c.get_patch_by_id('001').set_color('#857AAA'), c.get_patch_by_id('011').set_color('#748A8F')
c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B') c.get_patch_by_id('101').set_color('#6D77A7'), c.get_patch_by_id('110').set_color('#5C888B')
c.get_patch_by_id('111').set_color('#B55D60') try:
c.get_patch_by_id('111').set_color('#B55D60')
c.get_patch_by_id('111').set_alpha(1)
except:
pass #too little overlap from three groups to show
c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1) c.get_patch_by_id('100').set_alpha(1), c.get_patch_by_id('010').set_alpha(1)
c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1) c.get_patch_by_id('001').set_alpha(1), c.get_patch_by_id('011').set_alpha(1)
c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1) c.get_patch_by_id('101').set_alpha(1), c.get_patch_by_id('110').set_alpha(1)
c.get_patch_by_id('111').set_alpha(1)
plt.savefig(out_venn, dpi=300) plt.savefig(out_venn, dpi=300)
def plot_histogram_sizes(parsed_results, plot_file): def plot_histogram_sizes(parsed_results, plot_file):
......
...@@ -30,6 +30,6 @@ def gene_name_to_ID(gene_names): ...@@ -30,6 +30,6 @@ def gene_name_to_ID(gene_names):
gene_ID = ensembl.gene_ids_of_gene_name(gene) gene_ID = ensembl.gene_ids_of_gene_name(gene)
list_genes.append(gene_ID[0]) list_genes.append(gene_ID[0])
except: except:
print(gene) #print(gene)
pass pass
return list_genes return list_genes
...@@ -4,15 +4,17 @@ ...@@ -4,15 +4,17 @@
# __license__ = "LUMC" # __license__ = "LUMC"
library(topGO) library(topGO)
library(biomaRt)
# Download table with gene IDs and GO terms from biomart
organism <- if (snakemake@params[["organism"]] == "") "hsapiens" else snakemake@params[["organism"]]
organismDataset <- paste(organism, "_gene_ensembl", sep="")
#Download table with geen IDs and GO terms from biomart or provide own file
if (snakemake@params[["gene_ID_to_GO"]] == "") {
library(biomaRt)
ensembl <- useMart("ensembl") ensembl <- useMart("ensembl")
ensembl <- useDataset(organismDataset, mart=ensembl) ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl)
EG2GO <- getBM(mart=ensembl, attributes=c('ensembl_gene_id','go_id')) EG2GO <- getBM(mart=ensembl, attributes=c('ensembl_gene_id','go_id'))
} else {
EG2GO <- read.table(snakemake@params[["gene_ID_to_GO"]], sep="\t", header=TRUE)
colnames(EG2GO) <- c("ensembl_gene_id","go_id")
}
geneID2GO <- by(EG2GO$go_id, geneID2GO <- by(EG2GO$go_id,
EG2GO$ensembl_gene_id, EG2GO$ensembl_gene_id,
...@@ -48,7 +50,7 @@ allRes <- GenTable(GOdata, classic = resultFisher, ...@@ -48,7 +50,7 @@ allRes <- GenTable(GOdata, classic = resultFisher,
KS = resultKS, weight = resultWeight, KS = resultKS, weight = resultWeight,
orderBy = "weight", ranksOf = "classic", topNodes = 50) orderBy = "weight", ranksOf = "classic", topNodes = 50)
print(allRes) print(allRes)
write.table(allRes, file=snakemake@output[[1]]) write.table(allRes, file=snakemake@output[[1]], quote=FALSE)
#Visualize GO term relationships #Visualize GO term relationships
#Rgraphviz nodig! #Rgraphviz nodig!
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment