Commit 95f7f7ff authored by Beatrice Tan's avatar Beatrice Tan

Removed code that is not used.

parent e6e17a54
......@@ -43,10 +43,7 @@ onerror:
rule all:
"""Define desired output from pipeline."""
input:
#"Settings/Report.txt",
"Samplesize/Report.txt"
#"PipelineResults.html",
#"GO/comparison.txt",
"PipelineResults.html"
rule help:
"""Print list of all targets with help."""
......@@ -66,6 +63,7 @@ rule report_pipeline:
venn="Reports/Venn_overlap_genes.png",
swarmplot="Reports/Comparison_sizes.png",
circos="Circos/RecurrentRegions_legend.png",
go="GO/comparison.txt"
output:
html="PipelineResults.html"
run:
......@@ -80,7 +78,7 @@ rule report_pipeline:
- Table with all recurrent regions and overlapping genes: table_regions_
- Circos plot showing the raw segmentation file and recurrent regions detected by both tools: circos_
- Venn diagram showing the overlap between gene lists from both tools: venn_
- Swarmplot showing the differences in sizes between both tools: swarmplot_
- Boxplot showing the differences in sizes between both tools: swarmplot_
""", output.html, metadata="Beatrice F. Tan (beatrice.ftan@gmail.com)", **input)
......
......@@ -2,7 +2,7 @@
#workdir: /home/bftan/CNA_results #directory to write output
#gisticdir: /home/bftan/Tools/GISTIC2 #directory to install GISTIC2
workdir: /home/beatrice/CNA_analysis #CNA_99_genegistic
workdir: /home/beatrice/CNA_99_genegistic
gisticdir: /home/beatrice/CNA_analysis/run_gistic2
#Input details to download from firehose
......@@ -14,10 +14,10 @@ inputfile: "" #tumor segmentation data
normal: "" #normal segmentation data
#Data for running and benchmarking tools.
reference: hg19 #hg38.UCSC.add_miR.160920.refgene
reference: hg19
prev_found_genes: input_files/intogen-CM-drivers-data.tsv
census_genes: input_files/Census_genes.txt
biomart_genes: input_files/biomart_human_genes_hg19.tsv #wrong genome build
biomart_genes: input_files/biomart_human_genes_hg19.tsv
ID_to_GO: input_files/ID_to_GO.txt
#Settings GISTIC2.0
......
......@@ -57,8 +57,6 @@ def parse_gistic_results(results_file, known_genes, census_genes, ref_genome):
result_dir, conf = results_file.split("all_lesions")
gene_file = os.path.join(result_dir, cnv_type + "_genes" + conf)
list_genes = gistic_genes(gene_file, CNV_id)
#known_overlap = set(list_genes).intersection(set(known_genes))
#census_overlap = set(list_genes).intersection(set(census_genes))
overlapping_genes = gene_IDs_at_locus(int(chrom), int(start), int(end), ref_genome) #get list of genes at genomic region
known_overlap = set(overlapping_genes).intersection(set(known_genes)) #extract known genes captured by analysis
census_overlap = set(overlapping_genes).intersection(set(census_genes))
......@@ -89,11 +87,8 @@ def parse_rubic_results(results_gains, results_losses, known_genes, census_genes
for lesion in lesions:
chrom, start, end, qval_left, qval_right, genes = lesion.split("\t")
cnv_type = 'amp' if result_file.endswith('gains.txt') else 'del'
#list_genes = genes.strip().split(",")
qval_left = float(qval_left)
qval_right = float(qval_right)
#known_overlap = set(list_genes).intersection(set(known_genes))
#census_overlap = set(list_genes).intersection(set(census_genes))
overlapping_genes = gene_IDs_at_locus(chrom, int(start), int(end), ref_genome)
known_overlap = set(overlapping_genes).intersection(set(known_genes))
census_overlap = set(overlapping_genes).intersection(set(census_genes))
......@@ -155,9 +150,9 @@ def get_stats(list_regions, used_tool):
def calculate_stats(tot_size, nr_genes, nr_reg, known_count, census_count, focal_count, cnv_type, used_tool, driver_density, known_focal):
"""Calculate average & total numbers for report on recurrent regions."""
type_name = 'Amplifications' if cnv_type == 'amp' else 'Deletions'
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else 'N/A' #average size in kb
avg_size = int(tot_size / nr_reg / 1000) if nr_reg != 0 else '0' #average size in kb
round_tot_size = round((tot_size / 1000000), 2) #total size in mb
avg_driver_density = round(sum(driver_density) / nr_reg, 2)
avg_driver_density = round(sum(driver_density) / nr_reg, 2) if nr_reg != 0 else '0'
stats = [used_tool, type_name, nr_reg, avg_size, round_tot_size, nr_genes, avg_driver_density]
stats = list(map(str, stats))
for count in focal_count, known_count, census_count, known_focal:
......
......@@ -14,12 +14,14 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
"""Make a report of the results produced using input files with different sample sizes."""
list_stats = []
all_results = {}
all_parsed = []
for i in range(len(size_rubic_gains)): #loop through sizes
size_rep = size_rubic_gains[i].split("Size")[1].split("/gains")[0]
size, repetition = size_rep.split(".Rep")
for tool in 'GISTIC', 'RUBIC':
size_file = (size_rubic_gains[i], size_rubic_losses[i]) if tool == 'RUBIC' else size_gistic[i]
parsed_results = parse_regions(size_file, known_genes, census_genes, tool, ref_genome)
all_parsed.append(parsed_results)
stats_results = get_stats(parsed_results, size)
for stat_list in stats_results[0], stats_results[1]:
converted_stats = [tool] + stat_list[0:2]
......@@ -28,27 +30,21 @@ def make_report(size_gistic, size_rubic_gains, size_rubic_losses, census_genes,
for stat in stat_list[5:]:
converted_stats.append(float(stat.split(" (")[0]))
list_stats.append(converted_stats)
overlap_genes(all_results, report_file)
overlap_genes(all_parsed, report_file)
make_plots(list_stats, reps, sizes, plot_dir)
def overlap_genes(all_results, report_file):
def overlap_genes(parsed_results, report_file):
with open(report_file, 'w') as out:
out.write("\n".join(list(all_results.keys())))
# known_genes = {}
# with open(report_file, 'w') as out:
# for tool in all_results.keys():
# out.write(tool + '\n')
# for size in all_results[tool].keys():
# out.write(size + '\n')
# for cnv_type in size:
# out.write(cnv_type[5])
for size_file in parsed_results:
out_line = ''
for line in size_file:
if line[5] != []:
out_line = out_line + "\t" + ", ".join(line[5])
out.write(out_line + "\n")
def make_plots(list_stats, reps, sizes, plot_dir):
plot_y_axis = (["No. regions", "Avg. size (Kb)", "Total size (Mb)", "No. genes", "Average driver density", "No. focal regions", "No. known regions",
"No. census regions", "No. known focal regions"])
# 'Number of recurrent regions', 'Average size of regions (Kb)', 'Total size (Mb)',
# 'Number of genes', 'Number of known regions', 'Number of census regions'])
df_stats = pd.DataFrame(list_stats, columns=['Tool', 'Sample size', 'Type'] + plot_y_axis)
for plot_name in plot_y_axis:
plot_size_differences(df_stats, plot_name, sizes, len(reps), plot_dir)
......@@ -101,9 +97,7 @@ def test_significance(x1, x2, df, cnv_type, list_sizes, value_y_axis):
"""Test whether differences between sample sizes x1 and x2 in dataframe df for cnv_type of interest are significant.
Returns P-value."""
only_tool = df[df['Type']==cnv_type]
#only_tool = only_type[only_type['Tool']=='GISTIC']
values_x1 = only_tool[only_tool['Sample size']==list_sizes[x1]]
values_x2 = only_tool[only_tool['Sample size']==list_sizes[x2]]
#signficance = ttest_ind(values_x1['value_y_axis'], values_x2['value_y_axis'])
significance = mannwhitneyu(values_x1[value_y_axis], values_x2[value_y_axis])
return significance[1]
......@@ -63,8 +63,6 @@ def make_table_regions(parsed_tools, file_regions, ref_genome):
out_genes = gene_ID_to_name(cnv[list_genes], ref_genome)
out.write("\t" + ", ".join(out_genes))
out.write("\t" + str(len(cnv[7])))
#for list_genes in range(5,8):
#out.write("\t" + ", ".join(cnv[list_genes]))
out.write("\n")
def extract_gene_lists(parsed_results, out_GISTIC, out_RUBIC, out_both, known_genes, out_venn, ref_genome):
......@@ -87,16 +85,12 @@ def extract_gene_lists(parsed_results, out_GISTIC, out_RUBIC, out_both, known_ge
def make_gene_file_tool(gene_names, out_file):
"""Make file with list of genes reported by tool (one gene per line)."""
with open(out_file, 'w') as out:
#gene_IDs = gene_name_to_ID(gene_names) #Convert gene names to IDs in order to do GO enrichment analysis
#out.write("\n".join(gene_IDs))
out.write("\n".join(gene_names))
def make_gene_file_overlap(gene_lists, out_both):
"""Make file with list of all genes reported by both tools (one gene per line)."""
overlapping_genes = set(gene_lists[0]).intersection(set(gene_lists[1]))
#overlapping_IDs = gene_name_to_ID(list(overlapping_genes))
with open(out_both, 'w') as overlap_file:
#overlap_file.write("\n".join(overlapping_IDs))
overlap_file.write("\n".join(overlapping_genes))
def venn3_overlap(gene_lists, known_genes_file, out_venn, ref_genome):
......
......@@ -8,8 +8,6 @@ class SegFile:
def make_seg_random_subsets(self, segmentation_file, out_files):
"""Take a given number of random subsets with for a given list of sample sizes to write new segmentation files."""
header, sample_dict = self.parse_seg(segmentation_file)
#out_files = out_files.split(" ")
print(out_files)
for size in out_files:
print(size)
with open(size, 'w') as new:
......
......@@ -41,7 +41,6 @@ def gene_name_to_ID(gene_names, ref_genome):
gene_ID = ensembl.gene_ids_of_gene_name(gene)
gene_IDs.append(gene_ID[0])
except:
#print(gene)
pass
return gene_IDs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment