Commit fa320a0d authored by bow's avatar bow
Browse files

Initial setup for raw flt3 csv output

parent 5e93608f
......@@ -12,7 +12,8 @@ OUTPUTS = {
"fqs": "{sample}/{sample}-{pair}.fq.gz",
"fqs_stats": "{sample}/qc-seq/{sample}-seq-stats.json",
"flt3_bam": "{sample}/itd-flt3/{sample}.flt3.bam",
"flt3_sc_jsons": "{sample}/itd-flt3/{sample}.flt3-sc.json",
"flt3_sc_json": "{sample}/itd-flt3/{sample}.flt3-sc.json",
"flt3_sc_csv": "{sample}/itd-flt3/{sample}.flt3-sc.roi.csv",
"flt3_sc_plots": "{sample}/itd-flt3/{sample}.flt3-sc.png",
"smallvars_bam": "{sample}/snv-indels/{sample}.snv-indel.bam",
"smallvars_vcf": "{sample}/snv-indels/{sample}.annotated.vcf.gz",
......
......@@ -9,6 +9,8 @@ RUN.set_default_setting("extract_sc_flt3",
srcdir(path.join("scripts", "extract_sc_flt3.py")))
RUN.set_default_setting("plot_sc_linear_flt3",
srcdir(path.join("scripts", "plot_sc_linear_flt3.py")))
RUN.set_default_setting("json2csv",
srcdir(path.join("scripts", "json2csv.py")))
# Region of interest ~ exon 14-15 in transcript coordinates.
RUN.set_default_setting("flt3_start", 1787)
......@@ -65,3 +67,16 @@ rule plot_sc_linear_flt3:
shell:
"python {input.scr} --sc-bg {input.bg} --min-scl-count 2 --min-insert-count 2"
" --fuzziness 12 --padding 0 {input.counts} {params.start} {params.end} {output.png}"
rule flt3_json_to_csv:
input:
counts=RUN.output("{sample}/itd-flt3/{sample}.flt3-sc.json"),
scr=RUN.settings["json2csv"],
output:
csv=RUN.output("{sample}/itd-flt3/{sample}.flt3-sc.roi.csv"),
params:
start=RUN.settings["flt3_start"],
end=RUN.settings["flt3_end"],
conda: srcdir("envs/flt3_csv.yml")
shell:
"python {input.scr} {wildcards.sample} --start {params.start} --end {params.end} > {output.csv}"
channels:
- bioconda
- r
- defaults
- conda-forge
dependencies:
- click=6.6=py35_0
- pandas=0.18.1=np111py35_0
#!/usr/bin/env python
# Script for converting raw FLT3 JSON data to CSV.
import csv
import json
import sys
import click
import pandas as pd
START = 1786 # 0-based
END = 2024
def within_range(pos, start, end):
if start is None and end is None:
return True
return start <= pos < end
def load_json(fname, sample_name, start, end):
with open(fname, "r") as src:
data = json.load(src)
pileups_by_pos = {x["pos"]: {"count": x["count"]} for x in data["pileups"]}
sc_by_pos = {x["pos"]: {k: x[k] for k in ("altPosCount", "count")}
for x in data["scs"] if within_range(x["pos"], start, end)}
inserts_by_pos = {}
for x in data["inserts"]:
pos = x["pos"]
if not within_range(pos, start, end):
continue
if pos not in inserts_by_pos:
inserts_by_pos[pos] = []
inserts_by_pos[pos].append({"count": x["count"], "seq": x["seq"]})
dps = []
for pos in range(start, end):
pileup_count = pileups_by_pos.get(pos, {"count": 0})["count"]
sc = sc_by_pos.get(pos, {"altPosCount": [], "count": 0})
sc_count = sc["count"]
try:
sc_pct = sc_count * 100.0 / pileup_count
except ZeroDivisionError:
sc_pct = float("nan")
links = [x for x in sc["altPosCount"] if within_range(x["pos"], start, end)]
links_count = sum(x["count"] for x in links)
try:
lsc_count = sum(x["count"]
for x in sc["altPosCount"] if within_range(x["pos"], start, end))
lsc_pct = lsc_count * 100.0 / pileup_count
except ZeroDivisionError:
lsc_pct = float("nan")
inserts = inserts_by_pos.get(pos) or []
inserts_count = sum(x["count"] for x in inserts)
dps.append({
"sample": sample_name,
"pos": pos,
"pileup_count": pileup_count,
"sc_count": sc_count,
"sc_pct": sc_pct,
"lsc_pct": lsc_pct,
"lsc_count": lsc_count,
"insert_count": inserts_count,
"links_count": links_count,
"links": links,
"inserts": inserts,
})
df = pd.DataFrame(dps).set_index(["pos", "sample"], drop=False,
verify_integrity=True)
return df
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("input",
type=click.Path(exists=True, dir_okay=False))
@click.argument("sample_id", type=str)
@click.option("--start", type=int, default=START)
@click.option("--end", type=int, default=END)
def main(input, sample_id, start, end):
df = load_json(input, sample_id, start=start, end=end)
df_cols = ["sample", "pos", "pileup_count", "sc_count", "lsc_count",
"sc_pct"]
header = ["sample", "position", "coverage", "n_soft_clips",
"n_alt_soft_clip_aln", "pct_soft_clips"]
# Convert coordinate to 1-based, fully-closed.
df["pos"] = df["pos"].apply(lambda p: p + 1)
df[df_cols].to_csv(sys.stdout, index=False, header=header)
if __name__ == "__main__":
main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment