Commit 749ebd78 authored by Sam Nooij's avatar Sam Nooij
Browse files

Concatenate deduplication metrics

parent 67d56ae4
......@@ -131,12 +131,12 @@ rule all:
"data/processed/Mapped_read_counts-paired.tsv",
"data/processed/Mapped_read_counts-unpaired.tsv",
"data/processed/Mapped_read_counts.tsv",
#Concatenated tables with number of mapped reads per sequence per sample
#Tables with number of mapped reads per sequence per sample
"data/processed/Depth_of_coverage-paired.tsv",
"data/processed/Depth_of_coverage-unpaired.tsv",
"data/processed/Depth_of_coverage.tsv",
#Concatenated tables with depth of coverage
#Tables with depth of coverage
# And the _deduplicated_ mapped files:
......@@ -167,12 +167,16 @@ rule all:
"data/processed/Deduplicated_read_counts-paired.tsv",
"data/processed/Deduplicated_read_counts-unpaired.tsv",
"data/processed/Deduplicated_read_counts.tsv",
#Concatenated tables with number of mapped reads per sequence per sample
#Tables with number of mapped reads per sequence per sample
"data/processed/Deduplicated_depth_of_coverage-paired.tsv",
"data/processed/Deduplicated_depth_of_coverage-unpaired.tsv",
"data/processed/Deduplicated_depth_of_coverage.tsv",
#Concatenated tables with depth of coverage
#Tables with depth of coverage
"data/processed/Deduplication_metrics-paired.tsv",
"data/processed/Deduplication_metrics-unpaired.tsv",
#Metrics of duplication per sample per reference
### Step 3: include processing steps to generate output ###
......@@ -870,6 +874,41 @@ rule concatenate_depth_deduplicated_unpaired_reads:
python bin/concatenate_depth_tables.py -i {input} -o {output} > {log} 2>&1
"""
# Concatenate the per-sample, per-reference Picard deduplication metrics
# for PAIRED reads into one overall table.
rule concatenate_deduplication_metrics_paired:
    input:
        # One Picard metrics file per sample x reference combination
        expand("data/tmp/Deduplicated-metrics_{sample}_to_{ref}-paired.txt",
            sample = SAMPLES, ref = REFERENCE_NAMES)
    output:
        "data/processed/Deduplication_metrics-paired.tsv"
    conda:
        "envs/pandas.yaml"
    log:
        "log/concatenate_deduplication_metrics-paired.txt"
    benchmark:
        "log/benchmark/concatenate_deduplication_metrics-paired.txt"
    shell:
        """
        python bin/concatenate_deduplication_metrics.py -i {input} -o {output} > {log} 2>&1
        """
# Concatenate the per-sample, per-reference Picard deduplication metrics
# for UNPAIRED reads into one overall table.
rule concatenate_deduplication_metrics_unpaired:
    input:
        # One Picard metrics file per sample x reference combination
        expand("data/tmp/Deduplicated-metrics_{sample}_to_{ref}-unpaired.txt",
            sample = SAMPLES, ref = REFERENCE_NAMES)
    output:
        "data/processed/Deduplication_metrics-unpaired.tsv"
    conda:
        "envs/pandas.yaml"
    log:
        "log/concatenate_deduplication_metrics-unpaired.txt"
    benchmark:
        "log/benchmark/concatenate_deduplication_metrics-unpaired.txt"
    shell:
        """
        python bin/concatenate_deduplication_metrics.py -i {input} -o {output} > {log} 2>&1
        """
rule sum_depths:
input:
paired="data/processed/Depth_of_coverage-paired.tsv",
......
#! /usr/bin/env python3
# Concatenate per-sample deduplication metrics from Picard into an overall table.
# Required input:
# - A number of per-sample Picard output metrics files
#
# The output file will be a concatenated table with additional "Sample_name"
# and "Reference" columns, based on the input files' names.
#
# Example use:
# python concatenate_deduplication_metrics.py -i Deduplicated-metrics_sample1_to_reference1-paired.tsv Deduplicated-metrics_sample2_to_reference1-paired.tsv Deduplicated-metrics_sample3_to_reference1-paired.tsv -o All_samples_to_reference1-paired.tsv
# IMPORT required libraries---------------------------------
import pandas as pd
import argparse
# Define FUNCTIONS------------------------------------------
def parse_arguments():
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = list of input files (tab-separated tables)
     -o/--output = output file (tab-separated table)
     -h/--help = show help

    Returns an argparse.Namespace with attributes `input` (list of str)
    and `output` (str). Unknown extra arguments are ignored on purpose.
    """
    parser = argparse.ArgumentParser(
        # Program name and usage now describe THIS script (the originals
        # were copied from concatenate_mapped_read_counts.py).
        prog="concatenate deduplication metrics",
        description="Concatenate per-sample deduplication metrics tables",
        usage="concatenate_deduplication_metrics.py -i [input] -o [output]"
        " [-h / --help]",
    )
    required = parser.add_argument_group("Required arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="input",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (metrics per sample).",
    )
    required.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )
    # parse_known_args: tolerate (and silently drop) unrecognised arguments.
    (args, extra_args) = parser.parse_known_args()
    return args
def extract_sample_and_reference_name(filename):
    """
    Extract the sample and reference names from a metrics filename.

    Filenames in this pipeline will usually be in the form:
      Deduplicated-metrics_{sample}_to_{reference}-[paired/unpaired].txt
    To extract the sample name, remove "Deduplicated-metrics_", and
    everything from "_to_" until the end.

    Returns a (sample, reference) tuple of strings. Raises ValueError if
    the filename contains no "_to_" separator.
    """
    # If a directory path is attached to the filename, remove it.
    # (split("/")[-1] is a no-op when there is no "/".)
    without_path = filename.split("/")[-1]
    if "Deduplicated-metrics" in without_path:
        without_prefix = without_path.replace("Deduplicated-metrics_", "")
    else:
        # Unexpected, but fall back to the bare filename so parsing can
        # still proceed. (Previously this branch left `without_prefix`
        # unbound, crashing with UnboundLocalError on the next line.)
        without_prefix = without_path
    sample = without_prefix[: without_prefix.index("_to_")]
    # Assume a 4-character file extension (e.g. ".txt"); strip the last
    # 4 characters after skipping the "_to_" separator.
    reference = without_prefix[without_prefix.index("_to_") + 4 : -4]
    # Check "-unpaired" first: "-paired" is a substring of "-unpaired".
    if "-unpaired" in reference:
        reference = reference.replace("-unpaired", "")
    elif "-paired" in reference:
        reference = reference.replace("-paired", "")
    else:
        print(
            "There might be an unexpected suffix in the reference name:"
            " %s" % reference
        )
    return (sample, reference)
def extract_deduplication_metrics(filename):
    """
    Read Picard's default output metrics file for read deduplication and
    return a one-row pandas dataframe with:
     - Sample name
     - Reference name
     - Paired mapped reads
     - Unpaired mapped reads
     - Paired read duplicates
     - Unpaired read duplicates
     - Read pair optical duplicates
     - Percent duplication

    Files without a metrics line (e.g. no mapped reads) yield a row of
    zeros instead. Metric values are kept as strings, as read from file.
    """
    (sample, reference) = extract_sample_and_reference_name(filename)
    # Explicit sentinel instead of the previous version's reliance on
    # catching UnboundLocalError when no metrics line was found.
    values = None
    with open(filename, "r") as read_file:
        for line in read_file:
            # The metrics row starts with the library name; Picard writes
            # "Unknown Library" when none was given. All other lines
            # (comments, headers, histogram) are not interesting here.
            if line.startswith("Unknown Library"):
                values = line.split()
    if values is None:
        print(
            "-----\n%s does not appear to have metrics.\n"
            "It may not have any mapping reads and therefore zero values are filled in."
            % filename
        )
        return pd.DataFrame(
            data={
                "Sample": sample,
                "Reference": reference,
                "Paired_mapped_reads": 0,
                "Unpaired_mapped_reads": 0,
                "Paired_duplicates": 0,
                "Unpaired_duplicates": 0,
                "Paired_optical_duplicates": 0,
                "Percent_duplication": 0,
            },
            index=[1],
        )
    # NOTE: "Unknown Library" splits into TWO tokens, so the metric
    # columns start at index 2, one past their position in Picard's header.
    unpaired_reads = values[2]
    paired_reads = values[3]
    unpaired_duplicates = values[6]
    paired_duplicates = values[7]
    optical_duplicates = values[8]
    percent_duplication = values[9]
    return pd.DataFrame(
        data={
            "Sample": sample,
            "Reference": reference,
            "Paired_mapped_reads": paired_reads,
            "Unpaired_mapped_reads": unpaired_reads,
            "Paired_duplicates": paired_duplicates,
            "Unpaired_duplicates": unpaired_duplicates,
            "Paired_optical_duplicates": optical_duplicates,
            "Percent_duplication": percent_duplication,
        },
        index=[1],
    )
def main():
    """
    Main execution of the script: parse the command-line arguments,
    read each input metrics file into a dataframe, stack the rows,
    and write the combined table as a tab-separated file.
    """
    # 1. Parse and show arguments
    arguments = parse_arguments()
    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0},\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )
    print(message)
    # 2. Read input files and make into one dataframe
    # (empty seed frame keeps the result well-defined for zero inputs)
    frames = [pd.DataFrame()]
    for metrics_file in arguments.input:
        frames.append(extract_deduplication_metrics(metrics_file))
    combined = pd.concat(frames)
    # 3. Write table to a tsv file
    combined.to_csv(arguments.output, sep="\t", index=False)
    return 0
# EXECUTE script--------------------------------------------
# Run main() only when invoked as a script, and propagate its return
# value (0 on success) as the process exit status.
if __name__ == "__main__":
    exit(main())
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment