Commit c8aa56d3 authored by Sam Nooij's avatar Sam Nooij
Browse files

Reformat Python code with Black

parent 6ce6ee4e
#! /usr/bin/env python3
# Collect scaffold information for a species filtered from Jovian's
# classification table. (Which scaffolds, their length and number
# of reads mapped to them.)
#
......@@ -9,19 +9,19 @@
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
#
# Three required input arguments are:
# -i/--input for the two required tables (see example)
# -o/--output for the result tables (see example)
## Import required libraries
import sys #to abort the script when an error occurs
import argparse #to parse command-line arguments
import pandas as pd #to work with dataframes (tabular data)
import numpy as np #for calculations (summing contig stats)
import re #to extract sample names from multiqc table
import sys # to abort the script when an error occurs
import argparse # to parse command-line arguments
import pandas as pd # to work with dataframes (tabular data)
import numpy as np # for calculations (summing contig stats)
import re # to extract sample names from multiqc table
## Define functions:
# 1. Parse command-line arguments
def parse_arguments():
    """
    Parse the arguments from the command line, i.e.:
    -i/--input = 2 input files (classified scaffolds table, mapped read counts)
    -o/--output = 2 output files (per-scaffold table, summed statistics)
    -h/--help = show help
    """
    parser = argparse.ArgumentParser(
        prog="collect species statistics",
        description="Collect scaffold statistics for each species filtered from Jovian's results",
        usage="collect_species_statistics.py -i file1 file2 -o file3 file4"
        " [-h / --help]",
        # Built-in help is disabled so -h can live in its own "Optional
        # arguments" group below.
        add_help=False,
    )
    required = parser.add_argument_group("Required arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="file1 file2",
        required=True,
        nargs=2,
        type=str,
        help="Classified scaffolds table and mapped read counts.",
    )
    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="file3 file4",
        required=True,
        nargs=2,
        type=str,
        help="Output tables (per scaffold, and summed statistics).",
    )
    optional = parser.add_argument_group("Optional arguments")
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )
    # parse_known_args tolerates (and discards) unrecognised arguments
    (args, extra_args) = parser.parse_known_args()
    return args
def main():
    """
    Main execution of the script:
    read the classified-scaffolds table and the mapped-read-counts table,
    merge them, write one table with statistics per scaffold and one
    table with summed statistics per sample.
    """
    # 1. Parse and print command-line arguments
    arguments = parse_arguments()
    message = (
        "\n"
        "These are the arguments you have provided:\n"
        "  INPUT:\n"
        "{0}\n"
        "  OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )
    print(message)
    # 2. Open input files (both tab-separated)
    species_scaffolds_df = pd.read_csv(arguments.input[0], sep="\t")
    mapped_reads_df = pd.read_csv(arguments.input[1], sep="\t")
    # 3. Merge input files together; a left join keeps every classified
    #    scaffold even if it has no entry in the read-count table
    quantified_scaffolds_df = pd.merge(
        species_scaffolds_df,
        mapped_reads_df,
        on=["scaffold_name", "Sample_name"],
        how="left",
    )
    # 4. Extract columns of interest
    quantified_scaffolds_df = quantified_scaffolds_df[
        ["Sample_name", "scaffold_name", "Length", "mapped_reads"]
    ]
    # And rename them
    quantified_scaffolds_df.rename(
        columns={
            "Sample_name": "Sample",
            "scaffold_name": "Scaffold",
            "Length": "Scaffold_length",
            "mapped_reads": "Mapped_reads",
        },
        inplace=True,
    )
    quantified_scaffolds_df.to_csv(arguments.output[0], sep="\t", index=False)
    # 5. Aggregate statistics per sample: each row is one scaffold, so
    #    summing a constant-1 column per sample counts the scaffolds
    quantified_scaffolds_df["Number_of_scaffolds"] = 1
    statistics_per_sample = pd.DataFrame(
        quantified_scaffolds_df.groupby(["Sample"]).sum()[
            ["Number_of_scaffolds", "Scaffold_length", "Mapped_reads"]
        ]
    )
    # index=True: the index carries the sample names
    statistics_per_sample.to_csv(arguments.output[1], sep="\t", index=True)
    return None
## Execute script
if __name__ == "__main__":
......
#! /usr/bin/env python3
# Concatenate per-sample tables of depth of coverage into an overall table.
# Required input:
# - A number of per-sample tables as tsv files
#
#The output file will be a concatenated table with additional "Sample_name"
# The output file will be a concatenated table with additional "Sample_name"
# column. Entering a name on the command-line is required.
#
# Example use:
# python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv
#IMPORT required libraries---------------------------------
# IMPORT required libraries---------------------------------
import pandas as pd
import argparse
#Define FUNCTIONS------------------------------------------
# Define FUNCTIONS------------------------------------------
def parse_arguments():
    """
    Parse the arguments from the command line, i.e.:
    -i/--input = input files (one or more count tables, one per sample)
    -o/--output = output file (tab-separated table)
    -h/--help = show help
    """
    parser = argparse.ArgumentParser(
        prog="concatenate mapped read counts",
        description="Concatenate mapped read count tables",
        usage="concatenate_mapped_read_counts.py -i [input] -o [output]"
        " [-h / --help]",
        # Built-in help is disabled; -h is registered manually below so
        # that the usage string's advertised [-h / --help] actually works.
        add_help=False,
    )
    required = parser.add_argument_group("Required arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (counts per sample).",
    )
    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )
    optional = parser.add_argument_group("Optional arguments")
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )
    # parse_known_args tolerates (and discards) unrecognised arguments
    (args, extra_args) = parser.parse_known_args()
    return args
def extract_sample_name(filename):
    """
    Derive the sample name from a depth-of-coverage file name.

    The base name (directory path stripped) is expected to look like
    "Depth_of_coverage-<sample>_to_<rest>": the "Depth_of_coverage-"
    prefix is removed and everything from "_to_" onwards is cut off.
    Raises ValueError if "_to_" does not occur in the name.
    """
    # Keep only the base name; plain names pass through unchanged
    basename = filename.split("/")[-1] if "/" in filename else filename
    trimmed = basename.replace("Depth_of_coverage-", "")
    # The sample name is everything before the "_to_" marker
    return trimmed[: trimmed.index("_to_")]
def main():
"""
Main execution of the script
"""
#1. Parse and show arguments
# 1. Parse and show arguments
arguments = parse_arguments()
message = ("\n"
message = (
"\n"
"These are the arguments you have provided:\n"
" INPUT:\n"
"{0},\n"
" OUTPUT:\n"
"{1}\n".format(arguments.input,
arguments.output))
"{1}\n".format(arguments.input, arguments.output)
)
print(message)
#2. Read input files and make into one dataframe
# 2. Read input files and make into one dataframe
concat_df = pd.DataFrame()
for file in arguments.input:
......@@ -96,12 +105,12 @@ def main():
concat_df = pd.concat([concat_df, df])
#3. Write table to a tsv file
concat_df.to_csv(arguments.output, sep = '\t',
index = False)
# 3. Write table to a tsv file
concat_df.to_csv(arguments.output, sep="\t", index=False)
return None
return(None)
# EXECUTE script--------------------------------------------
if __name__ == "__main__":
    main()
#! /usr/bin/env python3
# Concatenate per-sample tables of read counts into an overall table.
# Required input:
# - A number of per-sample tables as tsv files
#
#The output file will be a concatenated table with additional "Sample_name"
# The output file will be a concatenated table with additional "Sample_name"
# and "Reference" columns. Entering a name on the command-line is required.
#
# Example use:
# python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv
#IMPORT required libraries---------------------------------
# IMPORT required libraries---------------------------------
import pandas as pd
import argparse
#Define FUNCTIONS------------------------------------------
# Define FUNCTIONS------------------------------------------
def parse_arguments():
    """
    Parse the arguments from the command line, i.e.:
    -i/--input = input files (one or more count tables, one per sample)
    -o/--output = output file (tab-separated table)
    -h/--help = show help
    """
    parser = argparse.ArgumentParser(
        prog="concatenate mapped read counts",
        description="Concatenate mapped read count tables",
        usage="concatenate_mapped_read_counts.py -i [input] -o [output]"
        " [-h / --help]",
        # Built-in help is disabled; -h is registered manually below so
        # that the usage string's advertised [-h / --help] actually works.
        add_help=False,
    )
    required = parser.add_argument_group("Required arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (counts per sample).",
    )
    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )
    optional = parser.add_argument_group("Optional arguments")
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )
    # parse_known_args tolerates (and discards) unrecognised arguments
    (args, extra_args) = parser.parse_known_args()
    return args
def extract_sample_and_reference_name(filename):
    """
    Derive (sample, reference) from a mapped-read-counts file name.

    The base name (directory path stripped) is expected to look like
    "Mapped_read_counts-<sample>_to_<reference>[-paired|-unpaired].tsv".
    The sample is everything between the prefix and "_to_"; the
    reference is everything after "_to_", with the 4-character ".tsv"
    extension and any "-paired"/"-unpaired" suffix removed.
    Raises ValueError if "_to_" does not occur in the name.
    """
    # Keep only the base name; plain names pass through unchanged
    basename = filename.split("/")[-1] if "/" in filename else filename
    trimmed = basename.replace("Mapped_read_counts-", "")
    marker = trimmed.index("_to_")
    sample = trimmed[:marker]
    # Skip the 4-character "_to_" marker; strip the 4-character ".tsv"
    # extension from the end (assumed present)
    reference = trimmed[marker + 4 : -4]
    if "-unpaired" in reference:
        reference = reference.replace("-unpaired", "")
    elif "-paired" in reference:
        reference = reference.replace("-paired", "")
    else:
        # Best-effort warning: the reference is returned unmodified
        print(
            "There might be an unexpected suffix in the reference name:"
            " %s" % reference
        )
    return (sample, reference)
def main():
"""
Main execution of the script
"""
#1. Parse and show arguments
# 1. Parse and show arguments
arguments = parse_arguments()
message = ("\n"
message = (
"\n"
"These are the arguments you have provided:\n"
" INPUT:\n"
"{0},\n"
" OUTPUT:\n"
"{1}\n".format(arguments.input,
arguments.output))
"{1}\n".format(arguments.input, arguments.output)
)
print(message)
#2. Read input files and make into one dataframe
# 2. Read input files and make into one dataframe
concat_df = pd.DataFrame()
for file in arguments.input:
......@@ -107,12 +118,12 @@ def main():
concat_df = pd.concat([concat_df, df])
#3. Write table to a tsv file
concat_df.to_csv(arguments.output, sep = '\t',
index = False)
# 3. Write table to a tsv file
concat_df.to_csv(arguments.output, sep="\t", index=False)
return None
return(None)
# EXECUTE script--------------------------------------------
if __name__ == "__main__":
    main()
#! /usr/bin/env python3
# From the species-filtered Jovian output table, create fasta files per sample
# for each species. E.g. 'all_taxClassified-Escherichia_coli.tsv' becomes
# 'A-Escherichia_coli-scaffolds.fasta', 'B-Escherichia_coli-scaffolds.fasta',
# and 'C-Escherichia_coli-scaffolds.fasta' if your sample names are A, B and
# C.
#Resulting fasta files are written to the same directory as the input file.
# Resulting fasta files are written to the same directory as the input file.
#
#Example use:
# Example use:
# python create_per_sample_fasta.py -i all_taxClassified-Escherichia_coli.tsv
from pathlib import Path
import argparse
def parse_arguments():
    """
    Parse the arguments from the command line, i.e.:
    -i/--input = classified scaffolds table for the species of interest
    -s/--samples = sample names for which to generate output (necessary for snakemake)
    -h/--help = show help
    """
    parser = argparse.ArgumentParser(
        prog="create per sample fasta",
        description="Create a fasta file per sample with scaffolds from the specified species",
        usage="create_per_sample_fasta.py -i file -s samples [-h / --help]",
        # Built-in help is disabled so -h can live in its own "Optional
        # arguments" group below.
        add_help=False,
    )
    required = parser.add_argument_group("Required arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="file",
        required=True,
        type=str,
        help="Classified scaffolds table for species of interest.",
    )
    optional = parser.add_argument_group("Optional arguments")
    optional.add_argument(
        "-s",
        "--samples",
        dest="samples",
        metavar="samples",
        type=str,
        nargs="+",
        # NOTE(review): the default is the *string* "None", not the None
        # object — kept as-is for backward compatibility with callers.
        default="None",
        help="Samples for which to create fasta files.",
    )
    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )
    # parse_known_args tolerates (and discards) unrecognised arguments
    (args, extra_args) = parser.parse_known_args()
    return args
def read_and_create_fastas(input_file, samples):
"""
......@@ -63,90 +69,95 @@ def read_and_create_fastas(input_file, samples):
"""
species = input_file.split("-")[1][:-4]
#species is the second part of the file name, after the dash,
# species is the second part of the file name, after the dash,
# and excluding the extension (last 4 characters)
input_dir = Path(input_file).parent
samples_seen = []
#Keep a list of which samples have been seen, so that
# Keep a list of which samples have been seen, so that
# for each sample a new empty file can be initiated.
with open(input_file, 'r') as read_file:
next(read_file) #skip the header
with open(input_file, "r") as read_file:
next(read_file) # skip the header
for line in read_file:
line = line.split()
#split the line in separate elements
# split the line in separate elements
sample = line[0] #sample is the first element
scaffold_id = line[1] #scaffold id is the second element
sequence = line[-1] #sequence is the last element
sample = line[0] # sample is the first element
scaffold_id = line[1] # scaffold id is the second element
sequence = line[-1] # sequence is the last element
output_file = Path(input_dir / ("%s-%s-scaffolds.fasta" % (sample, species)))
output_file = Path(
input_dir / ("%s-%s-scaffolds.fasta" % (sample, species))
)
if not output_file.exists():
#If the file is not there yet, create an empty file
# If the file is not there yet, create an empty file
output_file.touch()
else:
#And if it does exist
# And if it does exist
if sample in samples_seen:
#Check whether this had been seen already
# Check whether this had been seen already
pass
else:
# and if not, empty the file and add it to the seen list
open(output_file, 'w').close()