Commit c8aa56d3 authored by Sam Nooij
Browse files

Reformat Python code with Black

parent 6ce6ee4e
#! /usr/bin/env python3 #! /usr/bin/env python3
#Collect scaffold information for a species filtered from Jovian's # Collect scaffold information for a species filtered from Jovian's
# classification table. (Which scaffolds, their length and number # classification table. (Which scaffolds, their length and number
# of reads mapped to them.) # of reads mapped to them.)
# #
# Example use: # Example use:
...@@ -9,19 +9,19 @@ ...@@ -9,19 +9,19 @@
# -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \ # -i data/all_taxClassified-Escherichia_coli.tsv data/Mapped_read_counts.tsv \
# -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \ # -o results/Escherichia_coli-scaffolds_per_sample.tsv results/Escherichia_coli-stats_per_sample.tsv \
# #
# Two required arguments are:
# -i/--input for the two required tables (see example) # -i/--input for the two required tables (see example)
# -o/--output for the result tables (see example) # -o/--output for the result tables (see example)
## Import required libraries ## Import required libraries
import sys #to abort the script when an error occurs import sys # to abort the script when an error occurs
import argparse #to parse command-line arguments import argparse # to parse command-line arguments
import pandas as pd #to work with dataframes (tabular data) import pandas as pd # to work with dataframes (tabular data)
import numpy as np #for calculations (summing contig stats) import numpy as np # for calculations (summing contig stats)
import re #to extract sample names from multiqc table import re # to extract sample names from multiqc table
## Define functions: ## Define functions:
#1. Parse command-line arguments # 1. Parse command-line arguments
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = 2 input files
     -o/--output = 2 output files
     -h/--help = show help

    argv is an optional list of argument strings. When it is None (the
    default), argparse reads the arguments from sys.argv[1:], which is
    what happens when the script is run from the command line.

    Returns the argparse.Namespace with the attributes `input` and
    `output`, each a list of two strings. Arguments that the parser does
    not recognise are ignored (parse_known_args), not reported as errors.
    """
    parser = argparse.ArgumentParser(
        prog="collect species statistics",
        description="Collect scaffold statistics for each species filtered from Jovian's results",
        usage="collect_species_statistics.py -i file1 file2 -o file3 file4"
        " [-h / --help]",
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="file1 file2",
        required=True,
        nargs=2,
        type=str,
        help="Classified scaffolds table and mapped read counts.",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="file3 file4",
        required=True,
        nargs=2,
        type=str,
        help="Output tables (per scaffold, and summed statistics).",
    )

    # add_help=False above, so the help option is registered by hand to
    # make it show up in its own "Optional arguments" group.
    optional = parser.add_argument_group("Optional arguments")

    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # Unrecognised arguments are deliberately discarded.
    args, _ = parser.parse_known_args(argv)

    return args
def main():
    """
    Main execution of the script:
     1. parse and print the command-line arguments
     2. read the two tab-separated input tables
     3. left-merge them on "scaffold_name" and "Sample_name"
     4. keep and rename the columns of interest, write the per-scaffold
        table to the first output file
     5. sum the statistics per sample, write them to the second output file
    """
    # 1. Parse and print command-line arguments
    arguments = parse_arguments()

    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0}\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )

    print(message)

    # 2. Open input files
    species_scaffolds_df = pd.read_csv(arguments.input[0], sep="\t")

    mapped_reads_df = pd.read_csv(arguments.input[1], sep="\t")

    # 3. Merge input files together
    merged_df = pd.merge(
        species_scaffolds_df,
        mapped_reads_df,
        on=["scaffold_name", "Sample_name"],
        how="left",
    )

    # 4. Extract columns of interest and rename them.
    # rename() returns a new dataframe, so nothing is modified in place on
    # a slice of the merged table (avoids pandas' SettingWithCopyWarning).
    quantified_scaffolds_df = merged_df[
        ["Sample_name", "scaffold_name", "Length", "mapped_reads"]
    ].rename(
        columns={
            "Sample_name": "Sample",
            "scaffold_name": "Scaffold",
            "Length": "Scaffold_length",
            "mapped_reads": "Mapped_reads",
        }
    )

    quantified_scaffolds_df.to_csv(arguments.output[0], sep="\t", index=False)

    # 5. Aggregate statistics per sample
    # Each row counts as one scaffold. The numeric columns are selected
    # before summing, so the "Scaffold" name column is never aggregated.
    statistics_per_sample = (
        quantified_scaffolds_df.assign(Number_of_scaffolds=1)
        .groupby("Sample")[
            ["Number_of_scaffolds", "Scaffold_length", "Mapped_reads"]
        ]
        .sum()
    )

    statistics_per_sample.to_csv(arguments.output[1], sep="\t", index=True)
    # requires index for sample names

    return None
## Execute script
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#! /usr/bin/env python3 #! /usr/bin/env python3
#Concatenate per-sample tables of depth of coverage into an overall table. # Concatenate per-sample tables of depth of coverage into an overall table.
#Required input: # Required input:
# - A number of per-sample tables as tsv files # - A number of per-sample tables as tsv files
# #
#The output file will be a concatenated table with additional "Sample_name" # The output file will be a concatenated table with additional "Sample_name"
# column. Entering a name on the command-line is required. # column. Entering a name on the command-line is required.
# #
#Example use: # Example use:
# python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv # python concatenate_depth_tables.py -i Depth_of_coverage-sample1-paired.tsv Depth_of_coverage-sample2-paired.tsv Depth_of_coverage-sample3-paired.tsv -o Depth_of_coverage-paired.tsv
#IMPORT required libraries--------------------------------- # IMPORT required libraries---------------------------------
import pandas as pd import pandas as pd
import argparse import argparse
#Define FUNCTIONS------------------------------------------ # Define FUNCTIONS------------------------------------------
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = input files (one or more tab-separated tables)
     -o/--output = output file (tab-separated table)
     -h/--help = show help

    argv is an optional list of argument strings. When it is None (the
    default), argparse reads the arguments from sys.argv[1:].

    Returns the argparse.Namespace with `input` (list of strings) and
    `output` (string). Arguments that the parser does not recognise are
    ignored (parse_known_args), not reported as errors.
    """
    parser = argparse.ArgumentParser(
        prog="concatenate depth tables",
        description="Concatenate depth of coverage tables",
        usage="concatenate_depth_tables.py -i [input] -o [output]"
        " [-h / --help]",
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (depth of coverage per sample).",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )

    # add_help=False disables argparse's automatic -h/--help, so the option
    # advertised in the usage string has to be registered explicitly;
    # otherwise "-h" is swallowed as an unknown argument.
    optional = parser.add_argument_group("Optional arguments")

    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # Unrecognised arguments are deliberately discarded.
    args, _ = parser.parse_known_args(argv)

    return args
def extract_sample_name(filename):
    """
    Extract the sample name from a depth of coverage file name.

    Any directory path is removed first. To extract the sample name,
    remove "Depth_of_coverage-", and everything from the first "_to_"
    until the end.

    Raises a ValueError when the file name does not contain "_to_".
    """
    # If the directory path is attached to the filename, remove it
    without_path = filename.rsplit("/", 1)[-1]

    without_prefix = without_path.replace("Depth_of_coverage-", "")

    # partition splits at the first "_to_", like slicing up to index("_to_")
    sample, separator, _ = without_prefix.partition("_to_")

    if not separator:
        raise ValueError(
            "Cannot extract a sample name from '%s': "
            "the file name does not contain '_to_'" % filename
        )

    return sample
def main():
    """
    Main execution of the script:
     1. parse and show the command-line arguments
     2. read each input table and add a "Sample_name" column derived
        from its file name
     3. write the concatenated table to the output file
    """
    # 1. Parse and show arguments
    arguments = parse_arguments()

    message = (
        "\n"
        "These are the arguments you have provided:\n"
        " INPUT:\n"
        "{0},\n"
        " OUTPUT:\n"
        "{1}\n".format(arguments.input, arguments.output)
    )

    print(message)

    # 2. Read input files and make into one dataframe
    # The tables are collected first and concatenated once, instead of
    # re-copying the growing table for every input file.
    per_sample_tables = []

    for file in arguments.input:
        sample = extract_sample_name(file)
        df = pd.read_csv(file, sep="\t")
        df["Sample_name"] = sample
        per_sample_tables.append(df)

    concat_df = pd.concat(per_sample_tables)

    # 3. Write table to a tsv file
    concat_df.to_csv(arguments.output, sep="\t", index=False)

    return None
# EXECUTE script--------------------------------------------
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#! /usr/bin/env python3 #! /usr/bin/env python3
#Concatenate per-sample tables of read counts into an overall table. # Concatenate per-sample tables of read counts into an overall table.
#Required input: # Required input:
# - A number of per-sample tables as tsv files # - A number of per-sample tables as tsv files
# #
#The output file will be a concatenated table with additional "Sample_name" # The output file will be a concatenated table with additional "Sample_name"
# and "Reference" columns. Entering a name on the command-line is required. # and "Reference" columns. Entering a name on the command-line is required.
# #
#Example use: # Example use:
# python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv # python concatenate_read_counts.py -i sample1-paired.tsv sample2-paired.tsv sample3-paired.tsv -o All_samples-paired.tsv
#IMPORT required libraries--------------------------------- # IMPORT required libraries---------------------------------
import pandas as pd import pandas as pd
import argparse import argparse
#Define FUNCTIONS------------------------------------------ # Define FUNCTIONS------------------------------------------
def parse_arguments(argv=None):
    """
    Parse the arguments from the command line, i.e.:
     -i/--input = input files (one or more tab-separated tables)
     -o/--output = output file (tab-separated table)
     -h/--help = show help

    argv is an optional list of argument strings. When it is None (the
    default), argparse reads the arguments from sys.argv[1:].

    Returns the argparse.Namespace with `input` (list of strings) and
    `output` (string). Arguments that the parser does not recognise are
    ignored (parse_known_args), not reported as errors.
    """
    parser = argparse.ArgumentParser(
        prog="concatenate mapped read counts",
        description="Concatenate mapped read count tables",
        usage="concatenate_mapped_read_counts.py -i [input] -o [output]"
        " [-h / --help]",
        add_help=False,
    )

    required = parser.add_argument_group("Required arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="input",
        metavar="",
        required=True,
        type=str,
        nargs="+",
        help="List of input files (counts per sample).",
    )

    required.add_argument(
        "-o",
        "--output",
        dest="output",
        metavar="",
        required=True,
        type=str,
        help="Output file name (and directory).",
    )

    # add_help=False disables argparse's automatic -h/--help, so the option
    # advertised in the usage string has to be registered explicitly;
    # otherwise "-h" is swallowed as an unknown argument.
    optional = parser.add_argument_group("Optional arguments")

    optional.add_argument(
        "-h", "--help", action="help", help="Show this message and exit."
    )

    # Unrecognised arguments are deliberately discarded.
    args, _ = parser.parse_known_args(argv)

    return args
def extract_sample_and_reference_name(filename):
    """
    Extract the sample and reference names from a read count file name.

    Any directory path is removed first. To extract the sample name,
    remove "Mapped_read_counts-", and everything from the first "_to_"
    until the end. The reference name is what follows that "_to_",
    without the last 4 characters (the assumed ".tsv" extension) and
    without a trailing "-paired" or "-unpaired" suffix.

    Returns a tuple (sample, reference).
    Raises a ValueError when the file name does not contain "_to_".
    """
    # If the directory path is attached to the filename, remove it
    without_path = filename.rsplit("/", 1)[-1]

    without_prefix = without_path.replace("Mapped_read_counts-", "")

    # partition splits at the first "_to_", like slicing around index("_to_")
    sample, separator, remainder = without_prefix.partition("_to_")

    if not separator:
        raise ValueError(
            "Cannot extract sample and reference names from '%s': "
            "the file name does not contain '_to_'" % filename
        )

    # Assume the .tsv file extension; hence strip the last 4 characters
    reference = remainder[:-4]

    # Only a trailing suffix is removed, so a reference name that contains
    # "-paired" or "-unpaired" elsewhere is left intact.
    if reference.endswith("-unpaired"):
        reference = reference[: -len("-unpaired")]
    elif reference.endswith("-paired"):
        reference = reference[: -len("-paired")]
    else:
        print(
            "There might be an unexpected suffix in the reference name:"
            " %s" % reference
        )

    return (sample, reference)
def main(): def main():
""" """
Main execution of the script Main execution of the script
""" """
#1. Parse and show arguments # 1. Parse and show arguments
arguments = parse_arguments() arguments = parse_arguments()
message = ("\n" message = (
"These are the arguments you have provided:\n" "\n"
" INPUT:\n" "These are the arguments you have provided:\n"
"{0},\n" " INPUT:\n"
" OUTPUT:\n" "{0},\n"
"{1}\n".format(arguments.input, " OUTPUT:\n"
arguments.output)) "{1}\n".format(arguments.input, arguments.output)
)
print(message)