From a78a4afb0af67e6e1eff5f579f1b51119fbd7a40 Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Tue, 28 Nov 2023 16:27:43 -0500
Subject: [PATCH] Adding new file to spreadsheet script and adding XlsxWriter
 dependency

---
 docker/blast/Dockerfile               |   1 +
 workflow/scripts/files2spreadsheet.py | 451 +++++++++++++++++++-------
 2 files changed, 330 insertions(+), 122 deletions(-)
diff --git a/docker/blast/Dockerfile b/docker/blast/Dockerfile
index cabe8c2..6df1336 100644
--- a/docker/blast/Dockerfile
+++ b/docker/blast/Dockerfile
@@ -84,6 +84,7 @@ RUN pip3 install --upgrade pip \
     && pip3 install matplotlib \
     && pip3 install multiqc==1.14 \
     && pip3 install pandas \
+    && pip3 install XlsxWriter \
     && pip3 install scipy
 
 # Perl fix issue
diff --git a/workflow/scripts/files2spreadsheet.py b/workflow/scripts/files2spreadsheet.py
index bb3ba19..f8bb204 100755
--- a/workflow/scripts/files2spreadsheet.py
+++ b/workflow/scripts/files2spreadsheet.py
@@ -1,143 +1,350 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
+# Standard library
 from __future__ import print_function
+import argparse, sys, os, textwrap
+
+
+# 3rd party packages,
+# installed from pypi
 import pandas as pd
 import xlsxwriter
-import sys
-import os
+
+# Script metadata
+__author__  = "Skyler Kuhn"
+__version__ = "v0.1.0"
+
+_help = textwrap.dedent(
+"""./file2spreadsheet.py:
+Creates an excel file from a list of input files.
+
+@Usage:
+    $ ./file2spreadsheet.py [-h] [--version] \\
+            [--rm-suffix RM_SUFFIX] \\
+            [--comment-symbol COMMENT_SYMBOL] \\
+            --input FILE_1 [FILE_2 ...] \\
+            --output OUTPUT
+
+@About:
+    Given a list of input files and an output file 
+    name, this script will create an excel spread-
+    sheet containing a tab/worksheet of each input
+    file. 
+    
+    By default, any lines starting with the comment 
+    character, '#', will not be skipped. Optionally,
+    a suffix string can be provided to remove any 
+    file extensions from each of the resulting tab
+    or worksheet names. By default, the file exten-
+    sion will only be removed from the when creating
+    the tab or worksheet name.
+
+@Required Arguments:
+    -i, --input FILE_1 [FILE_2 ...]
+                   One or more files to include in
+                   the excel spreadsheet. Multiple 
+                   files should be seperated by a
+                   white space character.
+    -o, --output OUTPUT  
+                   Output excel file name. The file 
+                   name of the resulting excel file. 
+                   Within the excel file, there will 
+                   be a tab for each provided input 
+                   file. 
+@Options
+    -r, --rm-suffix CLEAN_SUFFIX
+                    Removes the provided suffix the
+                    tab/worksheet name in the excel 
+                    file. By default, just the first
+                    extension is removed.
+    -c, --comment-symbol COMMENT_SYMBOL
+                    Set the character that represents
+                    comment lines. Lines starting with
+                    this character will be skipped. By
+                    default, all lines of the file will
+                    be included and nothing is skipped.
+                    
+    --h, --help     Shows this help message and exits.
+
+
+@Author: {0}
+@Version: {1}
+""".format(__author__, __version__)
+)
+
+
+def err(*message, **kwargs):
+    """Prints any provided args to standard error.
+    kwargs can be provided to modify print functions 
+    behavior.
+    @param message <any>:
+        Values printed to standard error
+    @params kwargs <print()>
+        Key words to modify print function behavior
+    """
+    print(*message, file=sys.stderr, **kwargs)
+
+
+def fatal(*message, **kwargs):
+    """Prints any provided args to standard error
+    and exits with an exit code of 1.
+    @param message <any>:
+        Values printed to standard error
+    @params kwargs <print()>
+        Key words to modify print function behavior
+    """
+    err(*message, **kwargs)
+    sys.exit(1)
+
+
+def parse_arguments():
+    """Collect and parse command line arguments."""
+    # Sanity check for usage
+    if len(sys.argv) == 1:
+        # Nothing was provided
+        fatal('Invalid usage: file2spreadsheet.py [-h] ...')
+
+    # Parse command-line arguments
+    # Create a top-level parser
+    parser = argparse.ArgumentParser(
+        usage = argparse.SUPPRESS,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description = _help,
+        add_help=False
+    )
+
+    # Required Positional Arguments
+    # List of input files
+    parser.add_argument(
+        '-i',
+        '--input',
+        required=True,
+        nargs = '+',
+        help = argparse.SUPPRESS
+    )
+    # Output file name
+    parser.add_argument(
+        '-o',
+        '--output',
+        required=True,
+        type=str,
+        help = argparse.SUPPRESS
+    )
+
+    # Options
+    # Skip lines with comment
+    # chars, i.e "#" symbols
+    parser.add_argument(
+        '-c',
+        '--comment-symbol', 
+        required=False,
+        default = None,
+        type=str,
+        help = argparse.SUPPRESS
+    )
+
+    # Remove suffix from 
+    # tab/worksheet name
+    parser.add_argument(
+        '-r',
+        '--rm-suffix', 
+        required=False,
+        default='',
+        type=str,
+        help = argparse.SUPPRESS
+    )
+
+    # Add custom help message
+    parser.add_argument(
+        '-h', '--help', 
+        action='help', 
+        help=argparse.SUPPRESS
+    )
+
+    # Collect parsed arguments
+    args = parser.parse_args()
+
+    return args 
 
 
 def reader(filename, subset=[], skip='#', **kwargs):
-	"""Reads in an MAF-like file as a dataframe. Determines the 
-	correct handler for reading in a given MAF file. Supports reading
-	in TSV files (.tsv, .txt, .text, .vcf, or .maf), CSV files (.csv), 
-	and excel files (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt ). 
-	The subset option allows a users to only select a few columns 
-	given a list of column names.
-	@param filename <str>:
-		Path of an MAF-like file to read and parse
-	@param subset list[<str>]:
-		List of column names which can be used to subset the df
-	@param skip <str>:
-		Skips over line starting with this character
-	@params kwargs <read_excel()>
-		Key words to modify pandas.read_excel() function behavior
-	@return <pandas dataframe>:
-		dataframe with spreadsheet contents
-	"""
-	# Get file extension
-	extension = os.path.splitext(filename)[-1].lower()
-
-	# Assign a handler to read in the file
-	if extension in ['.xls', '.xlsx', '.xlsm', '.xlsb', '.odf', '.ods', '.odt']:
-		# Read in as an excel file
-		return excel(filename, subset, skip, **kwargs)
-	elif extension in ['.csv']:
-		# Read in as an CSV file
-		return csv(filename, subset, skip, **kwargs)
-	else:
-		# Default to reading in as an TSV file
-		# Tab is the normal delimeter for MAF or VCF files
-		# MAF files usually have one of the following
-		# extensions: '.tsv', '.txt', '.text', '.vcf', '.maf'
-		return tsv(filename, subset, skip, **kwargs)
+    """Reads in an MAF-like file as a dataframe. Determines the 
+    correct handler for reading in a given MAF file. Supports reading
+    in TSV files (.tsv, .txt, .text, .vcf, or .maf), CSV files (.csv), 
+    and excel files (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt ). 
+    The subset option allows a users to only select a few columns 
+    given a list of column names.
+    @param filename <str>:
+        Path of an MAF-like file to read and parse
+    @param subset list[<str>]:
+        List of column names which can be used to subset the df
+    @param skip <str>:
+        Skips over line starting with this character
+    @params kwargs <read_excel()>
+        Key words to modify pandas.read_excel() function behavior
+    @return <pandas dataframe>:
+        dataframe with spreadsheet contents
+    """
+    # Get file extension
+    extension = os.path.splitext(filename)[-1].lower()
+
+    # Assign a handler to read in the file
+    if extension in ['.xls', '.xlsx', '.xlsm', '.xlsb', '.odf', '.ods', '.odt']:
+        # Read in as an excel file
+        return excel(filename, subset, skip, **kwargs)
+    elif extension in ['.csv']:
+        # Read in as an CSV file
+        return csv(filename, subset, skip, **kwargs)
+    else:
+        # Default to reading in as an TSV file
+        # Tab is the normal delimeter for MAF or VCF files
+        # MAF files usually have one of the following
+        # extensions: '.tsv', '.txt', '.text', '.vcf', '.maf'
+        return tsv(filename, subset, skip, **kwargs)
 
 
 def excel(filename, subset=[], skip='#', **kwargs):
-	"""Reads in an excel file as a dataframe. The subset option
-	allows a users to only select a few columns given a list of 
-	column names.
-	@param filename <str>:
-		Path of an EXCEL file to read and parse
-	@param subset list[<str>]:
-		List of column names which can be used to subset the df
-	@param skip <str>:
-		Skips over line starting with this character
-	@params kwargs <read_excel()>
-		Key words to modify pandas.read_excel() function behavior
-	@return <pandas dataframe>:
-		dataframe with spreadsheet contents
-	"""
-	if subset:
-		return pd.read_excel(filename, comment=skip, **kwargs)[subset]
-
-	return pd.read_excel(filename, comment=skip, **kwargs)
+    """Reads in an excel file as a dataframe. The subset option
+    allows a users to only select a few columns given a list of 
+    column names.
+    @param filename <str>:
+        Path of an EXCEL file to read and parse
+    @param subset list[<str>]:
+        List of column names which can be used to subset the df
+    @param skip <str>:
+        Skips over line starting with this character
+    @params kwargs <read_excel()>
+        Key words to modify pandas.read_excel() function behavior
+    @return <pandas dataframe>:
+        dataframe with spreadsheet contents
+    """
+    if subset:
+        return pd.read_excel(filename, comment=skip, **kwargs)[subset]
+
+    return pd.read_excel(filename, comment=skip, **kwargs)
 
 
 def tsv(filename, subset=[], skip='#', **kwargs):
-	"""Reads in an TSV file as a dataframe. The subset option
-	allows a users to only select a few columns given a list of 
-	column names.
-	@param filename <str>:
-		Path of an TSV file to read and parse
-	@param subset list[<str>]:
-		List of column names which can be used to subset the df
-	@param skip <str>:
-		Skips over line starting with this character
-	@params kwargs <read_excel()>
-		Key words to modify pandas.read_excel() function behavior
-	@return <pandas dataframe>:
-		dataframe with spreadsheet contents
-	"""
-	if subset:
-		return pd.read_table(filename, comment=skip, **kwargs)[subset]
-
-	return pd.read_table(filename, comment=skip, **kwargs)
+    """Reads in an TSV file as a dataframe. The subset option
+    allows a users to only select a few columns given a list of 
+    column names.
+    @param filename <str>:
+        Path of an TSV file to read and parse
+    @param subset list[<str>]:
+        List of column names which can be used to subset the df
+    @param skip <str>:
+        Skips over line starting with this character
+    @params kwargs <read_excel()>
+        Key words to modify pandas.read_excel() function behavior
+    @return <pandas dataframe>:
+        dataframe with spreadsheet contents
+    """
+    if subset:
+        return pd.read_table(filename, comment=skip, **kwargs)[subset]
+
+    return pd.read_table(filename, comment=skip, **kwargs)
 
 
 def csv(filename, subset=[], skip='#', **kwargs):
-	"""Reads in an CSV file as a dataframe. The subset option
-	allows a users to only select a few columns given a list of 
-	column names.
-	@param filename <str>:
- 		Path of an CSV file to read and parse
-	@param subset list[<str>]:
-		List of column names which can be used to subset the df
-	@param skip <str>:
-		Skips over line starting with this character
-	@params kwargs <read_excel()>
-		Key words to modify pandas.read_excel() function behavior
-	@return <pandas dataframe>:
-		dataframe with spreadsheet contents
-	"""
-	if subset:
-		return pd.read_csv(filename, comment=skip, **kwargs)[subset]
-
-	return pd.read_csv(filename, comment=skip, **kwargs)
-
-
-def excel_writer(files, spreadsheet= 'test.xlsx'):
-	"""Takes a list of files and creates one excel spreadsheet.
-	Each file will becomes a sheet in the spreadsheet where the 
-	name of the sheet is the basename of the file with the extension
-	removed.
-	@param files list[<str>]:
-    	List of files to merge into one execl file
-	@param spreadsheet <str>:
-    	Output filename of the spreadsheet
-	"""
-
-	writer = pd.ExcelWriter(spreadsheet, engine='xlsxwriter')
-	
-	# Create a spreadsheet from the contents of each file
-	for file in files:
-		print('Reading in {}'.format(file))
-		df = reader(file)
-		sheet = os.path.splitext(os.path.basename(file))[0]
-		try:
-			# Sheet name cannot exceed 31 characters in length
-			df.to_excel(writer, sheet_name = sheet, index = False, freeze_panes = (1,0))
-		except xlsxwriter.exceptions.InvalidWorksheetName as e:
-			df.to_excel(writer, sheet_name = sheet[:31], index = False,freeze_panes = (1,0))
-
-	writer.save()
+    """Reads in an CSV file as a dataframe. The subset option
+    allows a users to only select a few columns given a list of 
+    column names.
+    @param filename <str>:
+         Path of an CSV file to read and parse
+    @param subset list[<str>]:
+        List of column names which can be used to subset the df
+    @param skip <str>:
+        Skips over line starting with this character
+    @params kwargs <read_excel()>
+        Key words to modify pandas.read_excel() function behavior
+    @return <pandas dataframe>:
+        dataframe with spreadsheet contents
+    """
+    if subset:
+        return pd.read_csv(filename, comment=skip, **kwargs)[subset]
 
+    return pd.read_csv(filename, comment=skip, **kwargs)
 
-if __name__ == '__main__':
 
-	# List of file to convert into an excel file
-	files = sys.argv[1:-1]
-	# Output file name 
-	outfh = sys.argv[-1]
+def excel_writer(files, spreadsheet,  skip_comments=None, remove_suffix = ''):
+    """Takes a list of files and creates one excel spreadsheet.
+    Each file will becomes a sheet in the spreadsheet where the 
+    name of the sheet is the basename of the file with the extension
+    removed.
+    @param files list[<str>]:
+        List of files to merge into one execl file
+    @param spreadsheet <str>:
+        Output filename of the spreadsheet
+    """
+    # Create output directory as needed
+    outdir = os.path.dirname(os.path.abspath(spreadsheet))
+    if not os.path.exists(outdir):
+        # Pipeline output directory
+        # does not exist on filesystem
+        os.makedirs(outdir)
+    
+    # Create a spreadsheet from the contents of each file
+    with pd.ExcelWriter(spreadsheet, engine='xlsxwriter') as writer:
+        for file in files:
+            print('Reading in {}'.format(file))
+            df = reader(file, skip=skip_comments)
+            # Removing leading # char from header
+            # df.rename(columns={df.columns[0]: "{0}".format(df.columns[0].lstrip('#'))})
+            sheet, ext = os.path.splitext(os.path.basename(file))
+            if remove_suffix:
+                # Default behavior is to always
+                # remove the file extension, how-
+                # ever, if a user provides an 
+                # additional suffix to remove
+                # we must take into account the
+                # extension has already been
+                # removed prior to 
+                cleaned_suffix, ext = os.path.splitext(remove_suffix)
+                sheet = sheet.split(cleaned_suffix)[0]
+            
+            # Sheet name cannot exceed 
+            # 31 characters in length
+            df.to_excel(writer, sheet_name = sheet[:31], index = False, freeze_panes = (1,0))
+            # Automagically set the width
+            # of a column to its max width
+            worksheet = writer.sheets[sheet]  # pull worksheet object
+            for idx, col in enumerate(df):  # loop through all columns
+                series = df[col]
+                max_len = max(
+                    (
+                        series.astype(str).map(len).max(),  # len of largest item
+                        len(str(series.name))  # len of column name/header
+                    )
+                ) + 2  # adding a little extra space
+                worksheet.set_column(idx, idx, max_len)  # set column width
+
+
+def main():
+    # Parse command-line arguments
+    args = parse_arguments()
+
+    # List of files to convert into 
+    # an excel spreasheet
+    inputs = args.input
+    # Output file name 
+    output = args.output
 
-	excel_writer(files, outfh)
+    # Skip comment lines
+    comment_char = args.comment_symbol
+    # Remove suffix from 
+    # X tab/worksheet name 
+    rm_suffix = args.rm_suffix
+
+    # Create XLSX file from the list
+    # of input files 
+    excel_writer(
+        files=inputs,
+        spreadsheet=output,
+        skip_comments=comment_char,
+        remove_suffix=rm_suffix
+    )
+
+
+if __name__ == '__main__':
+    # Call main method
+    main()
\ No newline at end of file