diff --git a/Multiplex_Analysis_Web_Apps.py b/Multiplex_Analysis_Web_Apps.py index e1a2442d..033032b3 100644 --- a/Multiplex_Analysis_Web_Apps.py +++ b/Multiplex_Analysis_Web_Apps.py @@ -3,6 +3,7 @@ ''' import os import re +import logging import subprocess import numpy as np @@ -12,41 +13,45 @@ import nidap_dashboard_lib as ndl # Useful functions for dashboards connected to NIDAP import streamlit_utils import platform_io -import install_missing_packages - -install_missing_packages.live_package_installation() # Note if any of the following imports having " # slow" are not commented out, there is a delay in running the forking test -from pages2 import data_import_and_export -from pages2 import datafile_format_unifier -from pages2 import open_file -from pages2 import feature_creation -from pages2 import robust_scatter_plotter -from pages2 import multiaxial_gating -from pages2 import thresholded_phenotyping # slow due to things ultimately importing umap -from pages2 import adaptive_phenotyping -from pages2 import Pheno_Cluster_a # "slow" for forking test initialization -from pages2 import Pheno_Cluster_b # "slow" for forking test initialization -from pages2 import Tool_parameter_selection -from pages2 import Run_workflow -from pages2 import Display_individual_ROI_heatmaps -from pages2 import Display_average_heatmaps -from pages2 import Display_average_heatmaps_per_annotation -from pages2 import Display_ROI_P_values_overlaid_on_slides -from pages2 import Neighborhood_Profiles # slow due to things ultimately importing umap -from pages2 import UMAP_Analyzer # slow due to things ultimately importing umap -from pages2 import Clusters_Analyzer # slow due to things ultimately importing umap -from pages2 import memory_analyzer -from pages2 import radial_bins_plots -from pages2 import radial_profiles_analysis -from pages2 import preprocessing -from pages2 import results_transfer -# from pages2 import forking_test - +from pages import data_import_and_export +from pages import datafile_format_unifier +from pages import open_file +from pages import feature_creation +from pages import robust_scatter_plotter +from pages import multiaxial_gating +from pages import thresholded_phenotyping # slow due to things ultimately importing umap +from pages import adaptive_phenotyping +from pages import Pheno_Cluster_a # "slow" for forking test initialization +from pages import Pheno_Cluster_b # "slow" for forking test initialization +from pages import Tool_parameter_selection +from pages import Run_workflow +from pages import Display_individual_ROI_heatmaps +from pages import Display_average_heatmaps +from pages import Display_average_heatmaps_per_annotation +from pages import Display_ROI_P_values_overlaid_on_slides +from pages import Neighborhood_Profiles # slow due to things ultimately importing umap +from pages import UMAP_Analyzer # slow due to things ultimately importing umap +from pages import Clusters_Analyzer # slow due to things ultimately importing umap +from pages import memory_analyzer +from pages import radial_bins_plots +from pages import radial_profiles_analysis +from pages import preprocessing +from pages import results_transfer +# from pages import forking_test + +# Configure logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) def welcome_page(): ''' - First page displayed when the app opens + First page displayed when the app opens. + + This requires some extra work to make the markdown rendering + work properly. 
''' # Markdown text with open("markdown/MAWA_WelcomePage.md", "r", encoding="utf-8") as f: @@ -73,13 +78,20 @@ def check_for_platform(session_state): ''' Set the platform parameters based on the platform the Streamlit app is running on ''' + # Initialize the platform object if 'platform' not in session_state: + logger.info('Platform initialization starting.') + session_state['platform'] = platform_io.Platform(platform=('nidap' if platform_is_nidap() else 'local')) + logger.info('Platform initialization complete.') return session_state def main(): + ''' + Main function for running the Multiplex Analysis Web Apps + ''' st.set_page_config(layout="wide") @@ -143,9 +155,11 @@ def main(): # Ensure the input/output directories exist input_path = './input' if not os.path.exists(input_path): + logger.info("Creating input directory at %s", input_path) os.makedirs(input_path) output_path = './output' if not os.path.exists(output_path): + logger.info("Creating output directory at %s", output_path) os.makedirs(output_path) # For widget persistence, we need always copy the session state to itself, being careful with widgets that cannot be persisted, like st.data_editor() (where we use the "__do_not_persist" suffix to avoid persisting it) @@ -173,16 +187,18 @@ def main(): # Initalize session_state values for streamlit processing if 'init' not in st.session_state: + logger.info("Initializing session state") st.session_state = ndl.init_session_state(st.session_state) # Sidebar organization with st.sidebar: - st.write('**:book: [Documentation](https://ncats.github.io/multiplex-analysis-web-apps/)**') + st.write('**📖 [Documentation](https://ncats.github.io/multiplex-analysis-web-apps/)**') with st.expander('Advanced:'): - benchmark_button = True - if benchmark_button: - st.button('Record Benchmarking', on_click = st.session_state.bc.save_run_to_csv) + if st.button('Record Benchmarking'): + logger.info("Recording benchmark information") + st.session_state.bc.save_run_to_csv() if st.button('Calculate memory used by Python session'): + logger.info("Calculating memory used by Python session") streamlit_utils.write_python_session_memory_usage() # Check the platform diff --git a/basic_phenotyper_lib.py b/basic_phenotyper_lib.py index 90d06e5b..23f12907 100644 --- a/basic_phenotyper_lib.py +++ b/basic_phenotyper_lib.py @@ -100,26 +100,32 @@ def init_pheno_cols(df, marker_names, marker_col_prefix): df_markers = df[marker_cols] df_markers = df_markers.map(lambda x: {'+': '1', '-': '0'}[x[-1]]) - df['mark_bits'] = df_markers.astype(str).apply(''.join, axis='columns') # efficiently create a series of strings that are the columns (in string format) concatenated together + + # Vectorized creation of 'mark_bits' + df['mark_bits'] = df_markers.astype(str).agg(''.join, axis=1) # Add a column of prettier names for the species, e.g., 'VIM- ECAD+ COX2+ NOS2-' - df['species_name_long'] = df['mark_bits'].apply(lambda mark_bits: ' '.join([marker_name + ('+' if marker_bit == '1' else '-') for marker_name, marker_bit in zip(marker_names, mark_bits)])) - - # Add a column dropping the negative markers from these pretty names, e.g., 'ECAD+ COX2+' - def species_name_long_to_short(species_name_long): - x = '+ '.join([marker_names[iy] for iy, y in enumerate([x for x in species_name_long if x in ('+', '-')]) if y == '+']) + '+' - species_name_short = x if len(x) != 1 else 'Other' - return species_name_short - # This can possibly be made faster (if it's correct) via but I haven't tested it: - # marker_indices = [i for i, x in 
enumerate(species_name_long) if x == '+'] - # if not marker_indices: - # return 'Other' - # return ' + '.join(marker_names[i] for i in marker_indices) + '+' - df['species_name_short'] = df['species_name_long'].apply(species_name_long_to_short) - - # Create a new column called 'has pos mark' identifying which species_name_shorts are not Other - df['has_pos_mark'] = True - df.loc[df['species_name_short'] == 'Other', 'has_pos_mark'] = False + df['species_name_long'] = df['mark_bits'].apply( + lambda mark_bits: ' '.join( + [f"{marker_name}{'+' if bit == '1' else '-'}" for marker_name, bit in zip(marker_names, mark_bits)] + ) + ) + + # # Add a column dropping the negative markers from these pretty names, e.g., 'ECAD+ COX2+' + # def species_name_long_to_short(species_name_long): + # x = '+ '.join([marker_names[iy] for iy, y in enumerate([x for x in species_name_long if x in ('+', '-')]) if y == '+']) + '+' + # species_name_short = x if len(x) != 1 else 'Other' + # return species_name_short + # # This can possibly be made faster (if it's correct) via but I haven't tested it: + # # marker_indices = [i for i, x in enumerate(species_name_long) if x == '+'] + # # if not marker_indices: + # # return 'Other' + # # return ' + '.join(marker_names[i] for i in marker_indices) + '+' + # df['species_name_short'] = df['species_name_long'].apply(species_name_long_to_short) + # Vectorized extraction of the positive markers; rows with no positive markers are absent from extractall's result, so reindex against df before filling them with 'Other' (this also preserves the original 'ECAD+ COX2+' formatting) + positive_markers = df['species_name_long'].str.extractall(r'(\w+)\+')[0].groupby(level=0).agg('+ '.join) + '+' + df['species_name_short'] = positive_markers.reindex(df.index).fillna('Other') + + # Create a new column called 'has pos mark' identifying which species_name_shorts are not "Other" + df['has_pos_mark'] = df['species_name_short'] != 'Other' # Create phenotype column and assign a value of 'unassigned' df['phenotype'] = 'unassigned' @@ -143,36 +149,15 @@ def init_pheno_assign(df): of each "exclusive" species ''' - st_init_species = time.time() - spec_summ = df[['species_name_short', 'phenotype', 'species_name_long']] - sp_init_species = time.time() - elapsed = round(sp_init_species - st_init_species, 3) - print(f' Initalizing Phenotying Assignments: {elapsed}s') - - # This line seems to throw a TypeError: unhashable type: 'numpy.ndarray' error - spec_summ['species_count'] = spec_summ['species_name_short'].groupby(spec_summ['species_name_short']).transform('count') - spec_summ = spec_summ.drop_duplicates().reset_index(drop=True) + spec_summ = df[['species_name_short', 'phenotype', 'species_name_long']].copy() - # The above seems a bit inefficient and should probably be replaced with something like this: - # spec_summ = spec_summ['species_name_short'].value_counts().reset_index() - # spec_summ.columns = ['species_name_short', 'species_count'] + species_counts = spec_summ['species_name_short'].value_counts().reset_index() + species_counts.columns = ['species_name_short', 'species_count'] - sp_species_count = time.time() - elapsed_counts = round(sp_species_count - sp_init_species, 3) - print(f' Phenotying Assignments Counts Calculations: {elapsed_counts}s') + spec_summ = spec_summ.drop_duplicates(subset=['species_name_short']).merge(species_counts, on='species_name_short') + spec_summ['species_percent'] = (spec_summ['species_count'] / spec_summ['species_count'].sum() * 100).round(2) - spec_summ['species_percent'] = [round(100*x/sum(spec_summ['species_count']), 2) for x in spec_summ['species_count']] - sp_species_per = time.time() - elapsed_per = round(sp_species_per - sp_species_count, 3) - print(f' Phenotying Assignments Percents Calculations: {elapsed_per}s') - - spec_summ =
spec_summ.sort_values(by='species_count', ascending= False).reset_index(drop=True) - sp_species_sort = time.time() - elapsed_sort = round(sp_species_sort - sp_species_per, 3) - print(f' Phenotying Assignments sorting: {elapsed_sort}s') - - # Return the created dataframe - return spec_summ + return spec_summ.sort_values(by='species_count', ascending=False).reset_index(drop=True) def init_pheno_summ(df): '''For each unique species (elsewhere called "exclusive" phenotyping), @@ -187,11 +172,21 @@ def init_pheno_summ(df): each "exclusive" species ''' - assign_pheno = df[['phenotype', 'species_name_short', 'species_name_long']].groupby(by='phenotype', as_index = False).agg(lambda x: np.unique(list(x))) + # Group by phenotype and aggregate unique values for species_name_short and species_name_long + assign_pheno = df.groupby('phenotype', as_index=False).agg({ + 'species_name_short': lambda x: ', '.join(str(val) for val in pd.unique(x.dropna())), + 'species_name_long': lambda x: ', '.join(str(val) for val in pd.unique(x.dropna())) + }) + + # Calculate phenotype counts and percentages + phenotype_counts = df['phenotype'].value_counts() + total_count = phenotype_counts.sum() + + assign_pheno['phenotype_count'] = assign_pheno['phenotype'].map(phenotype_counts) + assign_pheno['phenotype_percent'] = (assign_pheno['phenotype_count'] / total_count * 100).round(2) - assign_pheno['phenotype_count'] = [sum(df['phenotype'] == x) for x in assign_pheno.phenotype] - assign_pheno['phenotype_percent'] = [round(100*x/sum(assign_pheno['phenotype_count']), 2) for x in assign_pheno['phenotype_count']] - assign_pheno = assign_pheno.sort_values(by='phenotype_count', ascending=False) + # Sort by phenotype count in descending order + assign_pheno = assign_pheno.sort_values(by='phenotype_count', ascending=False).reset_index(drop=True) return assign_pheno diff --git a/install_missing_packages.py b/install_missing_packages.py index 23a28ec1..d46540d6 100644 --- a/install_missing_packages.py +++ b/install_missing_packages.py @@ -15,7 +15,9 @@ def is_mamba_installed(): ''' try: # Run the 'mamba --version' command - result = subprocess.run(['mamba', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(['mamba', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, check=False) # Check if the command was successful if result.returncode == 0: @@ -36,7 +38,9 @@ def install_with_mamba(packages): print(f"&&&& Attempting to install {', '.join(packages)} with mamba.") try: # Run the 'mamba install ' command - result = subprocess.run(['mamba', 'install', '-y'] + packages, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(['mamba', 'install', '-y'] + packages, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, check=False) # Check if the command was successful if result.returncode == 0: @@ -56,7 +60,9 @@ def install_with_conda(packages): print(f"&&&& Attempting to install {', '.join(packages)} with conda.") try: # Run the 'conda install ' command - result = subprocess.run(['conda', 'install', '-y'] + packages, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(['conda', 'install', '-y'] + packages, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, check=False) # Check if the command was successful if result.returncode == 0: @@ -75,7 +81,9 @@ def install_with_pip(packages): print(f"&&&& Attempting to install {', '.join(packages)} with pip.") try: # Run the 'pip install ' 
command - result = subprocess.run(['pip', 'install'] + packages, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(['pip', 'install'] + packages, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, check=False) # Check if the command was successful if result.returncode == 0: @@ -91,7 +99,7 @@ def live_package_installation(): ''' Function to check if packages are installed ''' - + # last two probably only needed for published dashboards packages_to_install = ['hnswlib', 'parc', 'sklearn_ann', 'annoy', 'pyNNDescent'] installers_to_use = ['mamba', 'pip'] diff --git a/nidap_dashboard_lib.py b/nidap_dashboard_lib.py index c51889e8..4ecd6110 100644 --- a/nidap_dashboard_lib.py +++ b/nidap_dashboard_lib.py @@ -14,12 +14,10 @@ alt.data_transformers.disable_max_rows() from natsort import natsorted from pathlib import Path -from datetime import datetime import basic_phenotyper_lib as bpl # Useful functions for cell phenotyping from foundry_IO_lib import foundry_IO_lib # Foundry Input/Output Class from benchmark_collector import benchmark_collector # Benchmark Collector Class from neighborhood_profiles import NeighborhoodProfiles, UMAPDensityProcessing # slow because this imports umap -import PlottingTools as umPT def identify_col_type(col): ''' @@ -327,9 +325,8 @@ def set_phenotyping_elements(session_state, df_orig): if hasattr(session_state, 'dataeditor__do_not_persist'): delattr(session_state, 'dataeditor__do_not_persist') - # Initalize Phenotyping Settings (Radio BUttons) + # Initalize Phenotyping Settings (Radio Buttons) session_state.noPhenoOpt = 'Not Selected' - session_state.phenoMeth = 'Species' # Default when first loaded session_state.selected_phenoMeth = session_state.noPhenoOpt # Default when first loaded return session_state diff --git a/nidap_io.py b/nidap_io.py index e2c37553..3bbaf413 100644 --- a/nidap_io.py +++ b/nidap_io.py @@ -1,11 +1,12 @@ -# This is a single place to put all the functions that interact with NIDAP. It is called exclusively from platform_io.py. +'''This is a single place to put all the functions +that interact with NIDAP. It is called exclusively +from platform_io.py.''' import os import shutil import time import utils - def get_foundry_dataset(alias='input'): """Create a dataset object. This should be fast. @@ -20,7 +21,15 @@ def upload_file_to_dataset(dataset, selected_filepath='/home/user/repo/bleh.txt' This overwrites existing files. This trivially returns a string of the uploaded filename. This is consistent with Code Workspaces snippets on 3/10/24. + This should be slow. + + Args: + dataset: The dataset to upload the file to. + selected_filepath: The local file path to upload. + + Returns: + str: The uploaded filename. """ return dataset.upload_file(selected_filepath) @@ -28,7 +37,12 @@ def upload_file_to_dataset(dataset, selected_filepath='/home/user/repo/bleh.txt' def upload_dir_to_dataset(dataset, path_to_dir_to_upload='../junk_files'): """Upload a local directory to a dataset. This overwrites existing files. - This returns a dictionary where each key is the path to the local file in path_to_dir_to_upload and each value is the name of the file in the dataset on NIDAP, where the name includes separators (e.g., "/") and is equal to the local file path without the prefix path_to_dir_to_upload (plus following "/"), probably as you would expect on Amazon S3. 
E.g., the return value could be: + This returns a dictionary where each key is the path to the local file + in path_to_dir_to_upload and each value is the name of the file in the + dataset on NIDAP, where the name includes separators (e.g., "/") and is + equal to the local file path without the prefix path_to_dir_to_upload + (plus following "/"), probably as you would expect on Amazon S3. + E.g., the return value could be: {'../junk_files/junk-200mb-20': 'junk-200mb-20', '../junk_files/junk-200mb-09': 'junk-200mb-09', '../junk_files/junk-200mb-39': 'junk-200mb-39', @@ -38,10 +52,19 @@ def upload_dir_to_dataset(dataset, path_to_dir_to_upload='../junk_files'): '../junk_files/subdir/subdir2/junk-200mb-4': 'subdir/subdir2/junk-200mb-4', '../junk_files/subdir/subdir2/junk-200mb-9': 'subdir/subdir2/junk-200mb-9', '../junk_files/subdir/subdir2/junk-200mb-0': 'subdir/subdir2/junk-200mb-0'} - Note there is at least a single-file upload limit of about 2000 MB, which is higher than reported in an old Issue to Palantir. + Note there is at least a single-file upload limit of about 2000 MB, which is higher + than reported in an old Issue to Palantir. + This should be slow. + + Args: + dataset: The dataset to upload the directory to. + path_to_dir_to_upload: The local directory path to upload. + + Returns: + dict: A dictionary mapping local file paths to their dataset names. """ - output_dir = os.path.join(os.environ["USER_WORKING_DIR"], "outputs") # per Palantir on 4/10/24: write files and directories to output_dir or a subdir to upload them + output_dir = os.path.join(os.environ["USER_WORKING_DIR"], "outputs") print(f'Transferring {utils.get_dir_size(path_to_dir_to_upload):.2f} MB from directory {path_to_dir_to_upload}...', end='') shutil.rmtree(output_dir) shutil.copytree(path_to_dir_to_upload, output_dir) @@ -61,6 +84,14 @@ def download_files_from_dataset(dataset, dataset_filter_func=lambda f: f.path.st This returns a dictionary of *all* downloaded files as described above. I'm pretty sure this does not overwrite already-downloaded files, so if it's been run once with the same parameters, it will be fast on subsequent runs. This is otherwise slow. + + Args: + dataset: The dataset to download files from. + dataset_filter_func: A function to filter the dataset files. + limit: The maximum number of files to download in each batch. + + Returns: + dict: A dictionary mapping local file paths to their dataset names. """ # limit=15 seems to be the best value for downloading 60 200MB files. It's unclear to me exactly what this limit is doing. But in this situation, I get an overall download speed of about 800 MB/s! # If I had to guess, without the limit keyword I believe all matching files are downloaded in a single batch (so e.g. the loop below is iterated once and is unnecessary), and with the limit keyword, the files are downloaded in batches of size limit (so each batch has limit files). I believe that each batch is downloaded sequentially, but within each batch, multiple CPUs are used to download files in the batch in parallel. So I think you want to have at least as many files in each batch (i.e., limit) as there are CPUs available to download files in parallel. It's a bit unclear why a single batch with all the files isn't fastest because I'd think it'd parallelize the file downloads efficiently, but e.g. limit=15 was faster than limit=20, which was faster than larger limits. Likewise, smaller limit values (than 15) were slower. 
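Not part of the patch, but as a usage sketch of the nidap_io helpers documented above: the platform_io.py changes later in this diff call them roughly as follows. The dataset alias and limit=15 come from this patch; the filter predicate and file path are purely illustrative.

import nidap_io

# Get a handle to the 'input' unstructured dataset (fast, per the docstring above)
dataset = nidap_io.get_foundry_dataset(alias='input')

# Download a filtered subset of files in batches of ~15, which the comment above
# found to parallelize well; the returned dict relates dataset filenames and
# local download paths (see load_selected_inputs() in platform_io.py).
downloaded = nidap_io.download_files_from_dataset(
    dataset,
    dataset_filter_func=lambda f: f.path.endswith('.csv'),  # illustrative filter
    limit=15)

# Upload a single local file back to the dataset, overwriting any existing copy
nidap_io.upload_file_to_dataset(dataset, selected_filepath='./output/results.zip')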
diff --git a/pages2/Clusters_Analyzer.py b/pages/Clusters_Analyzer.py similarity index 100% rename from pages2/Clusters_Analyzer.py rename to pages/Clusters_Analyzer.py diff --git a/pages2/Display_ROI_P_values_overlaid_on_slides.py b/pages/Display_ROI_P_values_overlaid_on_slides.py similarity index 100% rename from pages2/Display_ROI_P_values_overlaid_on_slides.py rename to pages/Display_ROI_P_values_overlaid_on_slides.py diff --git a/pages2/Display_average_heatmaps.py b/pages/Display_average_heatmaps.py similarity index 100% rename from pages2/Display_average_heatmaps.py rename to pages/Display_average_heatmaps.py diff --git a/pages2/Display_average_heatmaps_per_annotation.py b/pages/Display_average_heatmaps_per_annotation.py similarity index 100% rename from pages2/Display_average_heatmaps_per_annotation.py rename to pages/Display_average_heatmaps_per_annotation.py diff --git a/pages2/Display_individual_ROI_heatmaps.py b/pages/Display_individual_ROI_heatmaps.py similarity index 100% rename from pages2/Display_individual_ROI_heatmaps.py rename to pages/Display_individual_ROI_heatmaps.py diff --git a/pages2/Neighborhood_Profiles.py b/pages/Neighborhood_Profiles.py similarity index 100% rename from pages2/Neighborhood_Profiles.py rename to pages/Neighborhood_Profiles.py diff --git a/pages2/Pheno_Cluster_a.py b/pages/Pheno_Cluster_a.py similarity index 100% rename from pages2/Pheno_Cluster_a.py rename to pages/Pheno_Cluster_a.py diff --git a/pages2/Pheno_Cluster_b.py b/pages/Pheno_Cluster_b.py similarity index 100% rename from pages2/Pheno_Cluster_b.py rename to pages/Pheno_Cluster_b.py diff --git a/pages2/Run_workflow.py b/pages/Run_workflow.py similarity index 100% rename from pages2/Run_workflow.py rename to pages/Run_workflow.py diff --git a/pages2/Tool_parameter_selection.py b/pages/Tool_parameter_selection.py similarity index 100% rename from pages2/Tool_parameter_selection.py rename to pages/Tool_parameter_selection.py diff --git a/pages2/UMAP_Analyzer.py b/pages/UMAP_Analyzer.py similarity index 100% rename from pages2/UMAP_Analyzer.py rename to pages/UMAP_Analyzer.py diff --git a/pages2/__init__.py b/pages/__init__.py similarity index 100% rename from pages2/__init__.py rename to pages/__init__.py diff --git a/pages2/adaptive_phenotyping.py b/pages/adaptive_phenotyping.py similarity index 99% rename from pages2/adaptive_phenotyping.py rename to pages/adaptive_phenotyping.py index 1e346920..2a7a31dd 100644 --- a/pages2/adaptive_phenotyping.py +++ b/pages/adaptive_phenotyping.py @@ -3,13 +3,12 @@ import numpy as np import pandas as pd import plotly.graph_objects as go -from pages2 import multiaxial_gating +from pages import multiaxial_gating import utils # Global variable st_key_prefix = 'adaptive_phenotyping__' - def plotly_mean_and_sem(dfs, df_names): # Create a Plotly figure diff --git a/pages2/child_process_killer.py b/pages/child_process_killer.py similarity index 97% rename from pages2/child_process_killer.py rename to pages/child_process_killer.py index e565fac5..5380f8de 100644 --- a/pages2/child_process_killer.py +++ b/pages/child_process_killer.py @@ -1,17 +1,21 @@ +''' +child_process_killer.py +''' + # Import relevant libraries -import streamlit as st -import app_top_of_page as top -import streamlit_dataframe_editor as sde import os import subprocess import psutil import pandas as pd -import time + +import streamlit as st +import app_top_of_page as top +import streamlit_dataframe_editor as sde def get_system_info(): # Run the top command and get its output output = 
subprocess.check_output( - ["top", "-b", "-n", "1"], + ["top", "-b", "-n", "1"], universal_newlines=True ) @@ -45,7 +49,6 @@ def get_system_info(): return df - def kill_child_processes(dry_run=False): parent_pid = os.getpid() # Get the process ID of the current process @@ -68,10 +71,10 @@ def main(): if st.button('Show system info'): df = get_system_info() st.dataframe(df) - + if st.button('Show what child processes *would* be killed if the following button is clicked'): kill_child_processes(dry_run=True) - + if st.button('Kill child processes'): kill_child_processes() diff --git a/pages2/data_import_and_export.py b/pages/data_import_and_export.py similarity index 100% rename from pages2/data_import_and_export.py rename to pages/data_import_and_export.py diff --git a/pages2/datafile_format_unifier.py b/pages/datafile_format_unifier.py similarity index 99% rename from pages2/datafile_format_unifier.py rename to pages/datafile_format_unifier.py index 71bd5b7b..f095a074 100644 --- a/pages2/datafile_format_unifier.py +++ b/pages/datafile_format_unifier.py @@ -1,13 +1,22 @@ +''' +datafile_format_unifier.py +''' + # Import relevant libraries import os +import re import streamlit as st import pandas as pd import streamlit_dataframe_editor as sde -import re import utils - def callback_for_combining_datafiles(filenames): + ''' + callback for combining datafiles + + Args: + filenames (list): A list of filenames to combine + ''' # Clear all keys in the session state starting with "unifier__" and not applicable to the selections above the callback button keys_to_delete = [key for key in st.session_state.keys() if (key.startswith("unifier__")) and (key not in ['unifier__input_files', 'unifier__de_datafile_selection', 'unifier__df_datafile_selection', 'unifier__df_datafile_selection_changes_dict', 'unifier__df_datafile_selection_key'])] @@ -19,6 +28,12 @@ def callback_for_combining_datafiles(filenames): def generate_guess_for_basename_of_mawa_unified_file(filenames): + ''' + generate a guess for the basename of the MAWA unified file + + Args: + filenames (list): A list of filenames to combine + ''' # generate_guess_for_basename_of_mawa_unified_file(df_reconstructed.loc[selected_rows, 'Filename']) # Convert the pandas Series to a list @@ -694,8 +709,8 @@ def main(): information = f''' Loaded dataset properties: - :small_orange_diamond: Number of rows: `{df.shape[0]}` - :small_orange_diamond: Number of columns: `{df.shape[1]}` + :small_orange_diamond: Number of rows: `{df.shape[0]}` + :small_orange_diamond: Number of columns: `{df.shape[1]}` :small_orange_diamond: Coordinate units: `{st.session_state['unifier__microns_per_coordinate_unit'] if 'unifier__microns_per_coordinate_unit' in st.session_state else None} microns/coord` :small_orange_diamond: Loaded memory usage: `{usage_str}` ''' diff --git a/pages2/dummy_editor.py b/pages/dummy_editor.py similarity index 100% rename from pages2/dummy_editor.py rename to pages/dummy_editor.py diff --git a/pages2/feature_creation.py b/pages/feature_creation.py similarity index 100% rename from pages2/feature_creation.py rename to pages/feature_creation.py diff --git a/pages2/forking_test.py b/pages/forking_test.py similarity index 100% rename from pages2/forking_test.py rename to pages/forking_test.py diff --git a/pages2/macro_radial_density.py b/pages/macro_radial_density.py similarity index 100% rename from pages2/macro_radial_density.py rename to pages/macro_radial_density.py diff --git a/pages2/memory_analyzer.py b/pages/memory_analyzer.py similarity index 100% 
rename from pages2/memory_analyzer.py rename to pages/memory_analyzer.py diff --git a/pages2/multiaxial_gating.py b/pages/multiaxial_gating.py similarity index 100% rename from pages2/multiaxial_gating.py rename to pages/multiaxial_gating.py diff --git a/pages2/open_file.py b/pages/open_file.py similarity index 100% rename from pages2/open_file.py rename to pages/open_file.py diff --git a/pages2/preprocessing.py b/pages/preprocessing.py similarity index 100% rename from pages2/preprocessing.py rename to pages/preprocessing.py diff --git a/pages2/radial_bins_plots.py b/pages/radial_bins_plots.py similarity index 100% rename from pages2/radial_bins_plots.py rename to pages/radial_bins_plots.py diff --git a/pages2/radial_profiles_analysis.py b/pages/radial_profiles_analysis.py similarity index 100% rename from pages2/radial_profiles_analysis.py rename to pages/radial_profiles_analysis.py diff --git a/pages2/results_transfer.py b/pages/results_transfer.py similarity index 100% rename from pages2/results_transfer.py rename to pages/results_transfer.py diff --git a/pages2/robust_scatter_plotter.py b/pages/robust_scatter_plotter.py similarity index 100% rename from pages2/robust_scatter_plotter.py rename to pages/robust_scatter_plotter.py diff --git a/pages2/skeleton2.py b/pages/skeleton.py similarity index 60% rename from pages2/skeleton2.py rename to pages/skeleton.py index 85d7c902..ec5abc01 100644 --- a/pages2/skeleton2.py +++ b/pages/skeleton.py @@ -1,4 +1,3 @@ -# Much simpler now (vs. skeleton.py) as the top and bottom matter is now located in Multiplex_Analysis_Web_Apps.py using the new Streamlit multipage functionality # Import relevant libraries import streamlit as st diff --git a/pages2/spatial_umap_prediction_app.py b/pages/spatial_umap_prediction_app.py similarity index 99% rename from pages2/spatial_umap_prediction_app.py rename to pages/spatial_umap_prediction_app.py index 6c91cffe..5b99e4c0 100644 --- a/pages2/spatial_umap_prediction_app.py +++ b/pages/spatial_umap_prediction_app.py @@ -7,7 +7,7 @@ import streamlit as st import streamlit_dataframe_editor as sde import app_top_of_page as top -from pages2 import sit_03a_Tool_parameter_selection as sit +from pages import sit_03a_Tool_parameter_selection as sit import new_phenotyping_lib import utils diff --git a/pages2/thresholded_phenotyping.py b/pages/thresholded_phenotyping.py similarity index 97% rename from pages2/thresholded_phenotyping.py rename to pages/thresholded_phenotyping.py index b5649277..5624385f 100644 --- a/pages2/thresholded_phenotyping.py +++ b/pages/thresholded_phenotyping.py @@ -20,7 +20,9 @@ def data_editor_change_callback(): st.session_state.df = bpl.assign_phenotype_custom(st.session_state.df, st.session_state['pheno__de_phenotype_assignments'].reconstruct_edited_dataframe()) # Create Phenotypes Summary Table based on 'phenotype' column in df + st.session_state.bc.startTimer() st.session_state.pheno_summ = bpl.init_pheno_summ(st.session_state.df) + st.session_state.bc.printElapsedTime(msg = 'Updating Phenotype Summary Table after phenotype assignment change') def slide_id_prog_left_callback(): ''' @@ -170,10 +172,14 @@ def main(): mid_col = st.columns(2) with mid_col[1]: - with st.expander('Choose Markers to include'): + with st.form('Choose Markers to include'): st.multiselect('Markers', options = st.session_state.loaded_marker_names, - key = 'marker_multi_sel', - on_change=marker_multiselect_callback) + key = 'marker_multi_sel') + + # Every form must have a submit button. 
+ submitted = st.form_submit_button('Apply Changes') + if submitted: + marker_multiselect_callback() ## In-App Instructions if st.session_state.data_loaded is False: diff --git a/pages2/skeleton.py b/pages2/skeleton.py deleted file mode 100644 index 01bf5bc8..00000000 --- a/pages2/skeleton.py +++ /dev/null @@ -1,33 +0,0 @@ -# Import relevant libraries -import streamlit as st -import app_top_of_page as top -import streamlit_dataframe_editor as sde - - -def main(): - """ - Main function for the page. - """ - - st.write('Insert your code here.') - - -# Run the main function -if __name__ == '__main__': - - # Set page settings - page_name = 'Your Page Name Here' - st.set_page_config(layout='wide', page_title=page_name) - st.title(page_name) - - # Run streamlit-dataframe-editor library initialization tasks at the top of the page - st.session_state = sde.initialize_session_state(st.session_state) - - # Run Top of Page (TOP) functions - st.session_state = top.top_of_page_reqs(st.session_state) - - # Call the main function - main() - - # Run streamlit-dataframe-editor library finalization tasks at the bottom of the page - st.session_state = sde.finalize_session_state(st.session_state) diff --git a/platform_io.py b/platform_io.py index ddb37e85..4e34e6c3 100644 --- a/platform_io.py +++ b/platform_io.py @@ -4,26 +4,54 @@ # Import relevant libraries import os +import sys import time import shutil import pandas as pd import streamlit as st +import nidap_io import streamlit_dataframe_editor as sde import utils -from pages2 import memory_analyzer +from pages import memory_analyzer # Constant local_input_dir = os.path.join('.', 'input') local_output_dir = os.path.join('.', 'output') -# Write a dataframe from a file listing with columns for selection, filename, # of files inside (for directories), and modification time, sorted descending by modification time -# Note this is primarily for local listings, not remote listings def make_complex_dataframe_from_file_listing(dirpath, item_names, df_session_state_key_basename=None, editable=True): - import time + '''Write a dataframe from a file listing with columns for selection, filename, # of files inside + (for directories), and modification time, sorted descending by modification time + + + Note this is primarily for local listings, not remote listings + + Parameters + ---------- + dirpath : str + The directory path to list files from. + item_names : list + The names of the items (files or directories) to include in the dataframe. + df_session_state_key_basename : str, optional + The base name for the session state keys to use for the dataframe and data editor. + editable : bool, optional + Whether the dataframe should be editable in the Streamlit app. + + Returns + ------- + pd.DataFrame + A dataframe containing the file listing information. 
+ ''' + num_contents = [len(os.listdir(os.path.join(dirpath, x))) if os.path.isdir(os.path.join(dirpath, x)) else None for x in item_names] modification_times = [os.path.getmtime(os.path.join(dirpath, x)) for x in item_names] selecteds = [False for _ in item_names] df = pd.DataFrame({'Selected': selecteds, 'File or directory name': item_names, '# of files within': num_contents, 'Modification time': [time.ctime(x) for x in modification_times], 'mod_time_sec': modification_times}).sort_values('mod_time_sec', ascending=False).reset_index(drop=True) + + column_config = { + "Selected": st.column_config.CheckboxColumn(label="Select", width=75), + "File or directory name": st.column_config.TextColumn(label="File or directory name"), + "# of files within": st.column_config.NumberColumn(label="Number of files within", width=125), + } if editable: ss_de_key_name = 'loader__de_' + df_session_state_key_basename ss_df_key_name = 'loader__df_' + df_session_state_key_basename @@ -33,14 +61,33 @@ def make_complex_dataframe_from_file_listing(dirpath, item_names, df_session_sta del st.session_state[ss_de_key_name] if ss_de_key_name not in st.session_state: st.session_state[ss_de_key_name] = sde.DataframeEditor(df_name=ss_df_key_name, default_df_contents=df.iloc[:, :-1]) - st.session_state[ss_de_key_name].dataframe_editor(reset_data_editor_button_text='Reset file selections') + st.session_state[ss_de_key_name].dataframe_editor(reset_data_editor_button_text='Reset file selections', + column_config=column_config) else: st.dataframe(df.iloc[:, 1:-1]) if df_session_state_key_basename is not None: st.warning('Session state key {} is not being assigned since editable=False was selected in call to make_complex_dataframe_from_file_listing()'.format(ss_df_key_name)) -# Write an editable dataframe (simple, having only a selection column and filenames with .zip removed) for the available files, also saving the filenames with the possible .zip extensions to a separate Series def make_simple_dataframe_from_file_listing(available_files, df_session_state_key_basename=None, streamlit_key_for_available_filenames_srs=None, editable=True): + ''' + Create a simple, editable dataframe from the available files. + + This dataframe will have a selection column and will strip any .zip extensions + from the filenames. This is shown in the Data Import and Export Page. + + Args: + available_files (list): A list of available file names (with .zip extensions). + df_session_state_key_basename (str, optional): The base name for the session state + keys to use for the dataframe and data + editor. + streamlit_key_for_available_filenames_srs (str, optional): The Streamlit key to use for + the Series containing the full + filenames. + editable (bool, optional): Whether the dataframe should be editable in the Streamlit app. + + Returns: + pd.DataFrame: A simple, editable dataframe containing the available files. 
+ ''' # Save (to Streamlit, analogous to how it's done for the data editor, below), the full filenames of the available files if streamlit_key_for_available_filenames_srs is not None: @@ -52,6 +99,11 @@ def make_simple_dataframe_from_file_listing(available_files, df_session_state_ke # Create a simple dataframe of the available files, with a selection column and stripped of any .zip extensions df = pd.DataFrame({'Selected': [False for _ in available_files], 'File or directory name': available_files}) + column_config = { + "Selected": st.column_config.CheckboxColumn(label="Select", width=75), + "File or directory name": st.column_config.TextColumn(label="File or directory name") + } + # Display an editable dataframe version of this if editable: ss_de_key_name = 'loader__de_' + df_session_state_key_basename @@ -61,15 +113,27 @@ def make_simple_dataframe_from_file_listing(available_files, df_session_state_ke if set(df['File or directory name']) != set(st.session_state[ss_de_key_name].reconstruct_edited_dataframe()['File or directory name']): del st.session_state[ss_de_key_name] if ss_de_key_name not in st.session_state: - st.session_state[ss_de_key_name] = sde.DataframeEditor(df_name=ss_df_key_name, default_df_contents=df) - st.session_state[ss_de_key_name].dataframe_editor(reset_data_editor_button_text='Reset file selections') + st.session_state[ss_de_key_name] = sde.DataframeEditor(df_name=ss_df_key_name, + default_df_contents=df) + st.session_state[ss_de_key_name].dataframe_editor(reset_data_editor_button_text='Reset file selections', + column_config=column_config) else: - st.dataframe(df) + st.dataframe(df, column_config=column_config) if df_session_state_key_basename is not None: st.warning('Session state key {} is not being assigned since editable=False was selected in call to make_simple_dataframe_from_file_listing()'.format(ss_df_key_name)) -# Delete selected files/dirs from a directory def delete_selected_files_and_dirs(directory, selected_files): + ''' + Deletes the selected files and or directories + + Args: + directory (str): The directory containing the files and directories to delete. + selected_files (list): A list of file and directory names to delete. 
+ + Returns: + None + ''' + for curr_file in selected_files: curr_path = os.path.join(directory, curr_file) if os.path.isfile(curr_path): @@ -83,7 +147,6 @@ def write_current_tool_parameters_to_disk(output_dir): import subprocess import yaml import streamlit_utils - import sys print('Writing the current tool parameters to disk...') settings_yaml_filename = 'settings_as_of_{}.yml'.format(utils.get_timestamp()) pathname = os.path.join(output_dir, settings_yaml_filename) @@ -109,7 +172,6 @@ def write_current_environment_to_disk(output_dir): import subprocess - import os print('Writing the current conda/pip environment to disk...') environment_yaml_filename = 'environment_as_of_{}.yml'.format(utils.get_timestamp()) pathname = os.path.join(output_dir, environment_yaml_filename) @@ -177,15 +239,16 @@ def create_zipfile_with_ignores(zipfile_dirpath, basename_suffix_for_zipfile, pr # Return the full path of the created zipfile return zipfile_basename + '.zip' -# Ensure a directory exists but is empty def ensure_empty_directory(dirpath): - import shutil - import os + ''' + Ensure a directory exists but is empty + ''' + if os.path.exists(dirpath): shutil.rmtree(dirpath) # if it exists, then delete it - print('Tree {} deleted'.format(dirpath)) + print(f'Tree {dirpath} deleted') os.mkdir(dirpath) # create an empty destination directory - print('New directory {} created'.format(dirpath)) + print(f'New directory {dirpath} created') # Logic in case currently selected value is no long present in directory listing as might happen when it has just been deleted def account_for_stale_streamlit_values(session_state_key, possible_values): @@ -200,26 +263,39 @@ else: st.session_state[session_state_key] = None -# Create a class that takes the platform type (e.g., local, nidap) as input and creates corresponding methods, with consisting naming, for performing the same function on different platforms class Platform: + ''' + Create a class that takes the platform type (e.g., local, nidap) as input and + creates corresponding methods, with consistent naming, for performing the same + function on different platforms + ''' - # Object instantiation def __init__(self, platform='local'): + ''' + Object initialization + + Args: + platform (str): The platform type, e.g., 'local' or 'nidap'. + ''' self.platform = platform self.available_inputs = None self.available_archives = None - # Get a list of the files available to import locally to serve as input files for a workflow def get_available_inputs_listing(self): - # Potentially slow + ''' + Returns a list of the files available to import locally to serve as input files + for a workflow.
+ + Potentially slow + ''' - # When running locally, this is irrelevant as we can easily place all input files into the same ./input directory, as opposed to having to read them in from a remote + # When running locally, this will duplicate what is in ./input if self.platform == 'local': - available_inputs = [] + available_inputs = self.get_local_inputs_listing() # On NIDAP, load the metadata for the "input" unstructured dataset elif self.platform == 'nidap': - import nidap_io + dataset = nidap_io.get_foundry_dataset(alias='input') dataset_file_objects = nidap_io.get_file_objects_from_dataset(dataset) # slow available_inputs = nidap_io.list_files_in_dataset(dataset_file_objects) @@ -227,152 +303,151 @@ def get_available_inputs_listing(self): # Save the values *that we'll need later* that result from the long calculation (1-2 sec) as properties of the object so they're stored rather than discarded self.available_inputs = sorted(available_inputs) - - # Write a dataframe of the available inputs on the remote + def display_available_inputs_df(self): + ''' + Write a dataframe of the available inputs on the remote + ''' - # Again, irrelevant for local - if self.platform == 'local': - pass + st.subheader(':open_file_folder: Available input data on NIDAP') - # If on NIDAP... - elif self.platform == 'nidap': + # Identify the available inputs + if self.available_inputs is None: + self.get_available_inputs_listing() - st.subheader(':open_file_folder: Available input data on NIDAP') + # Get a shortcut to the available input list + available_inputs = self.available_inputs - # If we've never determined the inputs available on the remote (e.g., when the script first starts), do so now - if self.available_inputs is None: - self.get_available_inputs_listing() + # Create a simple editable dataframe of the available input filenames + make_simple_dataframe_from_file_listing(available_files=available_inputs, + df_session_state_key_basename='available_inputs', + streamlit_key_for_available_filenames_srs='srs_available_input_filenames', + editable=True) - # Get a shortcut to the available input list - available_inputs = self.available_inputs - - # Create a simple editable dataframe of the available input filenames - make_simple_dataframe_from_file_listing(available_files=available_inputs, df_session_state_key_basename='available_inputs', streamlit_key_for_available_filenames_srs='srs_available_input_filenames', editable=True) - - # Add a button to re-read the available input files on the remote def add_refresh_available_inputs_button(self): + ''' + Add a button to re-read the available input files on remote + + rerun is used at the end since this potentially changes outputs. + Rule of thumb for rerunning the page should probably be that if + this method changes outputs, will those possibly changed outputs + definitely get redrawn? If not, do a rerun! Consider where this method + falls in the top-down rerun of the calling script, are the outputs before + or after the method is called? + ''' + + if st.button(':arrows_clockwise: Refresh available input data'): + self.get_available_inputs_listing() + st.rerun() - # Irrelevant for local - if self.platform == 'local': - pass - - # If on NIDAP, create a button to simply update the available inputs - elif self.platform == 'nidap': - if st.button(':arrows_clockwise: Refresh available input data'): - self.get_available_inputs_listing() - st.rerun() # rerun since this potentially changes outputs... 
rule of thumb for rerunning the page should probably be that if this method changes outputs, will those possibly changed outputs definitely get redrawn? If not, do a rerun! Consider where this method falls in the top-down rerun of the calling script, are the outputs before or after the method is called? - - # Load any selected available inputs on the remote to the local machine def load_selected_inputs(self): + ''' + Load any selected available inputs on the remote to the local machine + + ''' + + st.subheader(':tractor: Load input data into MAWA') + + # If a load button is clicked... + if st.button('Load selected NIDAP input data :arrow_right:'): + + # # Get "shortcuts" to the object properties + # dataset_file_objects = self.dataset_file_objects_for_available_inputs + + # Get the selected filenames + df_available_inputs = st.session_state['loader__de_available_inputs'].reconstruct_edited_dataframe() + srs_available_input_filenames = st.session_state['srs_available_input_filenames'] + selected_input_filenames = srs_available_input_filenames[df_available_inputs['Selected']].tolist() + + # Download the selected files + all_downloaded_files = nidap_io.download_files_from_dataset(nidap_io.get_foundry_dataset(alias='input'), dataset_filter_func=lambda f: f.path in selected_input_filenames, limit=15) + + # For each downloaded file, move it to the local input directory + for selected_input_filename, local_download_path in all_downloaded_files.items(): + + # # For each selected available input file... + # for selected_input_filename in selected_input_filenames: + + # # Download the file and get its local download path + # dataset_file_object = nidap_io.get_dataset_file_object(dataset_file_objects, selected_filename=selected_input_filename) + # local_download_path = nidap_io.download_file_from_dataset(dataset_file_object) # slow + + # Populate the local input directory with the current selection + # Rules: + # * If it's not a .zip file, just copy it over + # * If it's a .zip file, there can be either 1 or 2 periods in the full filename including extension + # * If there's just a single period (e.g., asdf.zip), it must be a zipped directory with name following either DIRNAME.zip or DIRNAME--bleh.zip + # * If there are two periods (e.g., asdf.csv.zip), it must be a zipped datafile with name following asdf.csv + if selected_input_filename.endswith('.zip'): + splitted = selected_input_filename.split('.') # should be of length 2 or 3 (for, e.g., asdf.csv.zip) + num_periods = len(splitted) - 1 # should be 1 or 2 + # if (num_periods < 1) or (num_periods > 2): + # st.error('Available .zip input filename {} has a bad number of periods ({}... it should have 1-2 periods); please fix this.'.format(selected_input_filename, num_periods)) + # sys.exit() + if num_periods == 1: # it's a zipped directory, by specification + if '--' not in selected_input_filename: + dirpath = os.path.join(local_input_dir, selected_input_filename.rstrip('.zip')) + else: + dirpath = os.path.join(local_input_dir, selected_input_filename.split('--')[0]) + ensure_empty_directory(dirpath) + shutil.unpack_archive(local_download_path, dirpath) + else: # it's a zipped datafile + shutil.unpack_archive(local_download_path, local_input_dir) + else: + shutil.copy(local_download_path, local_input_dir) - # Irrelevant for local - if self.platform == 'local': - pass + def save_selected_input(self): + ''' + Save a MAWA-unified datafile to NIDAP + ''' - # If on NIDAP... 
- elif self.platform == 'nidap': + # Write a header + st.subheader(':tractor: Save MAWA-unified datafile to NIDAP') - st.subheader(':tractor: Load input data into MAWA') - - # If a load button is clicked... - if st.button('Load selected NIDAP input data :arrow_right:'): - - # Import relevant libraries - import nidap_io - import sys - - # # Get "shortcuts" to the object properties - # dataset_file_objects = self.dataset_file_objects_for_available_inputs - - # Get the selected filenames - df_available_inputs = st.session_state['loader__de_available_inputs'].reconstruct_edited_dataframe() - srs_available_input_filenames = st.session_state['srs_available_input_filenames'] - selected_input_filenames = srs_available_input_filenames[df_available_inputs['Selected']].tolist() - - # Download the selected files - all_downloaded_files = nidap_io.download_files_from_dataset(nidap_io.get_foundry_dataset(alias='input'), dataset_filter_func=lambda f: f.path in selected_input_filenames, limit=15) - - # For each downloaded file, move it to the local input directory - for selected_input_filename, local_download_path in all_downloaded_files.items(): - - # # For each selected available input file... - # for selected_input_filename in selected_input_filenames: - - # # Download the file and get its local download path - # dataset_file_object = nidap_io.get_dataset_file_object(dataset_file_objects, selected_filename=selected_input_filename) - # local_download_path = nidap_io.download_file_from_dataset(dataset_file_object) # slow - - # Populate the local input directory with the current selection - # Rules: - # * If it's not a .zip file, just copy it over - # * If it's a .zip file, there can be either 1 or 2 periods in the full filename including extension - # * If there's just a single period (e.g., asdf.zip), it must be a zipped directory with name following either DIRNAME.zip or DIRNAME--bleh.zip - # * If there are two periods (e.g., asdf.csv.zip), it must be a zipped datafile with name following asdf.csv - if selected_input_filename.endswith('.zip'): - splitted = selected_input_filename.split('.') # should be of length 2 or 3 (for, e.g., asdf.csv.zip) - num_periods = len(splitted) - 1 # should be 1 or 2 - # if (num_periods < 1) or (num_periods > 2): - # st.error('Available .zip input filename {} has a bad number of periods ({}... it should have 1-2 periods); please fix this.'.format(selected_input_filename, num_periods)) - # sys.exit() - if num_periods == 1: # it's a zipped directory, by specification - if '--' not in selected_input_filename: - dirpath = os.path.join(local_input_dir, selected_input_filename.rstrip('.zip')) - else: - dirpath = os.path.join(local_input_dir, selected_input_filename.split('--')[0]) - ensure_empty_directory(dirpath) - shutil.unpack_archive(local_download_path, dirpath) - else: # it's a zipped datafile - shutil.unpack_archive(local_download_path, local_input_dir) - else: - shutil.copy(local_download_path, local_input_dir) - - # Save a MAWA-unified datafile to NIDAP - def save_selected_input(self): + # Create a list of the CSV files having a "mawa-unified_datafile-" prefix and ".csv" suffix in the local input directory + mawa_unified_datafiles = [x for x in os.listdir(local_input_dir) if x.startswith('mawa-unified_datafile-') and x.endswith('.csv')] - # If working on NIDAP... 
- if self.platform == 'nidap': + # Create a dictionary of the stripped filenames and their corresponding full filenames + mawa_unified_datafiles_dict = {x.split('mawa-unified_datafile-')[1].split('.csv')[0]: x for x in mawa_unified_datafiles} + keys = list(mawa_unified_datafiles_dict.keys()) - # Write a header - st.subheader(':tractor: Save MAWA-unified datafile to NIDAP') + # If the session state key doesn't exist, create it and set it to the first key in the list (if it exists) + if ('loader__mawa_unified_datafile_to_save' not in st.session_state) or (st.session_state['loader__mawa_unified_datafile_to_save'] not in keys): + st.session_state['loader__mawa_unified_datafile_to_save'] = keys[0] if keys else None + st.selectbox('Select MAWA-unified datafile to save:', keys, key='loader__mawa_unified_datafile_to_save') - # Create a list of the CSV files having a "mawa-unified_datafile-" prefix and ".csv" suffix in the local input directory - mawa_unified_datafiles = [x for x in os.listdir(local_input_dir) if x.startswith('mawa-unified_datafile-') and x.endswith('.csv')] + # Create a button to zip the selected file and save it to NIDAP + if st.button('Save selected (above) MAWA-unified datafile to NIDAP :arrow_left:', help='This will zip the selected file and save it to NIDAP. We generally don\'t want to save a file **generated** in the app to the **`input`** dataset on NIDAP on principle, but this is a reasonable exception so that the file can be used again or in other use cases.', disabled=st.session_state['loader__mawa_unified_datafile_to_save'] is None): - # Create a dictionary of the stripped filenames and their corresponding full filenames - mawa_unified_datafiles_dict = {x.split('mawa-unified_datafile-')[1].split('.csv')[0]: x for x in mawa_unified_datafiles} - keys = list(mawa_unified_datafiles_dict.keys()) + # Create a spinner to indicate that the zipping and saving is in progress + with st.spinner('Zipping and saving...'): - # If the session state key doesn't exist, create it and set it to the first key in the list (if it exists) - if ('loader__mawa_unified_datafile_to_save' not in st.session_state) or (st.session_state['loader__mawa_unified_datafile_to_save'] not in keys): - st.session_state['loader__mawa_unified_datafile_to_save'] = keys[0] if keys else None - st.selectbox('Select MAWA-unified datafile to save:', keys, key='loader__mawa_unified_datafile_to_save') + # Zip the selected file + selected_mawa_unified_datafile = mawa_unified_datafiles_dict[st.session_state['loader__mawa_unified_datafile_to_save']] + shutil.make_archive(os.path.join(local_input_dir, selected_mawa_unified_datafile), 'zip', local_input_dir, selected_mawa_unified_datafile) - # Create a button to zip the selected file and save it to NIDAP - if st.button('Save selected (above) MAWA-unified datafile to NIDAP :arrow_left:', help='This will zip the selected file and save it to NIDAP. 
We generally don\'t want to save a file **generated** in the app to the **`input`** dataset on NIDAP on principle, but this is a reasonable exception so that the file can be used again or in other use cases.', disabled=st.session_state['loader__mawa_unified_datafile_to_save'] is None): + # Transfer the zipped file to NIDAP + dataset = nidap_io.get_foundry_dataset(alias='input') + upload_single_file_to_dataset((dataset, local_input_dir, selected_mawa_unified_datafile + '.zip')) + # nidap_io.upload_file_to_dataset(dataset, selected_filepath=os.path.join(local_input_dir, selected_mawa_unified_datafile + '.zip')) - # Create a spinner to indicate that the zipping and saving is in progress - with st.spinner('Zipping and saving...'): + # Delete the zipped file from the local input directory + os.remove(os.path.join(local_input_dir, selected_mawa_unified_datafile + '.zip')) - # Zip the selected file - selected_mawa_unified_datafile = mawa_unified_datafiles_dict[st.session_state['loader__mawa_unified_datafile_to_save']] - shutil.make_archive(os.path.join(local_input_dir, selected_mawa_unified_datafile), 'zip', local_input_dir, selected_mawa_unified_datafile) + def get_local_inputs_listing(self): + ''' + Get a listing of the files/dirs in the local ./input directory - # Transfer the zipped file to NIDAP - import nidap_io - dataset = nidap_io.get_foundry_dataset(alias='input') - upload_single_file_to_dataset((dataset, local_input_dir, selected_mawa_unified_datafile + '.zip')) - # nidap_io.upload_file_to_dataset(dataset, selected_filepath=os.path.join(local_input_dir, selected_mawa_unified_datafile + '.zip')) + This will ignore zip files, which can appear locally on a local platform, + since for a remote platform such as NIDAP, per above, all zip files get unzipped + ''' - # Delete the zipped file from the local input directory - os.remove(os.path.join(local_input_dir, selected_mawa_unified_datafile + '.zip')) + return sorted([x for x in os.listdir(local_input_dir) if not x.endswith('.zip')]) - # Get a listing of the files/dirs in the local input directory, which is platform-independent because it's local - def get_local_inputs_listing(self): - return sorted([x for x in os.listdir(local_input_dir) if not x.endswith('.zip')]) # ignore zip files, which can appear locally only on a local platform, since for a remote platform such as NIDAP, per above, all zip files get unzipped - - # Write a dataframe of the local input files, which we don't want to be editable because we don't want to mess with the local inputs (for now), even though they're basically a local copy def display_local_inputs_df(self): + ''' + Write a dataframe of the local input files, which we don't want to be editable because we don't want to mess with the local inputs (for now), even though they're basically a local copy + ''' st.subheader(':open_file_folder: Input data in MAWA') local_inputs = self.get_local_inputs_listing() if self.platform == 'local': # not editable locally because deletion is disabled anyway so there'd be nothing to do with selected files @@ -396,7 +471,7 @@ def add_delete_local_inputs_button(self): local_input_files_to_delete = df_local_inputs[df_local_inputs['Selected']]['File or directory name'] delete_selected_files_and_dirs(local_input_dir, local_input_files_to_delete) st.rerun() - + # List the results archives on the remote def get_archives_listing(self): @@ -407,7 +482,6 @@ def get_archives_listing(self): # List the contents of the output unstructured dataset (there should only be output_archive-*.zip files) 
elif self.platform == 'nidap': - import nidap_io dataset = nidap_io.get_foundry_dataset(alias='output') dataset_file_objects = nidap_io.get_file_objects_from_dataset(dataset) # slow available_archives = [x for x in nidap_io.list_files_in_dataset(dataset_file_objects) if (x.startswith('output_archive-') and ('.zip' in x))] @@ -429,7 +503,7 @@ def get_archives_listing(self): # Save the results of the long calculations that we'll need later as object properties self.available_archives = sorted(available_archives_trimmed) - + # Write a dataframe of the available archives def display_archives_df(self): @@ -444,8 +518,9 @@ def display_archives_df(self): st.subheader(':open_file_folder: Available results archives (i.e., saved results) on NIDAP') if self.available_archives is None: self.get_archives_listing() - make_simple_dataframe_from_file_listing(available_files=self.available_archives, editable=False) - + make_simple_dataframe_from_file_listing(available_files=self.available_archives, + editable=False) + # Add a button for deleting available archives def add_delete_archives_button(self): @@ -483,7 +558,7 @@ def add_refresh_archives_button(self): if st.button(':arrows_clockwise: Refresh available results archives'): self.get_archives_listing() st.rerun() # this may change outputs so refresh - + # Load the selected results archives so calculations can be resumed or results can be visualized # def load_selected_archive(self, nworkers_for_data_transfer=8): def load_selected_archive(self): @@ -524,18 +599,12 @@ def load_selected_archive(self): # If the user wants to load the selected archive... if st.button('Load selected (above) results archive :arrow_right:', help='WARNING: This will copy the contents of the selected archive to the results directory and will overwrite currently loaded results; please ensure they are backed up (you can just use the functions on this page)!'): - # Import relevant libraries - import nidap_io - # import utils - # import multiprocessing - # Delete all files currently present in the output results directory delete_selected_files_and_dirs(local_output_dir, self.get_local_results_listing()) - + # Obtain the full filename corresponding to the selected archive to load and run a check list_of_len_1 = [x for x in self.available_archives if x.startswith(st.session_state['archive_to_load'])] if len(list_of_len_1) != 1: - import sys print('ERROR: More than one available archive found ({}) for selected archive to load ({})'.format(list_of_len_1, st.session_state['archive_to_load'])) sys.exit() selected_archive_with_proper_extension = list_of_len_1[0] @@ -609,11 +678,11 @@ def save_results_to_archive(self): # Rerun since this potentially changes outputs st.rerun() - + # List all currently loaded results that aren't output archives, which is platform-independent def get_local_results_listing(self): return sorted([x for x in os.listdir(local_output_dir) if not x.startswith('output_archive-')]) # only locally will there exist files/dirs that start with output_archive- but it doesn't hurt to keep this here - + # Write a dataframe of the results in the local output directory, also obviously platform-independent def display_local_results_df(self): st.subheader(':open_file_folder: Results in MAWA') @@ -633,10 +702,10 @@ def add_delete_local_results_button(self): # Delete them delete_selected_files_and_dirs(local_output_dir, selected_items_to_delete) - + # Rerun since this potentially changes outputs st.rerun() - + # Write a YAML file of the current tool parameters to the loaded results 
directory def write_settings_to_local_results(self): st.subheader(':tractor: Write current tool parameters to loaded results') @@ -663,7 +732,7 @@ def add_create_empty_archive_button(self): if st.button(':pencil2: Create empty results archive directory'): _ = create_empty_output_archive(st.session_state['basename_suffix_for_new_local_archive_dir'], local_output_dir) st.rerun() # rerun since this potentially changes outputs - + # Delete local empty results output archive directories def add_delete_empty_archives_button(self): @@ -671,7 +740,7 @@ def add_delete_empty_archives_button(self): # If the user wants to delete all empty local results archives... if st.button(':x: Delete empty local results archive directories'): - + # Store the local results dataframe df_local_results = st.session_state['loader__de_local_results'].reconstruct_edited_dataframe() @@ -697,9 +766,6 @@ def multi_contains(full_str, substrs): def get_recursive_file_listing_of_directory(topdir, dirpath_prefixes_to_exclude=(), dirpath_suffixes_to_exclude=(), dirpath_substrs_to_exclude=(), filename_prefixes_to_exclude=(), filename_suffixes_to_exclude=(), filename_substrs_to_exclude=()): # Sample usage: platform_io.get_recursive_file_listing_of_directory(os.path.join('.', 'config')) - # Import relevant library - import os - # Initialize a list holding the file listing file_listing = [] @@ -720,7 +786,7 @@ def get_recursive_file_listing_of_directory(topdir, dirpath_prefixes_to_exclude= # From a file string that's either a path or just the filename, get the directory name and pure filename def get_dirname_and_basename_from_file(file_str): - import os + if os.path.sep in file_str: file_dirname = os.path.dirname(file_str) file_basename = os.path.basename(file_str) @@ -733,9 +799,6 @@ def get_dirname_and_basename_from_file(file_str): def append_group_size_to_all_files_in_group(file_path): # zipfile_path can be e.g. (1) os.path.join('..', 'my_zipfile.zip') or (2) 'my_zipfile.zip' - # Import relevant library - import os - # Get the directory and basename of the file file_dirname, file_basename = get_dirname_and_basename_from_file(file_path) @@ -794,7 +857,6 @@ def create_zipfile_from_files_in_dir(zipfile_name, topdir, chunksize_in_mb=None, # platform_io.create_zipfile_from_files_in_dir('../dude2.zip', 'output/output_archive-probably_good_recent_lci_results_from_original_dataset-20230921_020450', chunksize_in_mb=250) # Import relevant libraries - import os import split_file_reader.split_file_writer # Get the absolute path of the zip file @@ -850,7 +912,6 @@ def extract_zipfile_to_directory(zipfile_name='', extraction_path='', filepaths= # Import relevant library import split_file_reader.split_file_reader - import os # Get the list of zip files matching zipfile_name if filepaths is None: @@ -869,12 +930,10 @@ def extract_zipfile_to_directory(zipfile_name='', extraction_path='', filepaths= # Run some checks if standard_zip: if len(filepaths) != 1: - import sys print('ERROR: At least one of the detected files ends with ".zip" alone, but more than one file was detected! Detected files: {}'.format(filepaths)) sys.exit() else: if sum([not x.endswith('.zip') for x in filepaths]) != len(filepaths): - import sys print('ERROR: Not all detected files don\'t end purely with ".zip"! 
Detected files: {}'.format(filepaths)) sys.exit() @@ -895,9 +954,10 @@ def extract_zipfile_to_directory(zipfile_name='', extraction_path='', filepaths= print('{} zip file(s) {}{} extracted to {}'.format(num_parts, zipfile_name, suffix, extraction_path)) def upload_single_file_to_dataset(args_as_single_tuple): - import os - import time - import nidap_io + ''' + Upload a single file to a NIDAP dataset + ''' + dataset, filedir, filename = args_as_single_tuple print('Uploading zip file chunk {}...'.format(filename)) filesize = os.path.getsize(os.path.join(filedir, filename)) / 1024 ** 2 @@ -907,9 +967,9 @@ def upload_single_file_to_dataset(args_as_single_tuple): print(' Upload of {} ({:5.3f} MB) from Workspaces to Compass took {:3.1f} seconds --> {:3.1f} MB/s'.format(filename, filesize, duration, filesize / duration)) def back_up_results_to_nidap(local_output_dir, basename_suffix_for_new_results_archive, chunksize_in_mb=200): - - # Import relevant library - import nidap_io + ''' + Back up results to NIDAP + ''' # Create a temporary transfer directory to hold the generated zip files local_transfer_dir = os.path.join('.', 'transfer') diff --git a/streamlit_dataframe_editor.py b/streamlit_dataframe_editor.py index 7ed20397..84ee390a 100644 --- a/streamlit_dataframe_editor.py +++ b/streamlit_dataframe_editor.py @@ -117,7 +117,11 @@ class DataframeEditor: def __init__(self, df_name, default_df_contents): ''' - Object instantiation + DataframeEditor instantiation + + Args: + df_name (str): The name of the dataframe. + default_df_contents (pd.DataFrame): The default contents of the dataframe. ''' self.df_name = df_name self.default_df_contents = cast_column_labels_to_strings(default_df_contents) @@ -158,6 +162,31 @@ def reconstruct_edited_dataframe(self): def dataframe_editor(self, current_page_key='current_page_name', previous_page_key='previous_page_name', dynamic_rows=True, reset_data_editor_button=True, reset_data_editor_button_text='Reset data editor', on_change=None, hide_index=None, column_config=None, debug=False): ''' Function to perform all data editor functionalities for a dataframe that users should be able to manipulate + + Parameters + ---------- + current_page_key : str + The key for the current page in the Streamlit session state. + previous_page_key : str + The key for the previous page in the Streamlit session state. + dynamic_rows : bool + Whether the number of rows in the data editor should be dynamic or fixed. + reset_data_editor_button : bool + Whether to show a button to reset the data editor. + reset_data_editor_button_text : str + The text to display on the reset button. + on_change : callable, optional + A callback function to run when the data editor changes. + hide_index : bool, optional + Whether to hide the index column in the data editor. + column_config : dict, optional + A dictionary to configure the columns in the data editor. + debug : bool, optional + Whether to enable debug mode. + + Returns + ------- + None ''' # Shortcuts to object attributes @@ -168,15 +197,21 @@ def dataframe_editor(self, current_page_key='current_page_name', previous_page_k previous_page_name = st.session_state[previous_page_key] key_for_data_editor_widget = st.session_state[df_name + '_key'] - # If the user switches to this page, then - # have the data editor input be the previously saved data editor "output". + # If the user switches to this page, then + # have the data editor input be the previously saved data editor "output". 
# Note that doing this provides a smooth and hiccup-free experience,
        # e.g., no scrollbar snapping back to the topmost location
        if current_page_name != previous_page_name:
            self.update_editor_contents(new_df_contents=self.reconstruct_edited_dataframe(), reset_key=False)
         # Output a data editor for a dataframe of interest
-        st.data_editor(st.session_state[df_name], key=key_for_data_editor_widget, on_change=save_data_editor_changes, args=(df_name + '_changes_dict', key_for_data_editor_widget, on_change), num_rows=('dynamic' if dynamic_rows else 'fixed'), hide_index=hide_index, column_config=column_config)
+        st.data_editor(st.session_state[df_name],
+                       key=key_for_data_editor_widget,
+                       on_change=save_data_editor_changes,
+                       args=(df_name + '_changes_dict', key_for_data_editor_widget, on_change),
+                       num_rows=('dynamic' if dynamic_rows else 'fixed'),
+                       hide_index=hide_index,
+                       column_config=column_config)
         # Debugging information
         if debug:
diff --git a/streamlit_session_state_management.py b/streamlit_session_state_management.py
index 76ef1120..9f06e387 100644
--- a/streamlit_session_state_management.py
+++ b/streamlit_session_state_management.py
@@ -9,7 +9,7 @@
 # from pympler.asizeof import asizeof as deep_mem_usage_in_bytes
 from objsize import get_deep_size as deep_mem_usage_in_bytes
 import time
-from pages2 import memory_analyzer
+from pages import memory_analyzer
 
def load_session_state_preprocessing(saved_streamlit_session_states_dir, saved_streamlit_session_state_prefix='streamlit_session_state-', saved_streamlit_session_state_key='session_selection', selected_session=None):
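The streamlit_dataframe_editor.py hunks above document the DataframeEditor round trip: the widget is re-seeded from reconstruct_edited_dataframe() on page switches, and callers read edits back the same way (e.g., st.session_state['loader__de_local_results'].reconstruct_edited_dataframe() in platform_io.py). A minimal usage sketch of that pattern follows; the 'example__' key names, the default dataframe contents, and the page-tracking stubs are hypothetical placeholders rather than code from this repository.

# Minimal usage sketch of DataframeEditor, based only on the constructor and
# methods visible in the diff above (dataframe_editor, reconstruct_edited_dataframe).
# The 'example__' keys, the default contents, and the page-tracking values are
# hypothetical placeholders for illustration.
import pandas as pd
import streamlit as st
import streamlit_dataframe_editor as sde

# dataframe_editor() compares the current and previous page names held in the
# session state; a real app manages these elsewhere, so they are stubbed here
# only to keep the sketch self-contained
if 'current_page_name' not in st.session_state:
    st.session_state['current_page_name'] = 'example_page'
if 'previous_page_name' not in st.session_state:
    st.session_state['previous_page_name'] = 'example_page'

# Instantiate the editor once and keep it in the session state so its contents
# persist across Streamlit reruns and page switches
if 'example__de_demo' not in st.session_state:
    st.session_state['example__de_demo'] = sde.DataframeEditor(
        df_name='example__df_demo',
        default_df_contents=pd.DataFrame({'File or directory name': ['sample.csv'], 'Selected': [False]}))

# Render the data editor on the current page
st.session_state['example__de_demo'].dataframe_editor(dynamic_rows=False, hide_index=True)

# Read the user's edits back as a plain dataframe, mirroring the
# reconstruct_edited_dataframe() calls in platform_io.py
df_edited = st.session_state['example__de_demo'].reconstruct_edited_dataframe()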