Refactor for Python and dependency upgrades.

tymorrow committed Aug 15, 2024
1 parent 3c3a655 commit f8f5088
Showing 26 changed files with 509 additions and 694 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/build-and-run-tests.yml
@@ -7,8 +7,8 @@ jobs:
   build:
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-        os: [ubuntu-latest, windows-latest, macos-13]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        os: [ubuntu-latest, windows-latest, macos-latest]
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout
@@ -19,6 +19,12 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
           cache-dependency-path: "**/pyproject.toml"
+      - name: Install HDF5 (macOS only)
+        if: runner.os == 'macOS'
+        run: brew install hdf5
+      - name: Set HDF5_DIR environment variable (macOS only)
+        if: runner.os == 'macOS'
+        run: echo "HDF5_DIR=$(brew --prefix hdf5)" >> $GITHUB_ENV
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
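The macOS-only steps above are presumably needed because packages such as `tables` build against a local HDF5 library when no prebuilt wheel applies, and Homebrew's HDF5 is not on the default search path. A rough local equivalent of those two steps (a sketch; the prefix varies by machine):

```sh
# Install the HDF5 C library, then tell source builds where to find it.
brew install hdf5
export HDF5_DIR="$(brew --prefix hdf5)"   # e.g., /opt/homebrew/opt/hdf5 on Apple Silicon
```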
10 changes: 8 additions & 2 deletions .github/workflows/run-examples.yml
@@ -7,8 +7,8 @@ jobs:
   build:
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-        os: [ubuntu-latest, windows-latest, macos-13]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        os: [ubuntu-latest, windows-latest, macos-latest]
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout
@@ -19,6 +19,12 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
           cache-dependency-path: "**/pyproject.toml"
+      - name: Install HDF5 (macOS only)
+        if: runner.os == 'macOS'
+        run: brew install hdf5
+      - name: Set HDF5_DIR environment variable (macOS only)
+        if: runner.os == 'macOS'
+        run: echo "HDF5_DIR=$(brew --prefix hdf5)" >> $GITHUB_ENV
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip setuptools wheel
11 changes: 5 additions & 6 deletions README.md
@@ -11,23 +11,22 @@ PyRIID is a Python package providing modeling and data synthesis utilities for m
 
 Requirements:
 
-- Python version: 3.8 to 3.10
+- Python version: 3.9 to 3.12
 - Note: we recommend the highest Python version you can manage; anecdotally, everything tends to run faster.
 - Operating systems: Windows, Mac, or Ubuntu
 
 A virtual environment is recommended.
 
 Tests and examples are run via GitHub Actions on many combinations of Python version and operating system.
 You can verify support for your platform by checking the workflow files.
 
 ### For Use
 
-To use the latest version on PyPI (note: changes are slower to appear here), run:
+To use the latest version on PyPI, run:
 
 ```sh
 pip install riid
 ```
 
-**For the latest features, run:**
+Note that changes are slower to appear on PyPI, so for the latest features, run:
 
 ```sh
 pip install git+https://github.com/sandialabs/pyriid.git@main
@@ -86,7 +85,7 @@ Full copyright details can be found [here](https://github.com/sandialabs/PyRIID/
 ## Acknowledgements
 
 **Thank you** to the U.S. Department of Energy, National Nuclear Security Administration,
-Office of Defense Nuclear Nonproliferation Research and Development (DNN R&D) for funding that has led to version `2.x`.
+Office of Defense Nuclear Nonproliferation Research and Development (DNN R&D) for funding that has led to versions `2.0` and `2.1`.
 
 Additionally, **thank you** to the following individuals who have provided invaluable subject-matter expertise:
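Whichever of the two install paths in the README is used, the result can be checked with plain pip:

```sh
pip show riid   # reports the installed version and install location
```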
6 changes: 4 additions & 2 deletions examples/data/synthesis/mix_seeds.py
@@ -2,14 +2,16 @@
 # Under the terms of Contract DE-NA0003525 with NTESS,
 # the U.S. Government retains certain rights in this software.
 """This example demonstrates how to generate synthetic gamma spectra from seeds."""
+import numpy as np
 from riid.data.synthetic import get_dummy_seeds
 from riid.data.synthetic.seed import SeedMixer
 
 fg_seeds_ss, bg_seeds_ss = get_dummy_seeds().split_fg_and_bg()
 
-mixed_fg_seeds_ss = SeedMixer(fg_seeds_ss, mixture_size=2)\
+rng = np.random.default_rng(3)
+mixed_fg_seeds_ss = SeedMixer(fg_seeds_ss, mixture_size=2, rng=rng)\
     .generate(n_samples=10)
-mixed_bg_seeds_ss = SeedMixer(bg_seeds_ss, mixture_size=3)\
+mixed_bg_seeds_ss = SeedMixer(bg_seeds_ss, mixture_size=3, rng=rng)\
     .generate(n_samples=10)
 
 print(mixed_fg_seeds_ss)
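The new `rng` argument makes the mixing reproducible. A minimal sketch of the underlying NumPy behavior (plain `Generator` draws standing in for whatever sampling `SeedMixer` does internally):

```python
import numpy as np

# Generators created with the same seed yield identical sequences, so any
# consumer of the generator (such as SeedMixer above) is reproducible.
rng_a = np.random.default_rng(3)
rng_b = np.random.default_rng(3)
assert np.array_equal(rng_a.random(5), rng_b.random(5))
```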
4 changes: 2 additions & 2 deletions examples/modeling/arad.py
@@ -14,7 +14,7 @@
 # Config
 rng = np.random.default_rng(42)
 OOD_QUANTILE = 0.99
-VERBOSE = True
+VERBOSE = False
 # Some of the following parameters are set low because this example runs on GitHub Actions and
 # we don't want it taking a bunch of time.
 # When running this locally, change the values per their corresponding comment, otherwise
@@ -54,7 +54,7 @@
 arad.predict(gross_train_ss)
 ood_threshold = np.quantile(gross_train_ss.info.recon_error, OOD_QUANTILE)
 
-reconstructions = arad.predict(test_ss, verbose=True)
+reconstructions = arad.predict(test_ss, verbose=VERBOSE)
 ood = test_ss.info.recon_error.values > ood_threshold
 false_positive_rate = ood.mean()
 mean_recon_error = test_ss.info.recon_error.values.mean()
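The surrounding example implements plain quantile-based out-of-distribution detection: flag any test sample whose reconstruction error exceeds the `OOD_QUANTILE` level of the training errors. A self-contained sketch with synthetic stand-in errors (not PyRIID's API):

```python
import numpy as np

OOD_QUANTILE = 0.99
rng = np.random.default_rng(42)

# Stand-ins for recon_error values; a gamma gives a plausibly skewed error shape.
train_errors = rng.gamma(shape=2.0, scale=0.01, size=1000)
test_errors = rng.gamma(shape=2.0, scale=0.012, size=200)

ood_threshold = np.quantile(train_errors, OOD_QUANTILE)
ood = test_errors > ood_threshold
print(f"flagged fraction: {ood.mean():.3f}")  # the false positive rate if test data is in-distribution
```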
7 changes: 4 additions & 3 deletions examples/modeling/arad_latent_prediction.py
@@ -5,16 +5,17 @@
 from an ARAD latent space.
 """
 import numpy as np
+from keras.api.metrics import Accuracy, CategoricalCrossentropy
 from sklearn.metrics import f1_score, mean_squared_error
 
 from riid.data.synthetic import get_dummy_seeds
 from riid.data.synthetic.seed import SeedMixer
 from riid.data.synthetic.static import StaticSynthesizer
-from riid.models.neural_nets.arad import ARADv2, ARADLatentPredictor
+from riid.models.neural_nets.arad import ARADLatentPredictor, ARADv2
 
 # Config
 rng = np.random.default_rng(42)
-VERBOSE = True
+VERBOSE = False
 # Some of the following parameters are set low because this example runs on GitHub Actions and
 # we don't want it taking a bunch of time.
 # When running this locally, change the values per their corresponding comment, otherwise
@@ -66,7 +67,7 @@
 print("Training Classifier")
 arad_classifier = ARADLatentPredictor(
     loss="categorical_crossentropy",
-    metrics=("accuracy", "categorical_crossentropy"),
+    metrics=[Accuracy(), CategoricalCrossentropy()],
     final_activation="softmax"
 )
 arad_classifier.fit(
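The switch from metric name strings to metric objects matches the style of the Keras 3 API that ships with TensorFlow 2.16. Metric objects can also be exercised standalone, which is a quick way to confirm what will be reported during training; a small sketch using only the import added above:

```python
from keras.api.metrics import CategoricalCrossentropy

# Metric objects accumulate state across update_state() calls.
metric = CategoricalCrossentropy()
metric.update_state([[0.0, 1.0]], [[0.1, 0.9]])
print(float(metric.result()))  # cross-entropy for the single batch
```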
4 changes: 2 additions & 2 deletions examples/modeling/classifier_comparison.py
@@ -36,8 +36,8 @@
 train_fg_ss, _ = static_synth.generate(fg_seeds_ss, mixed_bg_seed_ss, verbose=False)
 train_fg_ss.normalize()
 
-model_nn = MLPClassifier(hidden_layers=(5,))
-model_nn.fit(train_fg_ss, epochs=10, patience=5, verbose=1)
+model_nn = MLPClassifier()
+model_nn.fit(train_fg_ss, epochs=10, patience=5)
 
 # Create PB model
 model_pb = PoissonBayesClassifier()
1 change: 0 additions & 1 deletion examples/modeling/label_proportion_estimation.py
@@ -55,7 +55,6 @@
     batch_size=10,
     epochs=2,
     validation_split=0.2,
-    verbose=True,
     bg_cps=300
 )
 
74 changes: 0 additions & 74 deletions examples/modeling/multi_event_classifier.py

This file was deleted.

2 changes: 1 addition & 1 deletion examples/modeling/neural_network_classifier.py
@@ -24,7 +24,7 @@
 train_ss.normalize()
 
 model = MLPClassifier()
-model.fit(train_ss, epochs=10, patience=5, verbose=1)
+model.fit(train_ss, epochs=10, patience=5)
 
 # Generate some test data
 static_synth.samples_per_seed = 50
2 changes: 1 addition & 1 deletion examples/visualization/confusion_matrix.py
@@ -26,7 +26,7 @@
     .generate(fg_seeds_ss, mixed_bg_seed_ss)
 train_ss.normalize()
 
-model = MLPClassifier(hidden_layers=(8,))
+model = MLPClassifier()
 model.fit(train_ss, verbose=0, epochs=50)
 
 # Generate some test data
51 changes: 25 additions & 26 deletions pyproject.toml
@@ -12,7 +12,7 @@ namespaces = false
 [project]
 name = "riid"
 description = "Machine learning-based models and utilities for radioisotope identification"
-version = "2.1.0"
+version = "2.2.0"
 maintainers = [
     {name="Tyler Morrow", email="[email protected]"},
 ]
@@ -41,44 +41,43 @@ classifiers = [
     'Topic :: Software Development',
     'Topic :: Software Development :: Libraries',
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Programming Language :: Python',
     'Programming Language :: Python :: 3',
     'Programming Language :: Python :: 3 :: Only',
-    'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
     'Programming Language :: Python :: 3.10',
+    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.12',
 ]
 keywords = ["pyriid", "riid", "machine learning", "radioisotope identification", "gamma spectrum"]
 
-requires-python = ">=3.8,<3.11"
+requires-python = ">=3.9,<3.13"
 dependencies = [
-    "jsonschema ==4.17.*",
-    "matplotlib ==3.7.*",
-    "pyyaml ==6.0.*",
-    "seaborn ==0.12.*",
-    "tf2onnx ==1.14.*",
-    "onnx ==1.14.1",
-    "tqdm ==4.65.*",
-    "numpy ==1.23.*",
-    "pandas ==2.0.*",
-    "parmap ==1.6.*",
-    "pythonnet ==3.0.*; platform_system == 'Windows'",
-    "tables ==3.8.*",
-    "tensorflow ==2.12.*",
-    "tensorflow-io ==0.27.*",
-    "tensorflow-model-optimization ==0.7.*",
-    "tensorflow-probability ==0.20.*",
-    "typeguard ==2.7.*",
-    "scikit-learn ==1.2.*",
+    "jsonschema ==4.23.*",  # 3.8 - 3.13
+    "matplotlib ==3.9.*",  # 3.9 - 3.12
+    "numpy ==1.26.*",  # 3.9 - 3.12, also to be limited by onnx 1.16.2
+    "pandas ==2.2.*",  # >= 3.9
+    "pythonnet ==3.0.3; platform_system == 'Windows'",  # 3.7 - 3.12
+    "pyyaml ==6.0.*",  # >= 3.6
+    "tables ==3.9.*",  # >= 3.9
+    "scikit-learn ==1.5.*",  # 3.9 - 3.12
+    "scipy ==1.13.*",  # >= 3.10
+    "seaborn ==0.13.*",  # >= 3.8
+    "tensorflow ==2.16.*",  # 3.9 - 3.12
+    "tensorflow-model-optimization ==0.8.*",  # 3.7 - 3.12
+    "onnx ==1.16.1",  # 3.7 - 3.10
+    "tf2onnx ==1.16.1",  # 3.7 - 3.10
+    "tqdm ==4.66.*",  # >= 3.7
+    "typeguard ==4.3.*",  # 3.9 - 3.12
 ]

 [project.optional-dependencies]
 dev = [
-    "flake8",
-    "flake8-quotes",
     "coverage",
     "ipykernel",
+    "flake8",
+    "flake8-quotes",
     "tabulate",
 ]

 [project.urls]
-repository = "https://github.com/sandialabs/PyRIID"
+Documentation = "https://sandialabs.github.io/PyRIID"
+Repository = "https://github.com/sandialabs/PyRIID"
25 changes: 8 additions & 17 deletions riid/data/converters/__init__.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 from typing import Callable
 
-import parmap as pm
+from joblib import Parallel, delayed
 
 
 def _validate_and_create_output_dir(output_dir: str):
@@ -17,8 +17,8 @@ def _validate_and_create_output_dir(output_dir: str):
 
 
 def convert_directory(input_dir_path: str, conversion_func: Callable, file_ext: str,
-                      pm_processes: int = 8, pm_chunksize: int = 1, **kwargs):
-    """Convert and save every file in a specified directory in parallel.
+                      n_jobs: int = 8, **kwargs):
+    """Convert and save every file in a specified directory.
 
     Conversion functions can be found in sub-modules:
@@ -32,16 +32,14 @@ def convert_directory(input_dir_path: str, conversion_func: Callable, file_ext:
         convert_directory(...)
         ```
 
-    Consider setting `pm_processes` to `multiprocessing.cpu_count()`;
-    unfortunately, `pm_chunksize` requires some experimentation to fully optimize.
+    Tip: for max utilization, consider setting `n_jobs` to `multiprocessing.cpu_count()`.
     Args:
         input_dir_path: directory path containing the input files
         conversion_func: function used to convert a data file to a `SampleSet`
         file_ext: file extension to read in for conversion
-        pm_processes: parmap parameter to set the # of processes
-        pm_chunksize: parmap parameter to set the chunksize
-        kwargs: keyword args passed to underlying conversion_func operations
+        n_jobs: `joblib.Parallel` parameter to set the # of jobs
+        kwargs: additional keyword args passed to conversion_func
     """
     input_path = Path(input_dir_path)
     if not input_path.exists() or not input_path.is_dir():
@@ -50,13 +48,6 @@ def convert_directory(input_dir_path: str, conversion_func: Callable, file_ext:
 
     input_file_paths = sorted(glob.glob(f"{input_dir_path}/*.{file_ext}"))
 
-    x = pm.map(
-        conversion_func,
-        input_file_paths,
-        **kwargs,
-        pm_processes=pm_processes,
-        pm_chunksize=pm_chunksize,
-        pm_parallel=True,
-        pm_pbar=True,
+    Parallel(n_jobs, verbose=10)(
+        delayed(conversion_func)(path, **kwargs) for path in input_file_paths
     )
-    return x
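For readers unfamiliar with joblib: `delayed` wraps the callable so `Parallel` can fan calls out across worker processes, which is all the replacement above does. A toy sketch of the same pattern:

```python
from joblib import Parallel, delayed

def square(x):
    # Stand-in for conversion_func; any picklable callable works.
    return x * x

# n_jobs sets the number of workers; verbose=10 prints progress, as above.
results = Parallel(n_jobs=2, verbose=10)(delayed(square)(i) for i in range(8))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]
```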