Skip to content

Commit

Permalink
Add Parallel File Parsing and Improve testing (#31)
Browse files Browse the repository at this point in the history
* Add parallel file parsing (can be turned off with OPPPY_USE_THREADS=False)

* try/except multiprocessing availability

* add multiprocessing to the dependencies

* temporarily remove python 3.11 testing

* add log plotting for 2d contours

* fix testing to actually... test

* fix interactive testing and improve coverage for -ls options in 2d contours

* add hdf h5py package to dependencies

* fix math syntax

* more math syntax fixes

* add serial file parsing tests

* update tests to use gold standards

---------

Co-authored-by: Cleveland <[email protected]>
  • Loading branch information
clevelam and Cleveland authored Jan 9, 2024
1 parent 790cc69 commit e852606
Show file tree
Hide file tree
Showing 29 changed files with 625 additions and 124 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --user pytest-cov numpy matplotlib scipy argparse
pip install --user pytest-cov numpy matplotlib scipy argparse h5py
## - name: flake
## run: |
## # stop the build if there are Python syntax errors or undefined names
Expand Down
70 changes: 56 additions & 14 deletions opppy/dump_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@
'''

from numpy import *
import os
import sys
import pickle
import math
from multiprocessing import Process, Manager

from opppy.progress import progress

USE_THREADS = os.getenv("OPPPY_USE_THREADS", 'True').lower() in ('true', '1', 't')

def point_value_1d(data, x_key, value_key, x_value, method='nearest'):
'''
Grid data function. This function takes a 1D data structure from dictionary
Expand Down Expand Up @@ -114,7 +119,7 @@ def point_value_3d(data, x_key, y_key, z_key, value_key, x_value, y_value, z_val



def data2grid(data, x_key, y_key, value_key, npts=500, method='nearest'):
def data2grid(data, x_key, y_key, value_key, npts=500, method='nearest', log_scale=False):
'''
This function takes a 2D data structure from dictionary and creates a 2D
grid for each array by interpolating. This is useful for plotting.
Expand All @@ -138,14 +143,18 @@ def data2grid(data, x_key, y_key, value_key, npts=500, method='nearest'):
grid_data = {}
value = data[value_key]
grid_data[value_key] = griddata((X, Y), value, (xi, yi), method).T
if(log_scale):
grid_data[value_key] = [[0.0 if val<=0.0 else math.log10(val) for val in vals] for vals in
grid_data[value_key]]
grid_data[x_key] = xi
grid_data[y_key] = yi

return grid_data



def data2gridbox(data, x_key, y_key, value_key, xmin, ymin, xmax, ymax,npts=500, method='nearest'):
def data2gridbox(data, x_key, y_key, value_key, xmin, ymin, xmax, ymax,npts=500, method='nearest',
log_scale=False):
'''
This function takes a 2D data structure from a data dictionary and creates
a 2D grid for each array by interpolating in a user defined region.
Expand Down Expand Up @@ -174,14 +183,18 @@ def data2gridbox(data, x_key, y_key, value_key, xmin, ymin, xmax, ymax,npts=500,
grid_data = {}
value = data[value_key]
grid_data[value_key] = griddata((X, Y), value, (xi, yi), method).T
if(log_scale):
grid_data[value_key] = [[0.0 if val<=0.0 else math.log10(val) for val in vals] for vals in
grid_data[value_key]]
grid_data[x_key] = xi
grid_data[y_key] = yi

return grid_data



def data2grid3Dslice(data, x_key, y_key, z_key, value_key, z_slice_value, npts=500,method='nearest'):
def data2grid3Dslice(data, x_key, y_key, z_key, value_key, z_slice_value, npts=500,method='nearest',
log_scale=False):
'''
This function takes a 3D data structure from a data dictionary and creates
a 2D grid for each array by interpolating. This is useful for plotting.
Expand All @@ -206,6 +219,9 @@ def data2grid3Dslice(data, x_key, y_key, z_key, value_key, z_slice_value, npts=5
grid_data = {}
V = data[value_key]
grid_data[value_key] = griddata((X, Y, Z), V, (xi, yi, zi), method).T[0]
if(log_scale):
grid_data[value_key] = [[0.0 if val<=0.0 else math.log10(val) for val in vals] for vals in
grid_data[value_key]]
grid_data[x_key] = xi.T[0]
grid_data[y_key] = yi.T[0]

Expand Down Expand Up @@ -399,7 +415,8 @@ def extract_series_line(data_list,series_key,value_key,dim_keys,point0_values,po

return t, grid

def extract_series_2d(data_list, series_key, value_key, dim_keys, npts=500, method='nearest', box=[]):
def extract_series_2d(data_list, series_key, value_key, dim_keys, npts=500, method='nearest',
log_scale=False, box=[]):
'''
This function extracts the data values along a specified line from a
series of data dictionaries.
Expand All @@ -423,16 +440,18 @@ def extract_series_2d(data_list, series_key, value_key, dim_keys, npts=500, meth
for data in data_list:
T.append(data[series_key])
if len(box) == 0:
grid.append(data2grid(data, dim_keys[0], dim_keys[1], value_key, npts, method))
grid.append(data2grid(data, dim_keys[0], dim_keys[1], value_key, npts, method, log_scale))
else:
grid.append(data2gridbox(data, dim_keys[0], dim_keys[1], value_key, box[0], box[1], box[2], box[3],npts,method))
grid.append(data2gridbox(data, dim_keys[0], dim_keys[1], value_key, box[0], box[1],
box[2], box[3],npts,method, log_scale))

t = {}
t[series_key] = array(T)

return t, grid

def extract_series_2d_slice(data_list,series_key,value_key,dim_keys, slice_value, npts=500, method='nearest'):
def extract_series_2d_slice(data_list,series_key,value_key,dim_keys, slice_value, npts=500,
method='nearest', log_scale=False):
'''
This function extracts the data values along a specified line from a
series of data dictionaries.
Expand All @@ -459,7 +478,8 @@ def extract_series_2d_slice(data_list,series_key,value_key,dim_keys, slice_value
print("Error: series_key dictionary item must return a single value (i.e. cycle or time)")
sys.exit(0)

grid.append(data2grid3Dslice(data, dim_keys[0], dim_keys[1], dim_keys[2],value_key, slice_value, npts, method))
grid.append(data2grid3Dslice(data, dim_keys[0], dim_keys[1], dim_keys[2],value_key,
slice_value, npts, method, log_scale))

t = {}
t[series_key] = array(T)
Expand All @@ -468,7 +488,10 @@ def extract_series_2d_slice(data_list,series_key,value_key,dim_keys, slice_value

def append_dumps(data, dump_files, opppy_parser, key_words=None):
'''
Append output data from a list of output_files to a user provided dictionary using a user proved opppy_parser
Append output data from a list of output_files to a user provided dictionary using a user proved
opppy_parser. By default this function will use the multiprocessing option to parallelize the
parsing of multiple dumps. The parallel parsing can be disabled by setting
the environment variable 'OPPPY_USE_THREADS=False'
Input options:
data opppy input dictionary to be append to (must have a 'verion' opppy key)
Expand All @@ -477,13 +500,32 @@ def append_dumps(data, dump_files, opppy_parser, key_words=None):
append_date bool to specify if the data should be appended to the file
name for tracking purposes
'''

total = len(dump_files)
count = 0
for dump in dump_files:
# append new dictionary data to the pickle file
data[dump.split('/')[-1]] = opppy_parser.build_data_dictionary(dump,key_words)
count += 1
progress(count,total, 'of dump files read')
print('')
print("Number of files to be read: ", total)
if(USE_THREADS):
def thread_all(file_name, key_words, result_d):
result_d[file_name.split('/')[-1]] = opppy_parser.build_data_dictionary(file_name,key_words)
with Manager() as manager:
result_d = manager.dict()
threads = []
for file_name in dump_files:
thread = Process(target=thread_all, args=(file_name, key_words, result_d,))
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
count += 1
progress(count,total, 'of input files read')
data.update(result_d)
else:
for dump in dump_files:
# append new dictionary data to the pickle file
data[dump.split('/')[-1]] = opppy_parser.build_data_dictionary(dump,key_words)
count += 1
progress(count,total, 'of dump files read')

print('')
print('')
Expand Down
24 changes: 18 additions & 6 deletions opppy/interactive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,11 +807,15 @@ def plot_series_contour(self, args):
if len(args.dimension_keys) != 3:
print('Error: z_slice_location specified so length of dimension_keys must be 3')
sys.exit(0)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.z_slice_location, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.z_slice_location,
args.number_of_points, args.interpolation_method, args.log_scale)
else:
if len(args.dimension_keys) != 2:
print('Error: z_slice_location specified is not specified so length of dimension_keys must be 2')
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.number_of_points,
args.interpolation_method, args.log_scale)
series_data = series_pair(tracer_t, tracer_grid)
elif args.pickle_file is not None:
dictionary = pickle.load(open(args.pickle_file,'rb'))
Expand All @@ -825,11 +829,15 @@ def plot_series_contour(self, args):
if len(args.dimension_keys) != 3:
print('Error: z_slice_location specified so length of dimension_keys must be 3')
sys.exit(0)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.z_slice_location, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.z_slice_location,
args.number_of_points, args.interpolation_method, args.log_scale)
else:
if len(args.dimension_keys) != 2:
print('Error: z_slice_location specified is not specified so length of dimension_keys must be 2')
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.number_of_points,
args.interpolation_method, args.log_scale)
series_data = series_pair(tracer_t, tracer_grid)
if args.case_file is not None:
dictionary_list = build_case_data_list(args.case_file, None, self.dump_parser, args.key_words)
Expand All @@ -839,11 +847,15 @@ def plot_series_contour(self, args):
if len(args.dimension_keys) != 3:
print('Error: z_slice_location specified so length of dimension_keys must be 3')
sys.exit(0)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.z_slice_location, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d_slice(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.z_slice_location,
args.number_of_points, args.interpolation_method, args.log_scale)
else:
if len(args.dimension_keys) != 2:
print('Error: z_slice_location specified is not specified so length of dimension_keys must be 2')
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key, args.data_name, args.dimension_keys, args.number_of_points, args.interpolation_method)
tracer_t, tracer_grid = extract_series_2d(dictionary_list, args.series_key,
args.data_name, args.dimension_keys, args.number_of_points,
args.interpolation_method, args.log_scale)
series_data = series_pair(tracer_t, tracer_grid)

args.x_value_name= args.dimension_keys[0]
Expand Down
63 changes: 48 additions & 15 deletions opppy/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,14 @@
import sys
import pickle
import io
import os
import numpy as np
from multiprocessing import Process, Manager

from opppy.version import __version__
from opppy.progress import *

USE_THREADS = os.getenv("OPPPY_USE_THREADS", 'True').lower() in ('true', '1', 't')

def append_cycle_data(cycle_data, data, sort_key_string):
'''
Expand Down Expand Up @@ -312,7 +315,11 @@ def extract_cycle_data(cycle_string, my_opppy_parser):

def append_output_dictionary(data, output_files, opppy_parser, append_date=False):
'''
Append output data from a list of output_files to a user provided dictionary using a user proved opppy_parser
Append output data from a list of output_files to a user provided dictionary using a user proved
opppy_parser. By default this function will use the multiprocessing option to parallelize the
parsing of multiple dumps. The parallel parsing can be disabled by setting
the environment variable 'OPPPY_USE_THREADS=False'
arguments:
data opppy input dictionary to be append to (must have a 'verion' opppy key)
Expand All @@ -330,33 +337,59 @@ def append_output_dictionary(data, output_files, opppy_parser, append_date=False
print("This data dictionary has no version")
print("This version of OPPPY is ", __version__)
sys.exit(0)

time = ''
if append_date:
time = time+'.'+datetime.datetime.now().strftime ("%Y%m%d%H%M%S")
count = 0
total = len(output_files)
print('')
print("Number of files to be read: ", total)
cycle_string_list=[]
data_list = []
if(USE_THREADS):
def thread_all(file_name, result_d):
thread_cycle_string_list = get_output_lines(file_name, opppy_parser.cycle_opening_string,
opppy_parser.cycle_closing_string, opppy_parser.file_end_string);
thread_data = []
for cycle_string in thread_cycle_string_list:
thread_data.append(extract_cycle_data(cycle_string, opppy_parser))
result_d[file_name]=thread_data
with Manager() as manager:
result_d = manager.dict()
threads = []
for file_name in output_files:
thread = Process(target=thread_all, args=(file_name, result_d,))
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
count += 1
progress(count,total, 'of input files read')
for file_name in output_files:
data_list += result_d[file_name]
else:
cycle_string_list=[]
for file_name in output_files:
cycle_string_list+=get_output_lines(file_name, opppy_parser.cycle_opening_string, opppy_parser.cycle_closing_string, opppy_parser.file_end_string)
count += 1
progress(count,total, 'of input files read')

count = 0
total = len(cycle_string_list)
print('')
print("Number of cycles to be parsed: ", total)
for cycle_string in cycle_string_list:
data_list.append(extract_cycle_data(cycle_string, opppy_parser))
count += 1
progress(count,total, 'of cycles parsed')
print('')

for file_name in output_files:
cycle_string_list+=get_output_lines(file_name, opppy_parser.cycle_opening_string, opppy_parser.cycle_closing_string, opppy_parser.file_end_string)
if 'appended_files' in data:
data['appended_files'].append(file_name.split('/')[-1]+time)
else:
data['appended_files'] = [file_name.split('/')[-1]+time]
count += 1
progress(count,total, 'of input files read')

total = len(cycle_string_list)
count = 0
print('')
print("Number of cycles to be parsed: ", total)
for cycle_string in cycle_string_list:
cycle_data = extract_cycle_data(cycle_string, opppy_parser)
for cycle_data in data_list:
data = append_cycle_data(cycle_data,data,opppy_parser.sort_key_string)
count += 1
progress(count,total, 'of cycles parsed')

print('')
print('')
Expand Down
2 changes: 2 additions & 0 deletions opppy/plot_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from math import *
import argparse
import shlex
import warnings

from opppy.plotting_help import *

Expand Down Expand Up @@ -264,6 +265,7 @@ def plot_dict(self, args, dictionaries, data_names):
fig = PyPloter.savefig(args.figure_name, dpi=args.figure_resolution)
print("Plot save as -- "+args.figure_name)
elif(not args.hide_plot):
warnings.filterwarnings("ignore")
PyPloter.show()


Loading

0 comments on commit e852606

Please sign in to comment.