Cleanup and documentation

azadeh-gh · azadeh-gh · commit 675bf7dceec8 · 2025-12-31T21:33:55.000Z
diff --git a/gnn_model/evaluations.py b/gnn_model/evaluations.py
@@ -1,3 +1,49 @@
+"""
+OCELOT Model Evaluation and Visualization Suite
+
+Author: Azadeh Gholoubi
+Organization: NOAA/NCEP/EMC
+
+Description:
+    This module provides comprehensive evaluation and visualization tools for the OCELOT
+    (Observation-Centered Earth Learning Observation Transformer) graph neural network model.
+    It includes specialized diagnostics for multiple observation types including satellite
+    radiances, conventional surface observations, and atmospheric soundings (radiosondes).
+
+Key Features:
+    - Multi-panel geospatial visualization of predictions vs. truth with difference maps
+    - Radiosonde-specific diagnostics including vertical profile analysis by atmospheric layer
+    - Pressure-stratified error analysis (surface, mid-troposphere, upper atmosphere)
+    - Statistical metrics: RMSE, bias, R², sMAPE, MAE with percentile-based robust estimates
+    - Quality control integration with observation masking
+    - Channel-wise evaluation for multi-channel instruments (ATMS, AMSU-A, SSMIS, etc.)
+
+Main Functions:
+    - plot_ocelot_target_diff: 3-panel maps showing prediction, truth, and difference
+    - plot_radiosonde_by_layer: Scatter plots stratified by atmospheric layers
+    - plot_radiosonde_pressure_distribution: Vertical distribution of observations
+    - plot_radiosonde_error_vs_pressure: Error profiles as a function of pressure
+    - print_radiosonde_layer_stats: Tabular statistics for each atmospheric layer
+    - plot_instrument_maps: Generic instrument evaluation with geographic context
+
+Technical Notes:
+    - Handles missing data and QC-failed observations through mask columns
+    - Uses robust statistics (percentiles) to handle outliers
+    - Supports multiple projection systems via Cartopy
+    - Generates publication-quality figures with proper units and annotations
+    - Implements symmetric difference scaling for intuitive visualization
+
+Usage:
+    Called during model validation/testing phases to generate diagnostic plots
+    and statistical summaries. Reads CSV files exported during validation and
+    produces figures in the specified output directory.
+
+Dependencies:
+    - numpy, pandas: Data manipulation
+    - matplotlib: Plotting and visualization
+    - cartopy: Geospatial projections and map features
+"""
+
 import os
 import numpy as np
 import pandas as pd
@@ -237,9 +283,6 @@ def plot_radiosonde_by_layer(
     fig_dir: str = PLOT_DIR,
 ):
     """
-    Radiosonde-specific evaluation with layer stratification.
-    Validates FIX 1, 2, 3 are working correctly.
-
     Creates scatter plots of predicted vs true values for each atmospheric layer:
     - Surface (850-1200 hPa)
     - Mid-troposphere (400-850 hPa)
@@ -255,12 +298,12 @@ def plot_radiosonde_by_layer(
 
     os.makedirs(fig_dir, exist_ok=True)
 
-    # Check FIX 2: pressure_normalized should be present
+    # pressure_normalized should be present
     if 'pressure_normalized' in df.columns:
-        print("✅ FIX 2 ACTIVE: pressure_normalized metadata found")
+        print("pressure_normalized metadata found")
     else:
-        print("⚠️  WARNING: pressure_normalized not found in CSV!")
-        print("   Note: FIX 2 may still be active during training, but metadata")
+        print("WARNING: pressure_normalized not found in CSV!")
+        print("   Note: may still be active during training, but metadata")
         print("         is not exported to validation CSVs. This is expected.")
 
     # Define layers matching FIX 3 configuration
@@ -363,7 +406,7 @@ def plot_radiosonde_pressure_distribution(
 ):
     """
     Visualize distribution of radiosonde observations across pressure levels.
-    Validates FIX 1 (nearest matching) is working - should see ~80% retention.
+    Validates that nearest-level matching is working - should see ~80% retention.
     """
     filepath = f"{data_dir}/val_radiosonde_target_epoch{epoch}_batch{batch_idx}_step0.csv"
     try:
@@ -387,7 +430,7 @@ def plot_radiosonde_pressure_distribution(
 
     fig, axes = plt.subplots(1, 2, figsize=(14, 5))
     fig.suptitle(f'Radiosonde Pressure Distribution - Epoch {epoch}\n'
-                 f'Validates FIX 1 (Nearest-Level Matching)', fontsize=14)
+                 f'Validates Nearest-Level Matching', fontsize=14)
 
     # Histogram of pressure distribution
     axes[0].hist(pressure_valid, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
@@ -451,9 +494,9 @@ def plot_radiosonde_pressure_distribution(
         retention_pct = 100 * total / total_rows
         print(f"    Data retention: {retention_pct:.1f}% ({total:,}/{total_rows:,})")
         if retention_pct > 70:
-            print(f"    ✅ FIX 1 WORKING: High retention (~{retention_pct:.0f}% vs expected ~80%)")
+            print(f"    FIX 1 WORKING: High retention (~{retention_pct:.0f}% vs expected ~80%)")
         else:
-            print(f"    ⚠️  WARNING: Low retention ({retention_pct:.1f}%), FIX 1 may not be active")
+            print(f"    WARNING: Low retention ({retention_pct:.1f}%), FIX 1 may not be active")
 
 
 def print_radiosonde_layer_stats(
@@ -485,7 +528,7 @@ def print_radiosonde_layer_stats(
 
     print(f"\n{'='*80}")
     print(f"RADIOSONDE VALIDATION BY LAYER - Epoch {epoch}")
-    print(f"Validates FIX 3: Level-Specific Normalization")
+    print(f"Validates: Level-Specific Normalization")
     print(f"{'='*80}")
 
     for fname in ['airTemperature', 'dewPointTemperature', 'airPressure']:
@@ -535,19 +578,19 @@ def print_radiosonde_layer_stats(
 
             # WARNING: Check for catastrophically bad results
             if r2 < 0:
-                print(f"   ⚠️  WARNING: Negative R² indicates predictions worse than mean!")
+                print(f"   WARNING: Negative R² indicates predictions worse than mean!")
             if rmse > 10 and fname in ['airTemperature', 'dewPointTemperature']:
-                print(f"   ⚠️  WARNING: Very high RMSE (>{rmse:.1f}K) - check denormalization!")
+                print(f"   WARNING: Very high RMSE (>{rmse:.1f}K) - check denormalization!")
 
     print(f"\n{'='*80}")
-    print("✅ If FIX 3 is working correctly, RMSE should be similar across all layers")
+    print("If FIX 3 is working correctly, RMSE should be similar across all layers")
     print("   (uniform error means layer-specific normalization is effective)")
-    print(f"\n📊 EXPECTED PERFORMANCE (with all 3 fixes working):")
+    print(f"\n EXPECTED PERFORMANCE (with all 3 fixes working):")
     print(f"   • airTemperature RMSE:       ~2.0-2.5 K per layer")
     print(f"   • dewPointTemperature RMSE:  ~2.5-3.0 K per layer")
     print(f"   • airPressure RMSE:          ~12-20 hPa per layer")
     print(f"   • R² values:                 >0.98 for all layers")
-    print(f"\n⚠️  If seeing very high RMSE or negative R² values:")
+    print(f"\n  If seeing very high RMSE or negative R² values:")
     print(f"   1. Check if model was trained WITH the fixes enabled")
     print(f"   2. Verify observation_config.yaml has all 3 fixes active")
     print(f"   3. Check process_timeseries.py has FIX 1, 2, 3 implemented")
diff --git a/gnn_model/gnn_model.py b/gnn_model/gnn_model.py
@@ -1,3 +1,59 @@
+"""
+OCELOT Graph Neural Network Model Architecture
+
+Author: Azadeh Gholoubi
+Organization: NOAA/NCEP/EMC
+
+Description:
+    Core GNN architecture for the OCELOT model. Implements a heterogeneous graph neural network with PyTorch Lightning
+    for processing multi-instrument observational data on an icosahedral mesh structure.
+
+Architecture Components:
+    - Encoder: Maps heterogeneous observations to a common latent space on mesh nodes
+      * Supports InteractionNet or BipartiteGAT (Graph Attention Network)
+      * Configurable layers, attention heads, and dropout
+    
+    - Processor: Propagates information across the mesh graph
+      * InteractionNet: Message passing with edge features
+      * SlidingWindowTransformerProcessor: Temporal attention mechanism
+      * Configurable depth, attention heads, and dropout
+    
+    - Decoder: Projects mesh representations back to observation space for predictions
+      * Supports InteractionNet or BipartiteGAT
+      * Inverse-distance weighted aggregation for multi-connectivity
+      * Configurable layers and attention heads
+
+Key Features:
+    - Multi-instrument support (satellites, surface stations, radiosondes)
+    - Weighted loss function with per-instrument and per-channel weights
+    - Latent rollout for multi-step predictions
+    - Gradient checkpointing for memory efficiency
+    - Distributed training with PyTorch Lightning DDP
+    - Automatic mixed precision (FP16) support
+    - Comprehensive validation and CSV export for analysis
+
+Model Pipeline:
+    1. Encode: Observation features → Mesh latent representations
+    2. Process: Message passing on mesh graph (multiple steps)
+    3. Decode: Mesh → Target predictions at observation locations
+    4. Loss: Weighted Huber loss with instrument/channel priorities
+    5. Rollout: Optional multi-step autoregressive predictions
+
+Training Details:
+    - Optimizer: Adam with configurable learning rate
+    - Loss: Weighted Huber loss (robust to outliers)
+    - Regularization: LayerNorm, Dropout, gradient clipping
+    - Monitoring: Training/validation losses, per-instrument metrics
+    - Checkpointing: Model state, optimizer state, epoch tracking
+    - CSV Export: Predictions, targets, masks for offline evaluation
+
+Performance Optimizations:
+    - Gradient checkpointing to reduce memory usage
+    - Mixed precision training (FP16)
+    - Efficient scatter operations for aggregation
+    - Distributed data parallel (DDP) for multi-GPU training
+"""
+
 import lightning.pytorch as pl
 import os
 import time
diff --git a/gnn_model/process_timeseries.py b/gnn_model/process_timeseries.py
@@ -1,3 +1,40 @@
+"""
+Observation Data Processing and Feature Extraction Pipeline
+
+Author: Azadeh Gholoubi
+Organization: NOAA/NCEP/EMC
+
+Description:
+    Core data processing module for the OCELOT (Observation-Centered Earth Learning 
+    Observation Transformer) GNN model. Handles time-series binning, feature extraction,
+    normalization, and quality control for multi-instrument observational datasets including
+    satellites (ATMS, AMSU-A, SSMIS, SEVIRI, AVHRR, ASCAT) and conventional observations
+    (surface stations, radiosondes).
+
+Key Functions:
+    - organize_bins_times: Temporal binning of observations into input-target pairs with
+      support for latent rollout (multiple target sub-windows)
+    - extract_features: Extracts and normalizes features from zarr files with instrument-
+      specific QC, metadata handling, and level-specific normalization for radiosondes
+    - _normalize_by_level_groups: Pressure-stratified normalization for atmospheric soundings
+
+Special Features:
+    - Radiosonde pressure-level matching with configurable tolerance
+    - Pressure metadata augmentation for vertical context
+    - Layer-specific normalization (surface/mid/upper atmosphere)
+    - Chunked scanning for memory-efficient processing of large zarr datasets
+    - Reproducible subsampling with stable seeding
+    - Multi-channel support for satellite instruments
+    - Quality control integration (QC flags, mask propagation)
+
+Technical Details:
+    - Supports both single-year and multi-year zarr files
+    - Handles missing data, fill values, and sentinel values
+    - Implements cosine transformation for cyclic metadata (wind direction)
+    - Feature normalization using pre-computed statistics
+    - Efficient indexing with numpy advanced indexing
+"""
+
 import hashlib
 import numpy as np
 import pandas as pd
@@ -293,7 +330,7 @@ def _stats_from_cfg(feature_stats, inst_name, feat_keys):
 
 def _normalize_by_level_groups(features, pressures, feature_stats, inst_name, feat_keys):
     """
-    FIX 3: Apply level-specific normalization for radiosondes.
+    Apply level-specific normalization for radiosondes.
     Normalizes features separately for different atmospheric layers (surface, mid, upper).
 
     Args:
@@ -371,8 +408,6 @@ def extract_features(z_dict, data_summary, bin_name, observation_config, feature
     Adds per-channel masks for inputs and targets so features can be missing independently.
     Inputs: keep a row if ANY feature channel is valid; metadata can be missing (imputed later).
     Targets: require metadata row to be valid; features may be missing per-channel.
-
-    ## MODIFIED to support latent rollout (multiple target windows).
     """
     print(f"\nProcessing {bin_name}...")
     for obs_type in list(data_summary[bin_name].keys()):
@@ -401,7 +436,7 @@ def extract_features(z_dict, data_summary, bin_name, observation_config, feature
                 matching_mode = level_selection.get("matching_mode", "exact")
 
                 if matching_mode == "nearest":
-                    # FIX 1: Nearest-level matching with tolerance
+                    # Nearest-level matching with tolerance
                     tolerance = level_selection.get("tolerance_hpa", 50)
                     if input_idx.size:
                         p_vals = z[col][input_idx]
@@ -420,7 +455,7 @@ def extract_features(z_dict, data_summary, bin_name, observation_config, feature
                                 keep_mask_tg |= (np.abs(p_vals_tg - level) <= tolerance)
                             target_indices_list[i] = idx[keep_mask_tg]
                 else:
-                    # Original exact matching (fallback)
+                    # Exact matching (fallback)
                     if input_idx.size:
                         input_idx = input_idx[np.isin(z[col][input_idx], levels)]
                     for i, idx in enumerate(target_indices_list):
@@ -573,7 +608,7 @@ def _get_feature(arrs, name, idx):
             # Extract input features
             input_features_raw = np.column_stack([_get_feature(z, k, input_idx) for k in feat_keys]).astype(np.float32)
 
-            # FIX 2: Separate computed metadata from zarr-based metadata
+            # Separate computed metadata from zarr-based metadata
             # Computed metadata keys that we compute on-the-fly
             computed_meta_keys = {'pressure_normalized', 'log_pressure_height'}
             zarr_meta_keys = [k for k in meta_keys if k not in computed_meta_keys]
@@ -584,7 +619,7 @@ def _get_feature(arrs, name, idx):
             input_lon_raw = z["longitude"][input_idx]
             input_times_raw = z["time"][input_idx]
 
-            # FIX 2: Add pressure-based metadata for radiosondes/conventional obs
+            # Add pressure-based metadata for radiosondes/conventional obs
             if inst_name == 'radiosonde' and 'airPressure' in feat_keys:
                 # Check if pressure metadata features are requested
                 pressure_meta_features = []
@@ -627,13 +662,13 @@ def _get_feature(arrs, name, idx):
                     target_times_raw_list.append(np.array([], dtype=np.float32))
                 else:
                     target_features_raw_list.append(np.column_stack([_get_feature(z, k, target_idx) for k in feat_keys]).astype(np.float32))
-                    # FIX 2: Use only zarr-based metadata keys (filter out computed ones)
+                    # Use only zarr-based metadata keys (filter out computed ones)
                     target_metadata_raw_list.append(_stack_or_empty(z, zarr_meta_keys, target_idx))
                     target_lat_raw_list.append(z["latitude"][target_idx])
                     target_lon_raw_list.append(z["longitude"][target_idx])
                     target_times_raw_list.append(z["time"][target_idx])
 
-            # FIX 2: Add pressure-based metadata for targets (radiosondes)
+            # Add pressure-based metadata for targets (radiosondes)
             if inst_name == 'radiosonde' and 'airPressure' in feat_keys:
                 pressure_meta_requested = ('pressure_normalized' in meta_keys) or ('log_pressure_height' in meta_keys)
 
@@ -972,7 +1007,7 @@ def _apply_relational_qc():
             else:
                 # Conventional processing (surface_obs, radiosonde)
 
-                # FIX 3: Try level-specific normalization first (for radiosondes)
+                # Try level-specific normalization first (for radiosondes)
                 input_features_norm = None
                 if inst_name == 'radiosonde' and 'airPressure' in feat_keys:
                     # Extract pressure values for level grouping
@@ -984,7 +1019,7 @@ def _apply_relational_qc():
                     )
 
                     if input_features_norm is not None:
-                        print(f"  [{inst_name}] Using level-specific normalization (FIX 3)")
+                        print(f"  [{inst_name}] Using level-specific normalization")
 
                 # Fall back to global normalization if level-specific not available
                 if input_features_norm is None:
@@ -1034,7 +1069,7 @@ def _apply_relational_qc():
                         continue
 
                     # Target normalization with clipping (conventional style)
-                    # FIX 3: Try level-specific normalization for radiosondes
+                    # Try level-specific normalization for radiosondes
                     target_features_norm = None
                     if inst_name == 'radiosonde' and 'airPressure' in feat_keys:
                         # Extract pressure values for level grouping
diff --git a/gnn_model/run_gnn.sh b/gnn_model/run_gnn.sh
@@ -57,8 +57,8 @@ echo "Visible GPUs on this node:"
 nvidia-smi
 
 # Launch training (env is propagated to ranks)
-srun --export=ALL --kill-on-bad-exit=1 --cpu-bind=cores python train_gnn.py
+# srun --export=ALL --kill-on-bad-exit=1 --cpu-bind=cores python train_gnn.py
 
 # Resume training from the latest checkpoint
-# srun --export=ALL --kill-on-bad-exit=1 --cpu-bind=cores python train_gnn.py --resume_from_latest
+srun --export=ALL --kill-on-bad-exit=1 --cpu-bind=cores python train_gnn.py --resume_from_latest
 # srun --export=ALL --kill-on-bad-exit=1 --cpu-bind=cores python train_gnn.py --resume_from_checkpoint checkpoints/last.ckpt
diff --git a/gnn_model/train_gnn.py b/gnn_model/train_gnn.py