Applied linting and removed leftover merge-conflict markers (kept the HEAD side)

sbak5 · sbak5 · commit 446841ad2ada · 2025-11-18T13:51:05.000-08:00
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/README.md b/src/nvidia_resiliency_ext/attribution/mcp_integration/README.md
@@ -96,12 +96,7 @@ order = global_registry.get_execution_order(["log_analyzer", "fr_analyzer", "com
 
 **Tool Types**:
 1. **Module tools**: One per registered module (`log_analyzer`, `fr_analyzer`, etc.)
-<<<<<<< HEAD
 2. **Utility tools**: `status`, `get_result`
-=======
-2. **Pipeline tool**: `run_pipeline` for multi-module execution
-3. **Utility tools**: `status`, `get_result`
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 
 **Resource Pattern**:
 ```
@@ -118,11 +113,7 @@ Example: attribution://log_analyzer/f47ac10b-58cc-4372-a567-0e02b2c3d479
 #### NVRxMCPClient
 - Connects to a single MCP server
 - Async context manager pattern
-<<<<<<< HEAD
 - Methods: `run_module()`, `get_result()`
-=======
-- Methods: `run_module()`, `run_pipeline()`, `get_result()`
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 
 #### MultiServerClient
 - Manages multiple MCP servers
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_client.py b/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_client.py
@@ -16,13 +16,7 @@
 from mcp.client.session import ClientSession
 from mcp.client.stdio import StdioServerParameters, stdio_client
 
-<<<<<<< HEAD
 from nvidia_resiliency_ext.attribution.mcp_integration.registry import deserialize_result
-=======
-from nvidia_resiliency_ext.attribution.mcp_integration.registry import (
-    deserialize_result,
-)
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 
 logger = logging.getLogger(__name__)
 
@@ -145,11 +139,7 @@ async def run_module(self, module_name: str, **kwargs) -> Dict[str, Any]:
         result_str = await self.call_tool(module_name, arguments)
         return deserialize_result(result_str)
 
-<<<<<<< HEAD
     async def get_result(self, result_id: str) -> Dict[str, Any]:
-=======
-    async def get_result(self,  result_id: str) -> Dict[str, Any]:
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         """
         Retrieve a cached result by ID.
 
@@ -213,11 +203,7 @@ class MultiServerClient:
 
     def __init__(self):
         """Initialize the multi-server client."""
-<<<<<<< HEAD
         self.servers: Dict[str, NVRxMCPClient] = {}
-=======
-        self.servers: Dict[str, ClientSession] = {}
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         self.module_to_server: Dict[str, str] = {}
 
     def add_server(self, server_name: str, server_command: List[str]):
@@ -228,11 +214,7 @@ def add_server(self, server_name: str, server_command: List[str]):
             server_name: Name for the server
             server_command: Command to start the server
         """
-<<<<<<< HEAD
         self.servers[server_name] = NVRxMCPClient(server_command)
-=======
-        self.servers[server_name] = ClientSession(server_command)
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 
     async def connect_all(self):
         """Connect to all registered servers."""
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_server.py b/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_server.py
@@ -9,10 +9,6 @@
 import asyncio
 import json
 import logging
-<<<<<<< HEAD
-=======
-import uuid
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 from typing import Any, Dict, List, Optional
 
 from mcp.server import Server
@@ -202,11 +198,6 @@ async def _handle_module_execution(
         """Execute a single attribution module."""
         # Apply default values from input schema
         arguments_with_defaults = self.registry.apply_defaults(module_name, arguments)
-<<<<<<< HEAD
-
-=======
-        
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         # Get or create module instance
         if module_name not in self.module_instances:
             # Convert arguments to argparse.Namespace
@@ -242,19 +233,11 @@ async def _handle_module_execution(
 
         return [TextContent(type="text", text=serialize_result(response))]
 
-<<<<<<< HEAD
-=======
-
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
     async def run(self):
         """Run the MCP server."""
         import os
 
-<<<<<<< HEAD
         logger.info("Starting NVRX Attribution MCP Server")
-=======
-        logger.info(f"Starting NVRX Attribution MCP Server")
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         logger.info(f"Registered modules: {self.registry.list_modules()}, pid: {os.getpid()}")
 
         async with stdio_server() as (read_stream, write_stream):
@@ -264,8 +247,4 @@ async def run(self):
 
     def run_sync(self):
         """Run the server synchronously."""
-<<<<<<< HEAD
-        asyncio.run(self.run())
-=======
         asyncio.run(self.run())
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/module_definitions.py b/src/nvidia_resiliency_ext/attribution/mcp_integration/module_definitions.py
@@ -9,10 +9,6 @@
 from nvidia_resiliency_ext.attribution.mcp_integration.registry import global_registry
 from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer
 
-<<<<<<< HEAD
-
-=======
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 def register_all_modules():
     """Register all NVRX attribution modules with the global registry."""
 
@@ -75,10 +71,6 @@ def register_all_modules():
         dependencies=[],
     )
 
-<<<<<<< HEAD
-
-=======
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 def create_args_from_dict(module_name: str, config: dict) -> argparse.Namespace:
     """
     Create an argparse.Namespace from a configuration dictionary.
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/registry.py b/src/nvidia_resiliency_ext/attribution/mcp_integration/registry.py
@@ -4,18 +4,10 @@
 """
 
 import hashlib
-<<<<<<< HEAD
 import json
 import logging
 from dataclasses import asdict, dataclass, is_dataclass
 from typing import Any, Dict, List, Optional, Type
-=======
-import inspect
-import json
-import logging
-from dataclasses import asdict, dataclass, is_dataclass
-from typing import Any, Callable, Dict, List, Optional, Type
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
 
 from nvidia_resiliency_ext.attribution.base import NVRxAttribution
 
@@ -85,16 +77,13 @@ def register(
         )
         self._modules[name] = metadata
 
-<<<<<<< HEAD
     def unregister(self, name: str):
         """Unregister a module."""
         if name in self._modules:
             del self._modules[name]
         else:
             raise ValueError(f"Module '{name}' not registered")
 
-=======
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
     def get_module_metadata(self, name: str) -> Optional[ModuleMetadata]:
         """Get metadata for a registered module."""
         return self._modules.get(name)
@@ -127,7 +116,6 @@ def apply_defaults(self, module_name: str, arguments: Dict[str, Any]) -> Dict[st
         metadata = self._modules.get(module_name)
         if not metadata:
             return arguments
-<<<<<<< HEAD
 
         # Create a copy to avoid modifying the original
         result = dict(arguments)
@@ -136,26 +124,11 @@ def apply_defaults(self, module_name: str, arguments: Dict[str, Any]) -> Dict[st
         input_schema = metadata.input_schema
         properties = input_schema.get("properties", {})
 
-=======
-        
-        # Create a copy to avoid modifying the original
-        result = dict(arguments)
-        
-        # Get the properties from the input schema
-        input_schema = metadata.input_schema
-        properties = input_schema.get("properties", {})
-        
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         # Apply defaults for missing arguments
         for param_name, param_schema in properties.items():
             if param_name not in result and "default" in param_schema:
                 result[param_name] = param_schema["default"]
                 logger.debug(f"Applied default for {param_name}: {param_schema['default']}")
-<<<<<<< HEAD
-
-=======
-        
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         return result
 
     def cache_result(self, module_name: str, arguments: Dict[str, Any], result: Any):
diff --git a/src/nvidia_resiliency_ext/attribution/mcp_integration/server_launcher.py b/src/nvidia_resiliency_ext/attribution/mcp_integration/server_launcher.py
@@ -57,13 +57,8 @@ def main():
         all_modules = global_registry.list_modules()
         for module in list(all_modules):
             if module not in args.modules:
-<<<<<<< HEAD
                 global_registry.unregister(module)
                 logger.info(f"Unregistered module: {module}")
-=======
-                # Remove from registry (simplified - in production, use proper filtering)
-                logger.info(f"Skipping module: {module}")
->>>>>>> bfd729b (Add MCP integration and changes in `attribution` modules to run with MCP)
         logger.info(f"Enabled modules: {args.modules}")
     else:
         logger.info(f"Enabled modules: {global_registry.list_modules()}")
diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
@@ -14,7 +14,7 @@
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Tuple 
+from typing import Dict, List, Tuple
 
 from nvidia_resiliency_ext.attribution.base import AttributionState, NVRxAttribution
 from nvidia_resiliency_ext.attribution.utils import capture_logs
@@ -390,20 +390,22 @@ def group_collectives_by_windows(self):
             )
             already_participated = pg_window_participants[pg_window_key] & ranks_with_current_pg
             previous_participants = pg_window_participants[pg_window_key]
-            
+
             has_previous_participants = len(previous_participants) > 0
             has_significant_new_ranks = len(ranks_with_current_pg - previous_participants) >= 2
-            
+
             # Create new window if:
             # 1. Some ranks have already participated (same ranks coming back), OR
             # 2. We have previous participants and mostly/completely new ranks (different batch)
             should_create_new_window = False
-            
+
             if current_pg not in pgs_with_active_ranks_last_iter:
                 # PG was inactive - check if we need a new window
-                if already_participated or (has_previous_participants and has_significant_new_ranks):
+                if already_participated or (
+                    has_previous_participants and has_significant_new_ranks
+                ):
                     should_create_new_window = True
-            
+
             if should_create_new_window:
                 # We're starting a new window/phase
                 pg_window_counter[current_pg] += 1
@@ -554,8 +556,8 @@ def matching_collectives_per_process_group(collective_group):
                     if c.state != 'scheduled':
                         continue
                     rank_counts['appeared'].append(c.file_id)
-#                    if get_correct_seq_id(c) <= max_completed_collective_seq_id:
-#                        rank_counts['mismatched'].append(c.file_id)
+                #                    if get_correct_seq_id(c) <= max_completed_collective_seq_id:
+                #                        rank_counts['mismatched'].append(c.file_id)
                 appeared_rank_counts = Counter(rank_counts['appeared'])
                 # Ranks with less number of enqueued collectives than max_enqueued_collective_seq_id -> host not making expected progress
                 for rank_id in self.pg_configs[process_group]['ranks']:
@@ -717,24 +719,34 @@ def get_correct_seq_id(collective):
             for key, collective_group in self.collective_groups.items():
                 logger.debug(f"key: {key}, collective_group: {collective_group}")
                 matching_collectives_per_process_group((key, collective_group))
-            
+
             # Cross-window matching: if the same PG has missing ranks in different windows,
             # try to match them across windows
-            pg_all_windows = defaultdict(list)  # pg_id -> list of (window_idx, identified_ranks, missing_ranks)
-            
+            pg_all_windows = defaultdict(
+                list
+            )  # pg_id -> list of (window_idx, identified_ranks, missing_ranks)
+
             for pg_id, entries in missing_pg.items():
                 for entry in entries:
                     # entry format: (pg_id, pg_desc, op_type, size, dtype, total_nranks, identified_ranks, missing_ranks)
                     pg_desc = entry[1]  # e.g., "default_pg,0" or "default_pg,1"
                     identified_ranks_str = entry[6]
                     missing_ranks_str = entry[7]
-                    
-                    identified_ranks = set(map(int, identified_ranks_str.split(','))) if identified_ranks_str else set()
-                    missing_ranks = set(map(int, missing_ranks_str.split(','))) if missing_ranks_str else set()
-                    
+
+                    identified_ranks = (
+                        set(map(int, identified_ranks_str.split(',')))
+                        if identified_ranks_str
+                        else set()
+                    )
+                    missing_ranks = (
+                        set(map(int, missing_ranks_str.split(','))) if missing_ranks_str else set()
+                    )
+
                     window_idx = int(pg_desc.split(',')[-1]) if ',' in pg_desc else 0
-                    pg_all_windows[pg_id].append((window_idx, identified_ranks, missing_ranks, entry))
-            
+                    pg_all_windows[pg_id].append(
+                        (window_idx, identified_ranks, missing_ranks, entry)
+                    )
+
             # For each PG with multiple windows, try to match missing ranks across windows
             merged_missing_pg = defaultdict(list)
             for pg_id, windows_data in pg_all_windows.items():
@@ -743,19 +755,19 @@ def get_correct_seq_id(collective):
                     for _, _, _, entry in windows_data:
                         merged_missing_pg[pg_id].append(entry)
                     continue
-                
+
                 # Multiple windows for this PG - try to match across windows
                 all_identified = set()
                 all_missing = set()
                 representative_entry = windows_data[0][3]  # Use first window's entry as template
-                
+
                 for window_idx, identified, missing, entry in windows_data:
                     all_identified.update(identified)
                     all_missing.update(missing)
-                
+
                 # Ranks that are identified in at least one window should not be considered missing
                 truly_missing = all_missing - all_identified
-                
+
                 if truly_missing:
                     # Create merged entry with truly missing ranks
                     merged_entry = list(representative_entry)
@@ -767,7 +779,7 @@ def get_correct_seq_id(collective):
                     # No truly missing ranks after cross-window matching
                     # Don't add to merged_missing_pg (it's complete now)
                     pass
-            
+
             return completed_pg, merged_missing_pg
 
         completed_pg, missing_pg = match_collectives()
@@ -944,7 +956,6 @@ def find_valid_paths(graph, start_node, visited):
         logger.debug(f"unique_paths: {unique_paths}")
         return grouped_pgs
 
-
     def process_file(self, filepath: str):
         """
         Process a single file to extract collective operations and other metadata