Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions user_tools/docs/user-tools-onprem.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@ Before running any command, you can set environment variables to specify configu
- `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`. Note that caching the resources locally has an impact on the total execution time of the command.
- `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory in which the RAPIDS-cli generates its output. The wrapper CLI arguments take precedence over this environment variable (`--local_folder` for Qualification).

### 3.1 Logging (Python wrapper)

- The Python wrapper writes its own logs to a file. By default, logs are written to
`~/.spark_rapids_tools/logs/<tool>_<timestamp>.log`.
- Override the log file location by setting the environment variable
`RAPIDS_USER_TOOLS_LOG_FILE` to a local file path, for example:

```bash
export RAPIDS_USER_TOOLS_LOG_FILE=/var/tmp/spark-rapids-tools.log
```

- The log file is opened in append mode, so subsequent runs that use the same path add to the existing file instead of overwriting it.
- The provided path must be a local filesystem path (remote URIs like `s3://`, `gs://` are not supported).
- If the parent directory does not exist, it will be created automatically.
- To increase console verbosity, set `RAPIDS_USER_TOOLS_LOG_DEBUG=true`.

## Qualification command

### Local deployment
Expand Down
8 changes: 8 additions & 0 deletions user_tools/src/spark_rapids_pytools/common/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ def get_rapids_tools_env(cls, k: str, def_val=None):
def set_rapids_tools_env(cls, k: str, val):
    """
    Store a value in the process environment under the full tools key for ``k``.

    :param k: short key; it is expanded through ``find_full_rapids_tools_env_key``.
    :param val: value to store; it is converted with ``str`` because environment
        values must be strings.
    """
    text_value = str(val)
    full_key = cls.find_full_rapids_tools_env_key(k)
    os.environ[full_key] = text_value

@classmethod
def get_or_set_rapids_tools_env(cls, k: str, default_val) -> str:
    """
    Return the tools environment value for ``k``, initializing it when absent.

    The value is read through ``get_rapids_tools_env``. When that yields ``None``
    or an empty string, ``default_val`` is stored through ``set_rapids_tools_env``
    and its string form is returned; otherwise the existing value is returned
    untouched.

    :param k: short key of the tools environment variable.
    :param default_val: value to store when the variable has no usable value.
    :return: the existing value, or ``str(default_val)`` when the default was applied.
    """
    existing = cls.get_rapids_tools_env(k)
    # Guard clause: anything other than None / empty string counts as already set.
    if existing is not None and not (isinstance(existing, str) and existing == ''):
        return existing
    cls.set_rapids_tools_env(k, default_val)
    return str(default_val)

@classmethod
def gen_str_header(cls, title: str, ruler='-', line_width: int = 40) -> str:
dash = ruler * line_width
Expand Down
56 changes: 48 additions & 8 deletions user_tools/src/spark_rapids_tools/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,52 @@ def stringify_path(fpath) -> str:
return os.path.abspath(expanded_path)


def validate_local_log_file_path(log_file_path: str) -> str:
    """
    Validate a log file location and return it as a normalized path string.

    Rules enforced:
    - Must not be empty.
    - Must be a local path (no URI schemes like s3://, gs://, abfss://, http://)
    - Must include a file name: a path ending with a path separator denotes a
      directory and is rejected.
    - If the parent path exists, it must be a directory (a missing parent is
      accepted so that it can be created later).
    - If the target exists, it must not be a directory

    :param log_file_path: the log file path to validate.
    :return: the path as normalized by ``stringify_path``.
    :raises ValueError: when any of the rules above is violated.
    """
    if not log_file_path:
        raise ValueError('Invalid LOG_FILE: path is empty')
    raw_path = str(log_file_path)
    if re.match(r'\w+://', raw_path):
        raise ValueError('Invalid LOG_FILE: remote URIs are not supported; use a local file path')

    # This check has to look at the raw input: normalization through
    # os.path.abspath drops a trailing separator, which would silently turn a
    # directory-style path such as "/tmp/logs/" into a file named "logs".
    separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
    if raw_path.endswith(separators):
        raise ValueError('Invalid LOG_FILE: must include a file name')

    normalized_path = stringify_path(log_file_path)
    path_obj = Path(normalized_path)

    # Kept as a safety net for normalized paths that have no final component.
    if path_obj.name == '':
        raise ValueError('Invalid LOG_FILE: must include a file name')

    parent_dir = path_obj.parent
    if parent_dir.exists() and not parent_dir.is_dir():
        raise ValueError(f'Invalid LOG_FILE: parent path is not a directory: {parent_dir}')

    if path_obj.is_dir():
        raise ValueError(f'Invalid LOG_FILE: points to a directory, not a file: {path_obj}')

    return str(path_obj)


def resolve_and_prepare_log_file(short_name: str, uuid: str, tools_home_dir: str):
    """
    Pick the log file for the current run and make sure its directory exists.

    The 'LOG_FILE' tools environment variable is used when it already holds a
    value; otherwise it is initialized to
    '<tools_home_dir>/logs/<short_name>_<uuid>.log'. The value is validated and
    normalized, the environment variable is rewritten when normalization changed
    it, and the parent directory is created through FSUtil.make_dirs.

    Returns the normalized log file path.
    """
    fallback_log_file = f'{tools_home_dir}/logs/{short_name}_{uuid}.log'

    # An existing non-empty value wins; the fallback is stored only when unset.
    env_log_file = Utils.get_or_set_rapids_tools_env('LOG_FILE', fallback_log_file)
    log_file = validate_local_log_file_path(env_log_file)

    # Keep the environment variable in sync with the normalized path.
    if log_file != env_log_file:
        Utils.set_rapids_tools_env('LOG_FILE', log_file)

    log_parent = Path(log_file).parent
    FSUtil.make_dirs(str(log_parent))
    return log_file


def is_http_file(value: Any) -> bool:
try:
TypeAdapter(AnyHttpUrl).validate_python(value)
Expand Down Expand Up @@ -195,15 +241,9 @@ def init_environment(short_name: str) -> str:
tools_home_dir = FSUtil.build_path(home_dir, '.spark_rapids_tools')
Utils.set_rapids_tools_env('HOME', tools_home_dir)

# Set the 'LOG_FILE' environment variable and create the log directory.
log_dir = f'{tools_home_dir}/logs'
log_file = f'{log_dir}/{short_name}_{uuid}.log'
Utils.set_rapids_tools_env('LOG_FILE', log_file)
FSUtil.make_dirs(log_dir)

# Print the log file location
log_file = resolve_and_prepare_log_file(short_name, uuid, tools_home_dir)
print(Utils.gen_report_sec_header('Application Logs'))
print(f'Location: {log_file}')
print(f'Location : {log_file}')
print('In case of any errors, please share the log file with the Spark RAPIDS team.\n')

return uuid
Expand Down