|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# Copyright 2024 Google LLC |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | + |
| 16 | +"""Core logic for GCSFuse performance benchmarking with FIO.""" |
| 17 | + |
| 18 | +import json |
| 19 | +import logging |
| 20 | +import os |
| 21 | +import shlex |
| 22 | +import subprocess |
| 23 | +import sys |
| 24 | +import time |
| 25 | + |
def run_command(command, check=True, cwd=None, extra_env=None):
    """Execute ``command`` as a subprocess and log what it printed.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell).
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
        cwd: Optional working directory for the child process.
        extra_env: Optional mapping layered on top of a copy of the current
            process environment.

    Returns:
        The ``subprocess.CompletedProcess`` with text-mode ``stdout`` and
        ``stderr`` captured.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the command
            exits non-zero; the captured output is logged before re-raising.
    """
    logging.info(f"Running command: {' '.join(command)}")

    # Child environment = current environment, overridden by extra_env.
    child_env = {**os.environ, **(extra_env or {})}

    try:
        completed = subprocess.run(
            command,
            check=check,
            capture_output=True,
            text=True,
            cwd=cwd,
            env=child_env,
        )
    except subprocess.CalledProcessError as e:
        logging.error(f"Command failed with exit code {e.returncode}")
        logging.error(f"STDOUT: {e.stdout.strip() if e.stdout else 'N/A'}")
        logging.error(f"STDERR: {e.stderr.strip() if e.stderr else 'N/A'}")
        raise

    if completed.stdout:
        logging.info(f"STDOUT: {completed.stdout.strip()}")
    if completed.stderr:
        # stderr of a successful command is logged at INFO, since some tools
        # write purely informational output there.
        logging.info(f"STDERR: {completed.stderr.strip()}")
    return completed
| 49 | + |
def mount_gcsfuse(gcsfuse_bin, flags, bucket_name, mount_point):
    """Mount ``bucket_name`` at ``mount_point`` with the given GCSFuse binary.

    Args:
        gcsfuse_bin: Path to the gcsfuse executable.
        flags: Command-line flags as a single string; split with
            ``shlex.split`` before being passed to the binary.
        bucket_name: Name of the GCS bucket to mount.
        mount_point: Directory to mount on; created if it does not exist.

    The process exits with status 1 if the directory is not a mount point
    after the command has run.
    """
    os.makedirs(mount_point, exist_ok=True)
    logging.info(f"Mounting gs://{bucket_name} to {mount_point}")

    mount_argv = [gcsfuse_bin, *shlex.split(flags), bucket_name, mount_point]
    run_command(mount_argv)

    # Short pause so the mount has time to register before it is checked.
    time.sleep(2)

    if os.path.ismount(mount_point):
        logging.info("Mount successful.")
        return

    logging.error("Mounting failed. Check GCSFuse logs (e.g., in /var/log/syslog).")
    sys.exit(1)
| 61 | + |
| 62 | + |
def unmount_gcsfuse(mount_point):
    """Unmount the GCSFuse file system mounted at ``mount_point``.

    ``fusermount -u`` is tried first. If that binary is missing or exits
    non-zero, a lazy ``umount -l`` is attempted after a short pause. The
    fallback is best-effort: a failure there is logged but not raised.

    Args:
        mount_point: Directory the file system is mounted on.
    """
    logging.info(f"Unmounting {mount_point}")
    try:
        run_command(["fusermount", "-u", mount_point])
    except (FileNotFoundError, subprocess.CalledProcessError):
        # The message names the command that is actually run (no sudo is
        # involved in the fallback).
        logging.warning("`fusermount -u` failed. Retrying with `umount -l`.")
        time.sleep(2)
        fallback = run_command(["umount", "-l", mount_point], check=False)
        if fallback.returncode != 0:
            # Surface the failure instead of discarding the exit status.
            logging.error(
                f"`umount -l` exited with code {fallback.returncode}; "
                f"{mount_point} may still be mounted."
            )
| 72 | + |
| 73 | + |
def run_fio_test(fio_config, mount_point, iteration, output_dir, fio_env=None):
    """Run one FIO iteration and write its JSON report into ``output_dir``.

    Args:
        fio_config: Path to the FIO job file.
        mount_point: Directory passed to FIO via ``--directory``.
        iteration: Iteration number; used in log messages and in the report
            file name ``fio_results_iter_<iteration>.json``.
        output_dir: Directory that receives the JSON report.
        fio_env: Optional extra environment variables for the FIO process.
    """
    logging.info(f"Starting FIO test iteration {iteration}...")

    report_path = os.path.join(output_dir, f"fio_results_iter_{iteration}.json")
    fio_argv = ["fio", fio_config, "--output-format=json"]
    fio_argv.append(f"--output={report_path}")
    fio_argv.append(f"--directory={mount_point}")

    run_command(fio_argv, extra_env=fio_env)
    logging.info(f"FIO test iteration {iteration} complete. Results: {report_path}")
| 84 | + |
| 85 | + |
def parse_fio_output(filename):
    """Parse a FIO JSON report and extract key metrics per job and operation.

    Args:
        filename: Path to a report written by FIO with
            ``--output-format=json``.

    Returns:
        A list with one dict per (job, operation) pair that reported non-zero
        bandwidth. Each dict has the keys ``job_name``, ``operation``
        (``"read"`` or ``"write"``), ``bw_mibps``, ``iops``, ``mean_lat_ms``
        and ``p99_lat_ms``. An empty list is returned (and an error logged)
        when the file is missing or is not valid JSON.
    """
    try:
        with open(filename, "r") as f:
            data = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        logging.error(f"Could not read or parse FIO output {filename}: {e}")
        return []

    results = []
    for job in data.get("jobs", []):
        job_name = job.get("jobname", "unnamed_job")
        for op in ("read", "write"):
            stats = job.get(op)
            if not stats:
                continue

            # An operation the job did not perform reports zero bandwidth;
            # skip it so the summary only lists real work.
            bw_kibps = stats.get("bw", 0)
            if bw_kibps == 0:
                continue
            # Bandwidth is in KiB/s, convert to MiB/s.
            bw_mibps = bw_kibps / 1024.0
            iops = stats.get("iops", 0)

            # Latency can be under 'lat_ns' (total) or 'clat_ns'
            # (completion). Prefer total latency for the mean.
            lat_stats = stats.get("lat_ns") or {}
            clat_stats = stats.get("clat_ns") or {}
            mean_source = lat_stats if "mean" in lat_stats else clat_stats
            # Convert from ns to ms.
            mean_lat_ms = mean_source.get("mean", 0) / 1_000_000.0

            # Percentiles are a sub-dict with string keys. fio 3.x JSON is
            # expected to publish them as 'percentile' under 'clat_ns' unless
            # lat_percentiles is enabled (NOTE(review): confirm against the
            # installed fio version); the plural spelling is still accepted.
            percentiles = {}
            for section in (lat_stats, clat_stats):
                percentiles = (
                    section.get("percentiles") or section.get("percentile") or {}
                )
                if percentiles:
                    break

            p99_key = next(
                (k for k in percentiles if k.startswith("99.00")), None
            )
            p99_lat_ms = (
                percentiles.get(p99_key, 0) / 1_000_000.0 if p99_key else 0
            )

            results.append({
                "job_name": job_name,
                "operation": op,
                "bw_mibps": bw_mibps,
                "iops": iops,
                "mean_lat_ms": mean_lat_ms,
                "p99_lat_ms": p99_lat_ms,
            })
    return results
| 130 | + |
| 131 | + |
def print_summary(all_results):
    """Print a fixed-width table covering every FIO iteration.

    Args:
        all_results: One entry per iteration, in order; each entry is the
            list of metric dicts for that iteration (keys ``job_name``,
            ``operation``, ``bw_mibps``, ``iops``, ``mean_lat_ms``,
            ``p99_lat_ms``). An empty entry is printed as a placeholder row.

    A warning is logged and nothing is printed when ``all_results`` is empty.
    """
    if not all_results:
        logging.warning("No results to summarize.")
        return

    logging.info("\n--- FIO Benchmark Summary ---")

    # (title, width) for each column; every cell is left-aligned.
    columns = (
        ("Iter", 5),
        ("Job Name", 20),
        ("Op", 6),
        ("Bandwidth (MiB/s)", 20),
        ("IOPS", 12),
        ("Mean Latency (ms)", 20),
        ("P99 Latency (ms)", 20),
    )
    header = " ".join(f"{title:<{width}}" for title, width in columns)
    rule = "-" * len(header)
    row_format = "{:<5} {:<20} {:<6} {:<20.2f} {:<12.2f} {:<20.4f} {:<20.4f}"

    print(header)
    print(rule)
    for iteration, rows in enumerate(all_results, start=1):
        if rows:
            for row in rows:
                print(row_format.format(
                    iteration,
                    row["job_name"],
                    row["operation"],
                    row["bw_mibps"],
                    row["iops"],
                    row["mean_lat_ms"],
                    row["p99_lat_ms"],
                ))
        else:
            print(f"{iteration:<5} No results for this iteration.")
    print(rule)
| 152 | + |
| 153 | + |
def run_benchmark(
    gcsfuse_flags,
    bucket_name,
    iterations,
    fio_config,
    work_dir,
    output_dir,
    fio_env=None,
    gcsfuse_bin="/gcsfuse/gcsfuse",
):
    """Run the full FIO benchmark suite against a GCSFuse mount.

    For each iteration: remove any stale report, flush and drop the page
    cache, mount the bucket, run FIO, parse the report, and unmount. A
    summary table is printed after all iterations have completed.

    Args:
        gcsfuse_flags: GCSFuse command-line flags as a single string.
        bucket_name: Name of the GCS bucket to mount.
        iterations: Number of mount/run/unmount cycles.
        fio_config: Path to the FIO job file.
        work_dir: Working directory; the mount point is
            ``<work_dir>/mount_point``.
        output_dir: Directory receiving ``fio_results_iter_<i>.json``.
        fio_env: Optional extra environment variables for FIO. ``DIR`` is
            set to the mount point by default and may be overridden here.
        gcsfuse_bin: Path to the gcsfuse executable. Defaults to the
            previously hard-coded location.

    Raises:
        subprocess.CalledProcessError: If dropping the page cache, mounting,
            or FIO exits non-zero. The mount is still cleaned up first.
    """
    os.makedirs(work_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    mount_point = os.path.join(work_dir, "mount_point")

    # Prepare environment for FIO; caller-supplied values take precedence.
    fio_run_env = {"DIR": mount_point}
    if fio_env:
        fio_run_env.update(fio_env)

    all_results = []

    for i in range(1, iterations + 1):
        logging.info(f"--- Starting Iteration {i}/{iterations} ---")
        output_filename = os.path.join(output_dir, f"fio_results_iter_{i}.json")
        # Remove a stale report so a failed run cannot be mistaken for a
        # fresh result.
        if os.path.exists(output_filename):
            os.remove(output_filename)
        try:
            logging.info("Clearing page cache...")
            # drop_caches only frees clean pages, so flush dirty data with
            # `sync` first.
            run_command(
                ["sh", "-c", "sync && echo 3 > /proc/sys/vm/drop_caches"]
            )

            mount_gcsfuse(gcsfuse_bin, gcsfuse_flags, bucket_name, mount_point)
            run_fio_test(
                fio_config, mount_point, i, output_dir, fio_env=fio_run_env
            )

            iteration_results = parse_fio_output(output_filename)
            all_results.append(iteration_results)
        finally:
            # Always leave the mount point clean, even when a step failed.
            if os.path.ismount(mount_point):
                unmount_gcsfuse(mount_point)
        logging.info(f"--- Finished Iteration {i}/{iterations} ---")

    print_summary(all_results)
0 commit comments