Skip to content

Commit 570e476

Browse files
committed
Add machine type and gcsfuse version in bigquery schema
1 parent 3114999 commit 570e476

File tree

3 files changed

+46
-13
lines changed

3 files changed

+46
-13
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
google-cloud-bigquery
2+
requests

perf-benchmarking-for-releases/run-benchmarks.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ gcloud compute instances create "${VM_NAME}" \
166166
--network-interface=network-tier=PREMIUM,nic-type=GVNIC \
167167
--scopes=https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/devstorage.read_write \
168168
--network-performance-configs=total-egress-bandwidth-tier=TIER_1 \
169-
--metadata GCSFUSE_VERSION="${GCSFUSE_VERSION}",GCS_BUCKET_WITH_FIO_TEST_DATA="${GCS_BUCKET_WITH_FIO_TEST_DATA}",RESULTS_BUCKET_NAME="${RESULTS_BUCKET_NAME}",LSSD_ENABLED="${LSSD_ENABLED}" \
169+
--metadata GCSFUSE_VERSION="${GCSFUSE_VERSION}",GCS_BUCKET_WITH_FIO_TEST_DATA="${GCS_BUCKET_WITH_FIO_TEST_DATA}",RESULTS_BUCKET_NAME="${RESULTS_BUCKET_NAME}",LSSD_ENABLED="${LSSD_ENABLED}",MACHINE_TYPE="${MACHINE_TYPE}" \
170170
--metadata-from-file=startup-script=starter-script.sh \
171171
${VM_LOCAL_SSD_ARGS}
172172
echo "VM created. Benchmarks will run on the VM."

perf-benchmarking-for-releases/upload-fio-output-to-bigquery.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,22 @@
1212

1313
args = parser.parse_args()
1414

15+
import requests
16+
17+
def fetch_metadata(attribute):
    """Fetch a custom metadata attribute from the GCE metadata server.

    Args:
        attribute: Name of the instance attribute to read (e.g. "MACHINE_TYPE").

    Returns:
        The attribute value as text, or the string "unknown" if the request
        fails for any network/HTTP reason.
    """
    url = f"http://metadata.google.internal/computeMetadata/v1/instance/attributes/{attribute}"
    # The metadata server rejects requests that lack this header.
    headers = {"Metadata-Flavor": "Google"}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
        return response.text
    # Catch only requests' failures (connection errors, timeouts, and the
    # HTTPError raised by raise_for_status) — a bare `except Exception` would
    # also hide genuine programming errors in this function.
    except requests.RequestException as e:
        # Best-effort: the upload should proceed even without this metadata.
        print(f"Failed to fetch metadata attribute '{attribute}': {e}")
        return "unknown"
27+
28+
machine_type = fetch_metadata("MACHINE_TYPE")
29+
gcsfuse_version = fetch_metadata("GCSFUSE_VERSION")
30+
1531
# Load the results file
1632
with open(args.result_file) as f:
1733
try:
@@ -36,14 +52,16 @@
3652
# Create table if it doesn't exist
3753
schema = [
3854
bigquery.SchemaField("job_name", "STRING"),
55+
bigquery.SchemaField("gcsfuse_version", "STRING"),
56+
bigquery.SchemaField("machine_type", "STRING"),
3957
bigquery.SchemaField("start_time", "TIMESTAMP"),
4058
bigquery.SchemaField("file_size", "STRING"),
4159
bigquery.SchemaField("block_size", "STRING"),
4260
bigquery.SchemaField("nrfiles", "INTEGER"),
4361
bigquery.SchemaField("read_bandwidth_MiBps", "FLOAT"),
4462
bigquery.SchemaField("write_bandwidth_MiBps", "FLOAT"),
4563
bigquery.SchemaField("IOPS", "FLOAT"),
46-
bigquery.SchemaField("duration_seconds", "FLOAT"),
64+
bigquery.SchemaField("avg_latency_ms", "FLOAT"),
4765
]
4866

4967
try:
@@ -59,31 +77,45 @@
5977
rows = []
6078
for job in data.get("jobs", []):
6179
jobname = job.get("jobname")
62-
# Correctly access job options using .get() for nested keys
6380
job_options = job.get("job options", {})
6481

65-
# Use get with a default value for each option and handle string conversion
66-
file_size = job_options.get("filesize", data.get("global options",{}).get("filesize", "unknown"))
67-
block_size = job_options.get("bs", data.get("global options",{}).get("bs", "unknown"))
68-
69-
# Convert nrfiles to int, handle missing values and potential string values
70-
nrfiles_str = job_options.get("nrfiles", data.get("global options",{}).get("nrfiles"))
82+
file_size = job_options.get("filesize", data.get("global options", {}).get("filesize", "unknown"))
83+
block_size = job_options.get("bs", data.get("global options", {}).get("bs", "unknown"))
84+
85+
nrfiles_str = job_options.get("nrfiles", data.get("global options", {}).get("nrfiles"))
7186
nrfiles = int(nrfiles_str) if nrfiles_str and isinstance(nrfiles_str, str) and nrfiles_str.isdigit() else 0
7287

73-
read_bw = job.get("read", {}).get("bw_bytes", 0) / (1024 * 1024)
74-
write_bw = job.get("write", {}).get("bw_bytes", 0) / (1024 * 1024)
75-
iops = job.get("read", {}).get("iops", 0.0) + job.get("write", {}).get("iops", 0.0)
88+
read = job.get("read", {})
89+
write = job.get("write", {})
90+
91+
read_bw = read.get("bw_bytes", 0) / (1024 * 1024)
92+
write_bw = write.get("bw_bytes", 0) / (1024 * 1024)
93+
iops = read.get("iops", 0.0) + write.get("iops", 0.0)
94+
95+
read_lat_ns = read.get("lat_ns", {}).get("mean")
96+
write_lat_ns = write.get("lat_ns", {}).get("mean")
97+
98+
if read_lat_ns is not None and write_lat_ns is not None:
99+
avg_latency_ms = ((read_lat_ns + write_lat_ns) / 2) / 1_000_000
100+
elif read_lat_ns is not None:
101+
avg_latency_ms = read_lat_ns / 1_000_000
102+
elif write_lat_ns is not None:
103+
avg_latency_ms = write_lat_ns / 1_000_000
104+
else:
105+
avg_latency_ms = 0.0
76106

77107
rows.append({
78108
"job_name": jobname,
109+
"gcsfuse_version": gcsfuse_version,
110+
"machine_type": machine_type,
79111
"start_time": start_time,
80112
"file_size": file_size,
81113
"block_size": block_size,
82114
"nrfiles": nrfiles,
83115
"read_bandwidth_MiBps": read_bw,
84116
"write_bandwidth_MiBps": write_bw,
85117
"IOPS": iops,
86-
"duration_seconds": job.get("job_runtime", 0) / 1000,
118+
"avg_latency_ms": avg_latency_ms,
87119
})
88120

89121
# Insert rows

0 commit comments

Comments
 (0)