Skip to content

Commit 20b385f

Browse files
committed
Review comments.
1 parent addaa82 commit 20b385f

File tree

1 file changed

+29
-0
lines changed

1 file changed

+29
-0
lines changed

ml_samples/parquet_read/load_parquet.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2024 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
116
import polars as pl
217
import time
318
import pandas as pd
@@ -8,6 +23,7 @@
823
import argparse
924
import sys
1025
import re
26+
import subprocess
1127

1228

1329
def _generate_dummy_dataframe(num_rows: int) -> pd.DataFrame:
@@ -17,6 +33,16 @@ def _generate_dummy_dataframe(num_rows: int) -> pd.DataFrame:
1733
"float_col": np.random.random(size=num_rows),
1834
"str_col": np.random.choice(['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta'], size=num_rows)
1935
})
36+
37+
def clear_kernel_cache_bash():
    """Best-effort drop of the Linux page cache using passwordless ``sudo``.

    Runs ``sync`` followed by ``echo 1 > /proc/sys/vm/drop_caches`` in a root
    shell. The ``sync`` matters because the kernel only drops clean pages;
    freshly written data stays cached until it has been flushed to disk.

    The call never prompts for a password and never raises: if ``sudo`` is
    missing, requires a password, or the command fails, the error is printed
    and the caller continues with a possibly warm cache.

    Returns:
        True if the cache-drop command succeeded, False otherwise.
    """
    command = [
        'sudo', '-n',  # -n: fail immediately instead of prompting for a password
        'sh', '-c', 'sync && echo 1 > /proc/sys/vm/drop_caches',
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: sudo refused or the write failed.
        # OSError: sudo/sh is not installed or not executable.
        # Clearing the cache is best-effort, so report the failure and carry on.
        print(f"Failed to clear kernel cache: {e}")
        return False

    time.sleep(1)  # Wait for the caches to be cleared
    return True
2046

2147

2248
def create_parquet_file_if_not_exists(file_path: str, target_size_bytes: int, chunk_rows: int = 1_000_000):
@@ -61,6 +87,9 @@ def create_parquet_file_if_not_exists(file_path: str, target_size_bytes: int, ch
6187

6288
current_size = os.path.getsize(file_path)
6389
print(f"Wrote {total_rows:,} rows, current file size: {current_size / (1024**2):.2f} MiB")
90+
91+
# Clear the page cache so that the next read is not served from it.
92+
clear_kernel_cache_bash()
6493

6594
if current_size >= target_size_bytes:
6695
break

0 commit comments

Comments
 (0)