1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
1
16
import polars as pl
2
17
import time
3
18
import pandas as pd
8
23
import argparse
9
24
import sys
10
25
import re
26
+ import subprocess
11
27
12
28
13
29
def _generate_dummy_dataframe (num_rows : int ) -> pd .DataFrame :
@@ -17,6 +33,16 @@ def _generate_dummy_dataframe(num_rows: int) -> pd.DataFrame:
17
33
"float_col" : np .random .random (size = num_rows ),
18
34
"str_col" : np .random .choice (['alpha' , 'beta' , 'gamma' , 'delta' , 'epsilon' , 'zeta' , 'eta' , 'theta' ], size = num_rows )
19
35
})
36
+
37
def clear_kernel_cache_bash():
    """Best-effort drop of the Linux page cache through a root shell.

    Runs ``sync`` and then writes ``1`` to ``/proc/sys/vm/drop_caches`` via
    ``sudo``. ``sudo`` is invoked non-interactively, so the call fails fast
    instead of blocking on a password prompt when passwordless sudo is not
    configured. On success the function sleeps for one second before
    returning. On failure it prints a message and returns normally; it never
    raises for a failed or unavailable ``sudo``.
    """
    drop_cmd = [
        'sudo',
        '-n',  # non-interactive: error out rather than ask for a password
        'sh',
        '-c',
        # drop_caches does not free dirty pages, so flush them to disk first;
        # otherwise data that was just written can stay in the page cache.
        'sync; echo 1 > /proc/sys/vm/drop_caches',
    ]
    try:
        subprocess.run(
            drop_cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: sudo or the shell command failed (likely no
        # passwordless access). OSError: sudo could not be executed at all.
        # Either way, report the problem and carry on without a cleared cache.
        print(f"Failed to clear kernel cache: {e} ")
        return
    time.sleep(1)  # Wait for the caches to be cleared
20
46
21
47
22
48
def create_parquet_file_if_not_exists (file_path : str , target_size_bytes : int , chunk_rows : int = 1_000_000 ):
@@ -61,6 +87,9 @@ def create_parquet_file_if_not_exists(file_path: str, target_size_bytes: int, ch
61
87
62
88
current_size = os .path .getsize (file_path )
63
89
print (f"Wrote { total_rows :,} rows, current file size: { current_size / (1024 ** 2 ):.2f} MiB" )
90
+
91
+ # Clear the page cache so that the next read is not served from it.
92
+ clear_kernel_cache_bash ()
64
93
65
94
if current_size >= target_size_bytes :
66
95
break
0 commit comments