From b5fd76ea5159b584f1c3300de15f77d4a85e43c4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Mon, 30 Dec 2024 13:36:51 -0500 Subject: [PATCH] fully removed the old open_and_save, save_file, and link_result --- benchmark/job/cli.py | 7 +- env/pg_conn.py | 2 +- util/pg.py | 2 +- util/workspace.py | 158 ------------------------------------------- 4 files changed, 3 insertions(+), 166 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index c467e823..136c0b19 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -11,12 +11,7 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -from util.workspace import ( - DBGymWorkspace, - fully_resolve_path, - is_fully_resolved, - link_result, -) +from util.workspace import DBGymWorkspace, fully_resolve_path JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" diff --git a/env/pg_conn.py b/env/pg_conn.py index bbeee54d..540e6134 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -24,7 +24,7 @@ from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr -from util.workspace import DBGymWorkspace, open_and_save, parent_path_of_path +from util.workspace import DBGymWorkspace, parent_path_of_path CONNECT_TIMEOUT = 300 diff --git a/util/pg.py b/util/pg.py index a959e11b..23c06b60 100644 --- a/util/pg.py +++ b/util/pg.py @@ -11,7 +11,7 @@ import sqlalchemy from sqlalchemy import create_engine, text -from util.workspace import DBGymWorkspace, open_and_save +from util.workspace import DBGymWorkspace DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" diff --git a/util/workspace.py b/util/workspace.py index 0db0beb6..286488ec 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -524,42 +524,6 @@ def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) - ) -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def open_and_save( - dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r" -) -> IO[Any]: - """ - Open a file and "save" it to [workspace]/task_runs/run_*/. - It takes in a str | Path to match the interface of open(). - This file does not work if open_path is a symlink, to make its interface identical to that of open(). - Make sure to resolve all symlinks with fully_resolve_path(). - To avoid confusion, I'm enforcing this function to only work with absolute paths. - # TODO: maybe make it work on non-fully-resolved paths to better match open() - See the comment of save_file() for what "saving" means - If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). - This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, - and when you write results you're usually opening open_paths which do not exist. - """ - # validate open_path - assert isinstance(open_path, Path) - assert is_fully_resolved( - open_path - ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" - assert not os.path.islink( - open_path - ), f"open_path ({open_path}) should not be a symlink" - assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" - # open_and_save *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save - # a file though. we sometimes save a directory (see save_file() for details) - assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" - - # save - save_file(dbgym_workspace, open_path) - - # open - return open(open_path, mode=mode) - - def extract_from_task_run_path( dbgym_workspace: DBGymWorkspace, task_run_path: Path ) -> tuple[Path, str, Path, str]: @@ -593,128 +557,6 @@ def extract_from_task_run_path( return codebase_path, codebase_dname, org_path, org_dname -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None: - """ - If an external function takes in a file/directory as input, you will not be able to call open_and_save(). - In these situations, just call save_file(). - Like open_and_save(), this function only works with real absolute paths. - "Saving" can mean either copying the file or creating a symlink to it - We copy the file if it is a "config", meaning it just exists without having been generated - We create a symlink if it is a "dependency", meaning a task.py command was run to generate it - In these cases we create a symlink so we have full provenance for how the dependency was created - """ - # validate path - assert is_fully_resolved(path), f"path ({path}) should be a fully resolved path" - assert os.path.isfile(path), f"path ({path}) should be a file" - assert not is_child_path( - path, dbgym_workspace.dbgym_this_run_path - ), f"path ({path}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" - - # save _something_ to dbgym_this_run_path - # save a symlink if the opened file was generated by a run. this is for two reasons: - # 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe - # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them - if is_child_path(path, dbgym_workspace.dbgym_runs_path): - # get paths we'll need later. - _, codebase_dname, org_path, org_dname = extract_from_task_run_path( - dbgym_workspace, path - ) - this_run_save_path = ( - dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname - ) - os.makedirs(this_run_save_path, exist_ok=True) - - # if the path file is directly in org_path, we symlink the file directly - parent_path = parent_path_of_path(path) - if parent_path.samefile(org_path): - fname = basename_of_path(path) - symlink_path = this_run_save_path / (fname + ".link") - try_create_symlink(path, symlink_path) - # else, we know the path file is _not_ directly inside org_path dir - # we go as far back as we can while still staying in org_path and symlink that "base" dir - # this is because lots of runs create dirs within org_path and it's just a waste of space to symlink every individual file - else: - # set base_path such that its parent is org_path - base_path = parent_path - while not parent_path_of_path(base_path).samefile(org_path): - base_path = parent_path_of_path(base_path) - - # create symlink - open_base_dname = basename_of_path(base_path) - symlink_path = this_run_save_path / (open_base_dname + ".link") - try_create_symlink(base_path, symlink_path) - # if it wasn't generated by a run - else: - # since we don't know where the file is at all, the location is "unknown" and the org is "all" - this_run_save_path = dbgym_workspace.dbgym_this_run_path / "unknown" / "all" - os.makedirs(this_run_save_path, exist_ok=True) - fname = basename_of_path(path) - # in this case, we want to copy instead of symlinking since it might disappear in the future - copy_path = this_run_save_path / fname - shutil.copy(path, copy_path) - - -# TODO(phw2): deprecate this once I'm done with unittest_workspace.py -def link_result( - dbgym_workspace: DBGymWorkspace, - result_path: Path, - custom_result_name: Optional[str] = None, -) -> Path: - """ - result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. - Further, result_path must have been generated by this invocation to task.py. This also means that - result_path itself can be a file or a dir but not a symlink. - Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside - symlinks/[codebase]/[org]/. - Will override the old symlink if there is one, so that symlinks/ always contains the latest generated - version of a file. - This function will return the path to the symlink that was created. - """ - assert isinstance(result_path, Path) - assert is_fully_resolved( - result_path - ), f"result_path ({result_path}) should be a fully resolved path" - assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path) - assert not os.path.islink(result_path) - - if type(custom_result_name) is str: - result_name = custom_result_name - else: - if os.path.isfile(result_path): - result_name = basename_of_path(result_path) + ".link" - elif os.path.isdir(result_path): - result_name = basename_of_path(result_path) + ".link" - else: - raise AssertionError("result_path must be either a file or dir") - - # Figure out the parent directory path of the symlink - codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path( - dbgym_workspace, result_path - ) - # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() - assert codebase_path.samefile( - dbgym_workspace.cur_task_runs_path() - ), f"link_result should only be called on files generated by this invocation to task.py" - symlink_parent_path = ( - dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname - ) - symlink_parent_path.mkdir(parents=True, exist_ok=True) - - # Remove the old symlink ("old" meaning created in an earlier run) if there is one - # Note that in a multi-threaded setting, this might remove one created by a process in the same run, - # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink - # file of the current run regardless of the order of threads. - assert result_name.endswith(".link") and not result_name.endswith( - ".link.link" - ), f'result_name ({result_name}) should end with ".link"' - symlink_path = symlink_parent_path / result_name - try_remove_file(symlink_path) - try_create_symlink(result_path, symlink_path) - - return symlink_path - - def try_create_symlink(src_path: Path, dst_path: Path) -> None: """ Our functions that create symlinks might be called by multiple processes at once