Skip to content

Commit

Permalink
fully removed the old open_and_save, save_file, and link_result
Browse files Browse the repository at this point in the history
  • Loading branch information
wangpatrick57 committed Dec 30, 2024
1 parent 1dcf669 commit b5fd76e
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 166 deletions.
7 changes: 1 addition & 6 deletions benchmark/job/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,7 @@
from benchmark.constants import DEFAULT_SCALE_FACTOR
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run
from util.workspace import (
DBGymWorkspace,
fully_resolve_path,
is_fully_resolved,
link_result,
)
from util.workspace import DBGymWorkspace, fully_resolve_path

JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
Expand Down
2 changes: 1 addition & 1 deletion env/pg_conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from util.log import DBGYM_LOGGER_NAME
from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr
from util.workspace import DBGymWorkspace, open_and_save, parent_path_of_path
from util.workspace import DBGymWorkspace, parent_path_of_path

CONNECT_TIMEOUT = 300

Expand Down
2 changes: 1 addition & 1 deletion util/pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sqlalchemy
from sqlalchemy import create_engine, text

from util.workspace import DBGymWorkspace, open_and_save
from util.workspace import DBGymWorkspace

DBGYM_POSTGRES_USER = "dbgym_user"
DBGYM_POSTGRES_PASS = "dbgym_pass"
Expand Down
158 changes: 0 additions & 158 deletions util/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,42 +524,6 @@ def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -
)


# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
def open_and_save(
    dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r"
) -> IO[Any]:
    """
    Record an existing input file in the current run via save_file(), then open it.

    open_path must be a Path that is fully resolved, is not a symlink, exists, and refers to a
    regular file; every one of these is enforced with an assertion before anything else happens.
    Use fully_resolve_path() beforehand to get rid of symlinks and relative components.
    What "recording" (saving) means is decided entirely by save_file(); see its docstring.

    Do not use this for files you are about to create as a "result" of the run: the existence
    check fails for paths that do not exist yet. Use the builtin open() for those.

    The returned handle comes straight from open(open_path, mode=mode), so the caller is
    responsible for closing it.
    """
    # The checks run in a fixed order so that the first violated precondition is the one reported.
    assert isinstance(open_path, Path)
    fully_resolved = is_fully_resolved(open_path)
    assert (
        fully_resolved
    ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path"
    is_link = os.path.islink(open_path)
    assert not is_link, f"open_path ({open_path}) should not be a symlink"
    assert os.path.exists(open_path), f"open_path ({open_path}) does not exist"
    # Opening a directory makes no sense, so only regular files are accepted here. What ends up
    # being saved for the file is save_file()'s decision.
    assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file"

    # Save first, then hand back the open file object.
    save_file(dbgym_workspace, open_path)
    file_handle = open(open_path, mode=mode)
    return file_handle


def extract_from_task_run_path(
dbgym_workspace: DBGymWorkspace, task_run_path: Path
) -> tuple[Path, str, Path, str]:
Expand Down Expand Up @@ -593,128 +557,6 @@ def extract_from_task_run_path(
return codebase_path, codebase_dname, org_path, org_dname


# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
def save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None:
    """
    Record an input file inside dbgym_workspace.dbgym_this_run_path.

    Call this directly when an external function consumes the file and open_and_save() therefore
    cannot be used. path must be a fully resolved path to a regular file that was not generated
    by the current run.

    How the file is recorded depends on where it lives:
     - Under dbgym_workspace.dbgym_runs_path (a "dependency" produced by some run): a symlink
       named "<name>.link" is created in [this run]/[codebase]/[org]/. It points at the file
       itself when the file sits directly in the org dir, otherwise at the top-most directory
       below the org dir that contains the file.
     - Anywhere else (a "config"): the file is copied into [this run]/unknown/all/.
    """
    # Validate the input before touching the filesystem.
    assert is_fully_resolved(path), f"path ({path}) should be a fully resolved path"
    assert os.path.isfile(path), f"path ({path}) should be a file"
    generated_by_this_run = is_child_path(path, dbgym_workspace.dbgym_this_run_path)
    assert (
        not generated_by_this_run
    ), f"path ({path}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it"

    if not is_child_path(path, dbgym_workspace.dbgym_runs_path):
        # Not produced by a run. We know nothing about where it came from, so the location is
        # "unknown" and the org is "all". Copy rather than link because the source might
        # disappear in the future.
        copy_dir = dbgym_workspace.dbgym_this_run_path / "unknown" / "all"
        os.makedirs(copy_dir, exist_ok=True)
        shutil.copy(path, copy_dir / basename_of_path(path))
        return

    # Produced by a run. Link instead of copying because:
    #   1. run outputs are supposed to be immutable, so a symlink is safe
    #   2. run outputs may be very large (up to 100s of GBs), so copying is wasteful
    _, codebase_dname, org_path, org_dname = extract_from_task_run_path(
        dbgym_workspace, path
    )
    link_dir = dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname
    os.makedirs(link_dir, exist_ok=True)

    # Walk upwards from the file until we reach the entry whose parent is org_path. If the file
    # is directly inside org_path that entry is the file itself; otherwise it is the "base"
    # directory, which we link as a whole because runs often create many files in such dirs and
    # linking each one individually is a waste of space.
    link_target = path
    target_parent = parent_path_of_path(link_target)
    while not target_parent.samefile(org_path):
        link_target = target_parent
        target_parent = parent_path_of_path(link_target)

    link_name = basename_of_path(link_target) + ".link"
    try_create_symlink(link_target, link_dir / link_name)


# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
def link_result(
    dbgym_workspace: DBGymWorkspace,
    result_path: Path,
    custom_result_name: Optional[str] = None,
) -> Path:
    """
    Create a symlink to a "result" of the current run under dbgym_workspace.dbgym_symlinks_path.

    result_path must be a fully resolved Path to a file or directory (not a symlink) located
    inside dbgym_workspace.dbgym_this_run_path, and its codebase path must be the one returned by
    dbgym_workspace.cur_task_runs_path(), i.e. it was generated by this invocation of task.py.

    The symlink is placed in symlinks/[codebase]/[org]/. Its name is custom_result_name when a
    string is given, otherwise the basename of result_path plus ".link". Either way the name must
    end with ".link" and must not end with ".link.link".

    Any existing symlink with the same name is removed first, so symlinks/ always points at the
    most recently generated version.

    Returns the path of the symlink that was created.
    Raises AssertionError if any of the preconditions above is violated.
    """
    assert isinstance(result_path, Path)
    assert is_fully_resolved(
        result_path
    ), f"result_path ({result_path}) should be a fully resolved path"
    assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path)
    assert not os.path.islink(result_path)

    # isinstance (rather than an exact type comparison) so that str subclasses are honored as
    # custom names instead of being silently ignored.
    if isinstance(custom_result_name, str):
        result_name = custom_result_name
    elif os.path.isfile(result_path) or os.path.isdir(result_path):
        # Files and directories are named the same way.
        result_name = basename_of_path(result_path) + ".link"
    else:
        raise AssertionError("result_path must be either a file or dir")

    # Figure out the parent directory path of the symlink
    codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path(
        dbgym_workspace, result_path
    )
    # We're only supposed to link files generated by us, which means they should be in
    # cur_task_runs_path()
    assert codebase_path.samefile(
        dbgym_workspace.cur_task_runs_path()
    ), "link_result should only be called on files generated by this invocation to task.py"
    symlink_parent_path = (
        dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname
    )
    symlink_parent_path.mkdir(parents=True, exist_ok=True)

    # Remove the old symlink ("old" meaning created in an earlier run) if there is one.
    # Note that when several processes of the same run call this concurrently, this might remove
    # a symlink created by one of them, meaning it's not "old" by our definition of "old".
    # However, we'll always end up with a symlink of the current run regardless of the order.
    assert result_name.endswith(".link") and not result_name.endswith(
        ".link.link"
    ), f'result_name ({result_name}) should end with ".link"'
    symlink_path = symlink_parent_path / result_name
    try_remove_file(symlink_path)
    try_create_symlink(result_path, symlink_path)

    return symlink_path


def try_create_symlink(src_path: Path, dst_path: Path) -> None:
"""
Our functions that create symlinks might be called by multiple processes at once
Expand Down

0 comments on commit b5fd76e

Please sign in to comment.