From b450d2d25ba974d0a4668730cd1ac54fcd1a5b07 Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Tue, 10 Sep 2024 12:50:58 -0400 Subject: [PATCH] chore: replace platformdirs with xdg-base-dirs Signed-off-by: Jaideep Rao --- docs/data_mixing.md | 6 +----- requirements.txt | 2 +- src/instructlab/sdg/generate_data.py | 17 ++++++++--------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/docs/data_mixing.md b/docs/data_mixing.md index 7dedb446..3d283158 100644 --- a/docs/data_mixing.md +++ b/docs/data_mixing.md @@ -9,11 +9,7 @@ The primary intended use of this is to specify an optional pre-generated dataset To use the [InstructLab Community pre-generated dataset](https://huggingface.co/datasets/instructlab/InstructLabCommunity) with all skills training, we first need to create a default recipe that specifies this dataset to include when mixing generated skills data. This recipe will get automatically picked up if placed in a `default_data_recipes/skills.yaml` subfolder and file under one of several possible locations - `'/home/<user>/.local/share/instructlab/sdg'`, `'/usr/local/share/instructlab/sdg'`, or `'/usr/share/instructlab/sdg'`. The exact list of possible locations is platform-dependent, and can be enumerated by a Python command like below: ```python -python3 -c ' -import os, platformdirs -print(list(platformdirs.PlatformDirs( - appname=os.path.join("instructlab", "sdg"), multipath=True -).iter_data_dirs()))' +python3 -c "import os; from xdg_base_dirs import xdg_data_home, xdg_data_dirs; data_dirs = [os.path.join(xdg_data_home(), 'instructlab', 'sdg')] + [os.path.join(dir, 'instructlab', 'sdg') for dir in xdg_data_dirs()]; print(data_dirs)" ``` For this example, we'll assume you want to place to default data recipe under the `~/.local/share/instructlab/sdg/` platform directory. 
diff --git a/requirements.txt b/requirements.txt index d8e9544c..9d3e75f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,9 +6,9 @@ httpx>=0.25.0,<1.0.0 instructlab-schema>=0.4.0 langchain-text-splitters openai>=1.13.3,<2.0.0 -platformdirs>=4.2 # Note: this dependency goes along with langchain-text-splitters and may be # removed once that one is removed. # do not use 8.4.0 due to a bug in the library # https://github.com/instructlab/instructlab/issues/1389 tenacity>=8.3.0,!=8.4.0 +xdg-base-dirs>=6.0.1 diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 177fd90b..61934535 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -14,8 +14,8 @@ # Third Party # instructlab - All of these need to go away (other than sdg) - issue #6 from datasets import Dataset +from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai -import platformdirs # First Party # pylint: disable=ungrouped-imports @@ -208,10 +208,10 @@ def _sdg_init(ctx, pipeline): # Search for the pipeline in User and Site data directories # then for a package defined pipeline # and finally pipelines referenced by absolute path - pd = platformdirs.PlatformDirs( - appname=os.path.join("instructlab", "sdg"), multipath=True - ) - for d in pd.iter_data_dirs(): + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + + for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): _check_pipeline_dir(pipeline_path) @@ -246,10 +246,9 @@ def load_pipeline(yaml_basename): def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst): - pd = platformdirs.PlatformDirs( - appname=os.path.join("instructlab", "sdg"), multipath=True - ) - data_dirs = list(pd.iter_data_dirs()) + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + 
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + return DataMixer( data_dirs, output_dir,