Skip to content

Commit

Permalink
Merge pull request #269 from jaideepr97/switch-to-xdg-dirs
Browse files Browse the repository at this point in the history
chore: replace platformdirs with xdg-base-dirs
  • Loading branch information
jaideepr97 authored Sep 11, 2024
2 parents ec9545b + b450d2d commit b9d41ee
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 15 deletions.
6 changes: 1 addition & 5 deletions docs/data_mixing.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ The primary intended use of this is to specify an optional pre-generated dataset
To use the [InstructLab Community pre-generated dataset](https://huggingface.co/datasets/instructlab/InstructLabCommunity) with all skills training, we first need to create a default recipe that specifies this dataset to include when mixing generated skills data. This recipe will get automatically picked up if placed in a `default_data_recipes/skills.yaml` subfolder and file under one of several possible locations - `'/home/<user>/.local/share/instructlab/sdg'`, `'/usr/local/share/instructlab/sdg'`, or `'/usr/share/instructlab/sdg'`. The exact list of possible locations is platform-dependent, and can be enumerated by a Python command like below:

```python
python3 -c '
import os, platformdirs
print(list(platformdirs.PlatformDirs(
appname=os.path.join("instructlab", "sdg"), multipath=True
).iter_data_dirs()))'
python3 -c "import os; from xdg_base_dirs import xdg_data_home, xdg_data_dirs; data_dirs = [os.path.join(xdg_data_home(), 'instructlab', 'sdg')] + [os.path.join(dir, 'instructlab', 'sdg') for dir in xdg_data_dirs()]; print(data_dirs)"
```

For this example, we'll assume you want to place the default data recipe under the `~/.local/share/instructlab/sdg/` platform directory.
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
platformdirs>=4.2
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
# https://github.com/instructlab/instructlab/issues/1389
tenacity>=8.3.0,!=8.4.0
xdg-base-dirs>=6.0.1
17 changes: 8 additions & 9 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
# Third Party
# instructlab - All of these need to go away (other than sdg) - issue #6
from datasets import Dataset
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import openai
import platformdirs

# First Party
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -208,10 +208,10 @@ def _sdg_init(ctx, pipeline):
# Search for the pipeline in User and Site data directories
# then for a package defined pipeline
# and finally pipelines referenced by absolute path
pd = platformdirs.PlatformDirs(
appname=os.path.join("instructlab", "sdg"), multipath=True
)
for d in pd.iter_data_dirs():
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
_check_pipeline_dir(pipeline_path)
Expand Down Expand Up @@ -246,10 +246,9 @@ def load_pipeline(yaml_basename):


def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst):
pd = platformdirs.PlatformDirs(
appname=os.path.join("instructlab", "sdg"), multipath=True
)
data_dirs = list(pd.iter_data_dirs())
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

return DataMixer(
data_dirs,
output_dir,
Expand Down

0 comments on commit b9d41ee

Please sign in to comment.