From b450d2d25ba974d0a4668730cd1ac54fcd1a5b07 Mon Sep 17 00:00:00 2001
From: Jaideep Rao <jrao@redhat.com>
Date: Tue, 10 Sep 2024 12:50:58 -0400
Subject: [PATCH] chore: replace platformdirs with xdg-base-dirs

Signed-off-by: Jaideep Rao <jrao@redhat.com>
---
 docs/data_mixing.md                  |  6 +-----
 requirements.txt                     |  2 +-
 src/instructlab/sdg/generate_data.py | 17 ++++++++---------
 3 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/docs/data_mixing.md b/docs/data_mixing.md
index 7dedb446..3d283158 100644
--- a/docs/data_mixing.md
+++ b/docs/data_mixing.md
@@ -9,11 +9,7 @@ The primary intended use of this is to specify an optional pre-generated dataset
 To use the [InstructLab Community pre-generated dataset](https://huggingface.co/datasets/instructlab/InstructLabCommunity) with all skills training, we first need to create a default recipe that specifies this dataset to include when mixing generated skills data. This recipe will get automatically picked up if placed in a `default_data_recipes/skills.yaml` subfolder and file under one of several possible locations - `'/home/<user>/.local/share/instructlab/sdg'`, `'/usr/local/share/instructlab/sdg'`, or `'/usr/share/instructlab/sdg'`. The exact list of possible locations is platform-dependent, and can be enumerated by a Python command like below:
 
 ```python
-python3 -c '
-import os, platformdirs
-print(list(platformdirs.PlatformDirs(
-    appname=os.path.join("instructlab", "sdg"), multipath=True
-).iter_data_dirs()))'
+python3 -c "import os; from xdg_base_dirs import xdg_data_home, xdg_data_dirs; data_dirs = [os.path.join(xdg_data_home(), 'instructlab', 'sdg')] + [os.path.join(d, 'instructlab', 'sdg') for d in xdg_data_dirs()]; print(data_dirs)"
 ```
 
 For this example, we'll assume you want to place to default data recipe under the `~/.local/share/instructlab/sdg/` platform directory.
diff --git a/requirements.txt b/requirements.txt
index d8e9544c..9d3e75f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,9 @@ httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0
 langchain-text-splitters
 openai>=1.13.3,<2.0.0
-platformdirs>=4.2
 # Note: this dependency goes along with langchain-text-splitters and may be
 #       removed once that one is removed.
 # do not use 8.4.0 due to a bug in the library
 # https://github.com/instructlab/instructlab/issues/1389
 tenacity>=8.3.0,!=8.4.0
+xdg-base-dirs>=6.0.1
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 177fd90b..61934535 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -14,8 +14,8 @@
 # Third Party
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from datasets import Dataset
+from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import openai
-import platformdirs
 
 # First Party
 # pylint: disable=ungrouped-imports
@@ -208,10 +208,10 @@ def _sdg_init(ctx, pipeline):
     # Search for the pipeline in User and Site data directories
     # then for a package defined pipeline
     # and finally pipelines referenced by absolute path
-    pd = platformdirs.PlatformDirs(
-        appname=os.path.join("instructlab", "sdg"), multipath=True
-    )
-    for d in pd.iter_data_dirs():
+    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
+    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
+
+    for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):
             _check_pipeline_dir(pipeline_path)
@@ -246,10 +246,9 @@ def load_pipeline(yaml_basename):
 
 
 def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst):
-    pd = platformdirs.PlatformDirs(
-        appname=os.path.join("instructlab", "sdg"), multipath=True
-    )
-    data_dirs = list(pd.iter_data_dirs())
+    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
+    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
+
     return DataMixer(
         data_dirs,
         output_dir,