argilla-io · pre-commit-ci · Nov 5, 2024 · Nov 5, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,23 +1,23 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-yaml
         exclude: argilla/mkdocs.yml|examples/deployments/k8s
       - id: end-of-file-fixer
         exclude_types: [text, jupyter]
       - id: trailing-whitespace
 
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.4.8
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.7.2
     hooks:
       - id: ruff-format
 
   ##############################################################################
   # argilla specific hooks
   ##############################################################################
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.4.8
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.7.2
     hooks:
       - id: ruff
         files: 'argilla/src/.*\.py$'
@@ -35,7 +35,7 @@ repos:
           - argilla/LICENSE_HEADER
           - --fuzzy-match-generates-todo
   - repo: https://github.com/kynan/nbstripout
-    rev: 0.7.1
+    rev: 0.8.0
     hooks:
       - id: nbstripout
         files: '^argilla/.*\.ipynb$'
@@ -52,8 +52,8 @@ repos:
   ##############################################################################
   # argilla-server specific hooks
   ##############################################################################
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.4.8
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.7.2
     hooks:
       - id: ruff
-        files: 'argila-server/src/.*\.py$'
+        files: 'argilla-server/src/.*\.py$'

diff --git a/argilla/docs/community/integrations/llamaindex_rag_github.ipynb b/argilla/docs/community/integrations/llamaindex_rag_github.ipynb
@@ -202,8 +202,7 @@
     "            \".svg\",\n",
     "            \".ico\",\n",
     "            \".json\",\n",
-    "            \".ipynb\",   # Erase this line if you want to include notebooks\n",
-    "\n",
+    "            \".ipynb\",  # Erase this line if you want to include notebooks\n",
     "        ],\n",
     "        GithubRepositoryReader.FilterType.EXCLUDE,\n",
     "    ),\n",
@@ -231,9 +230,7 @@
    "outputs": [],
    "source": [
     "# LLM settings\n",
-    "Settings.llm = OpenAI(\n",
-    "    model=\"gpt-3.5-turbo\", temperature=0.8, openai_api_key=openai_api_key\n",
-    ")\n",
+    "Settings.llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.8, openai_api_key=openai_api_key)\n",
     "\n",
     "# Load the data and create the index\n",
     "index = VectorStoreIndex.from_documents(documents)\n",

diff --git a/argilla/docs/tutorials/image_classification.ipynb b/argilla/docs/tutorials/image_classification.ipynb
@@ -93,13 +93,7 @@
     "from PIL import Image\n",
     "\n",
     "from datasets import load_dataset, Dataset, load_metric\n",
-    "from transformers import (\n",
-    "    AutoImageProcessor,\n",
-    "    AutoModelForImageClassification,\n",
-    "    pipeline,\n",
-    "    Trainer,\n",
-    "    TrainingArguments\n",
-    ")\n",
+    "from transformers import AutoImageProcessor, AutoModelForImageClassification, pipeline, Trainer, TrainingArguments\n",
     "\n",
     "import argilla as rg"
    ]
@@ -182,7 +176,7 @@
     "            title=\"What digit do you see on the image?\",\n",
     "            labels=labels,\n",
     "        )\n",
-    "    ]\n",
+    "    ],\n",
     ")"
    ]
   },
@@ -246,7 +240,7 @@
     "n_rows = 100\n",
     "\n",
     "hf_dataset = load_dataset(\"ylecun/mnist\", streaming=True)\n",
-    "dataset_rows = [row for _,row in zip(range(n_rows), hf_dataset[\"train\"])]\n",
+    "dataset_rows = [row for _, row in zip(range(n_rows), hf_dataset[\"train\"])]\n",
     "hf_dataset = Dataset.from_list(dataset_rows)\n",
     "\n",
     "hf_dataset"
@@ -525,7 +519,8 @@
    ],
    "source": [
     "def greyscale_to_rgb(img) -> Image:\n",
-    "    return Image.merge('RGB', (img, img, img))\n",
+    "    return Image.merge(\"RGB\", (img, img, img))\n",
+    "\n",
     "\n",
     "submitted_image_rgb = [\n",
     "    {\n",
@@ -556,7 +551,7 @@
     "\n",
     "submitted_image_rgb_processed = [\n",
     "    {\n",
-    "        \"pixel_values\": processor(sample[\"image\"], return_tensors='pt')[\"pixel_values\"],\n",
+    "        \"pixel_values\": processor(sample[\"image\"], return_tensors=\"pt\")[\"pixel_values\"],\n",
     "        \"label\": sample[\"label\"],\n",
     "    }\n",
     "    for sample in submitted_image_rgb\n",
@@ -624,8 +619,8 @@
    "source": [
     "def collate_fn(batch):\n",
     "    return {\n",
-    "        'pixel_values': torch.stack([torch.tensor(x['pixel_values'][0]) for x in batch]),\n",
-    "        'labels': torch.tensor([int(x['label']) for x in batch])\n",
+    "        \"pixel_values\": torch.stack([torch.tensor(x[\"pixel_values\"][0]) for x in batch]),\n",
+    "        \"labels\": torch.tensor([int(x[\"label\"]) for x in batch]),\n",
     "    }"
    ]
   },
@@ -643,6 +638,8 @@
    "outputs": [],
    "source": [
     "metric = load_metric(\"accuracy\", trust_remote_code=True)\n",
+    "\n",
+    "\n",
     "def compute_metrics(p):\n",
     "    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)"
    ]
@@ -664,7 +661,7 @@
     "    checkpoint,\n",
     "    num_labels=len(labels),\n",
     "    id2label={int(i): int(c) for i, c in enumerate(labels)},\n",
-    "    label2id={int(c): int(i) for i, c in enumerate(labels)}\n",
+    "    label2id={int(c): int(i) for i, c in enumerate(labels)},\n",
     ")\n",
     "model.config"
    ]
@@ -698,19 +695,19 @@
    ],
    "source": [
     "training_args = TrainingArguments(\n",
-    "  output_dir=\"./image-classifier\",\n",
-    "  per_device_train_batch_size=16,\n",
-    "  eval_strategy=\"steps\",\n",
-    "  num_train_epochs=1,\n",
-    "  fp16=False, # True if you have a GPU with mixed precision support\n",
-    "  save_steps=100,\n",
-    "  eval_steps=100,\n",
-    "  logging_steps=10,\n",
-    "  learning_rate=2e-4,\n",
-    "  save_total_limit=2,\n",
-    "  remove_unused_columns=True,\n",
-    "  push_to_hub=False,\n",
-    "  load_best_model_at_end=True,\n",
+    "    output_dir=\"./image-classifier\",\n",
+    "    per_device_train_batch_size=16,\n",
+    "    eval_strategy=\"steps\",\n",
+    "    num_train_epochs=1,\n",
+    "    fp16=False,  # True if you have a GPU with mixed precision support\n",
+    "    save_steps=100,\n",
+    "    eval_steps=100,\n",
+    "    logging_steps=10,\n",
+    "    learning_rate=2e-4,\n",
+    "    save_total_limit=2,\n",
+    "    remove_unused_columns=True,\n",
+    "    push_to_hub=False,\n",
+    "    load_best_model_at_end=True,\n",
     ")\n",
     "\n",
     "trainer = Trainer(\n",
@@ -745,12 +742,14 @@
    "source": [
     "pipe = pipeline(\"image-classification\", model=model, image_processor=processor)\n",
     "\n",
+    "\n",
     "def run_inference(batch):\n",
     "    predictions = pipe(batch[\"image\"])\n",
     "    batch[\"image_label\"] = [prediction[0][\"label\"] for prediction in predictions]\n",
     "    batch[\"score\"] = [prediction[0][\"score\"] for prediction in predictions]\n",
     "    return batch\n",
     "\n",
+    "\n",
     "hf_dataset = hf_dataset.map(run_inference, batched=True)"
    ]
   },

diff --git a/argilla/docs/tutorials/image_preference.ipynb b/argilla/docs/tutorials/image_preference.ipynb
@@ -191,11 +191,10 @@
     "    metadata=[\n",
     "        rg.FloatMetadataProperty(name=\"toxicity\", title=\"Toxicity score\"),\n",
     "        rg.FloatMetadataProperty(name=\"identity_attack\", title=\"Identity attack score\"),\n",
-    "\n",
     "    ],\n",
     "    vectors=[\n",
     "        rg.VectorField(name=\"original_caption_vector\", dimensions=384),\n",
-    "    ]\n",
+    "    ],\n",
     ")"
    ]
   },
@@ -254,7 +253,7 @@
     "n_rows = 25\n",
     "\n",
     "hf_dataset = load_dataset(\"tomg-group-umd/pixelprose\", streaming=True)\n",
-    "dataset_rows = [row for _,row in zip(range(n_rows), hf_dataset[\"train\"])]\n",
+    "dataset_rows = [row for _, row in zip(range(n_rows), hf_dataset[\"train\"])]\n",
     "hf_dataset = Dataset.from_list(dataset_rows)\n",
     "\n",
     "hf_dataset"
@@ -341,8 +340,7 @@
     }
    ],
    "source": [
-    "hf_dataset = hf_dataset.filter(\n",
-    "    lambda x: any([x[\"url\"].endswith(extension) for extension in [\".jpg\", \".png\", \".jpeg\"]]))\n",
+    "hf_dataset = hf_dataset.filter(lambda x: any([x[\"url\"].endswith(extension) for extension in [\".jpg\", \".png\", \".jpeg\"]]))\n",
     "\n",
     "hf_dataset"
    ]
@@ -380,6 +378,7 @@
     "API_URL = \"https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell\"\n",
     "headers = {\"Authorization\": f\"Bearer {os.getenv('HF_TOKEN')}\"}\n",
     "\n",
+    "\n",
     "def query(payload):\n",
     "    response = requests.post(API_URL, headers=headers, json=payload)\n",
     "    if response.status_code == 200:\n",
@@ -391,9 +390,8 @@
     "        image = query(payload)\n",
     "    return image\n",
     "\n",
-    "query({\n",
-    "\t\"inputs\": \"Astronaut riding a horse\"\n",
-    "})"
+    "\n",
+    "query({\"inputs\": \"Astronaut riding a horse\"})"
    ]
   },
   {
@@ -426,9 +424,10 @@
     "def generate_image(row):\n",
     "    caption = row[\"original_caption\"]\n",
     "    row[\"image_1\"] = query({\"inputs\": caption})\n",
-    "    row[\"image_2\"] = query({\"inputs\": caption + \" \"}) # space to avoid caching and getting the same image\n",
+    "    row[\"image_2\"] = query({\"inputs\": caption + \" \"})  # space to avoid caching and getting the same image\n",
     "    return row\n",
-    "    \n",
+    "\n",
+    "\n",
     "hf_dataset_with_images = hf_dataset.map(generate_image, batched=False)\n",
     "\n",
     "hf_dataset_with_images"
@@ -451,11 +450,13 @@
    "source": [
     "model = SentenceTransformer(\"TaylorAI/bge-micro-v2\")\n",
     "\n",
+    "\n",
     "def encode_questions(batch):\n",
     "    vectors_as_numpy = model.encode(batch[\"original_caption\"])\n",
     "    batch[\"original_caption_vector\"] = [x.tolist() for x in vectors_as_numpy]\n",
     "    return batch\n",
     "\n",
+    "\n",
     "hf_dataset_with_images_vectors = hf_dataset_with_images.map(encode_questions, batched=True)"
    ]
   },
@@ -474,11 +475,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset.records.log(records=hf_dataset_with_images_vectors, mapping={\n",
-    "    \"key\": \"id\",\n",
-    "    \"original_caption\": \"caption\",\n",
-    "    \"url\": \"image_original\",\n",
-    "})"
+    "dataset.records.log(\n",
+    "    records=hf_dataset_with_images_vectors,\n",
+    "    mapping={\n",
+    "        \"key\": \"id\",\n",
+    "        \"original_caption\": \"caption\",\n",
+    "        \"url\": \"image_original\",\n",
+    "    },\n",
+    ")"
    ]
   },
   {

diff --git a/argilla/docs/tutorials/token_classification.ipynb b/argilla/docs/tutorials/token_classification.ipynb
@@ -309,9 +309,7 @@
    "source": [
     "def predict_gliner(model, text, labels, threshold):\n",
     "    entities = model.predict_entities(text, labels, threshold)\n",
-    "    return [\n",
-    "        {k: v for k, v in ent.items() if k not in {\"score\", \"text\"}} for ent in entities\n",
-    "    ]"
+    "    return [{k: v for k, v in ent.items() if k not in {\"score\", \"text\"}} for ent in entities]"
    ]
   },
   {
@@ -330,9 +328,7 @@
     "data = dataset.records.to_list(flatten=True)\n",
     "updated_data = [\n",
     "    {\n",
-    "        \"span_label\": predict_gliner(\n",
-    "            model=gliner_model, text=sample[\"text\"], labels=labels, threshold=0.70\n",
-    "        ),\n",
+    "        \"span_label\": predict_gliner(model=gliner_model, text=sample[\"text\"], labels=labels, threshold=0.70),\n",
     "        \"id\": sample[\"id\"],\n",
     "    }\n",
     "    for sample in data\n",