From 7f6a8bf155bf8ead079691d6c6e0f1f7fffb6793 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Mon, 5 Aug 2024 15:18:59 +0100
Subject: [PATCH] Commit for tutorial on using oversampled data for training

---
 ...nnotation_training_with_oversampling.ipynb | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb
new file mode 100644
index 0000000..fb43190
--- /dev/null
+++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f1ac6acb",
+   "metadata": {},
+   "source": [
+    "# Oversampling data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a431cf0",
+   "metadata": {},
+   "source": [
+    "You can generate synthetic data to help mitigate class imbalance. <br> Use this code to generate synthetic data with an LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4) <br> <b>NOTE</b>: the generated data requires a manual quality check to ensure that only high-quality, relevant data is used for training."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd8066b5",
+   "metadata": {},
+   "source": [
+    "The format of the data generated by the gist code differs from the format required by MedCAT, so the generated data currently has to be reformatted manually. We will update this module to include code that handles this conversion."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5860552",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Refer to the meta_annotation_training notebook for the initial steps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "949299e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
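+    "# To train with original + synthetic data, first follow the same steps as the meta_annotation_training notebook\n",
+    "# up to initialising the MetaCAT model; tokenizer, config, mctrainer_export_path and save_dir_path are expected to come from those steps\n",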
+    "\n",
+    "# Initialise and train meta_model\n",
+    "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
+    "\n",
+    "# the expected format is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n",
+    "#                          [['text','of','the','document'], [index of medical entity], \"label\" ]]\n",
+    "\n",
+    "synthetic_data_export = [[],[],[]]\n",
+    "\n",
+    "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}