diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb
new file mode 100644
index 0000000..fb43190
--- /dev/null
+++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_with_oversampling.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "f1ac6acb",
+ "metadata": {},
+ "source": [
+ "# Oversampling data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a431cf0",
+ "metadata": {},
+ "source": [
+ "You can generate synthetic data to help mitigate class imbalance. <br>Use this code to generate synthetic data using an LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4) <br>NOTE: the generated data will require a manual quality check to ensure that only high-quality, relevant data is used for training."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd8066b5",
+ "metadata": {},
+ "source": [
+ "The output format of the gist code differs from the data format MedCAT expects, so the generated data currently has to be reformatted manually. We will update this module to include code that handles this conversion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5860552",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Refer to the meta_annotation_training notebook for the initial steps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "949299e4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# To run the training with original + synthetic data\n",
+ "# Follow the same steps as in the meta_annotation_training notebook, up to initialising the MetaCAT model\n",
+ "\n",
+ "# Initialise and train meta_model\n",
+ "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
+ "\n",
+ "# the format expected is a list of samples, one per annotation:\n",
+ "# [[['text','of','the','document'], [index of medical entity], \"label\" ],\n",
+ "#  [['text','of','the','document'], [index of medical entity], \"label\" ]]\n",
+ "\n",
+ "synthetic_data_export = [[],[],[]]\n",
+ "\n",
+ "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}