Commit for documentation update for meta annotation (#16)

* Commit for documentation update for meta annotation Documentation update for meta annotation training to include training for bert and 2 phase learning * Commit for tutorial on using oversampled data for training * Update meta_annotation_training.ipynb * Pushing base and advanced notebook setup * Pushing change for save results and model type --------- Co-authored-by: mart-r <[email protected]>
CogStack · Aug 15, 2024 · cf0908e · cf0908e
1 parent ebfaf25
commit cf0908e
Show file tree

Hide file tree

Showing 2 changed files with 2,862 additions and 21 deletions.
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
@@ -2,7 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
+   "id": "d58c720d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,21 +13,34 @@
     "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
     "from medcat.config_meta_cat import ConfigMetaCAT\n",
-    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE\n",
+    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
     "from tokenizers import ByteLevelBPETokenizer"
    ]
   },
   {
-   "attachments": {},
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ca80af0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: run this cell to switch on info level logging\n",
+    "import logging\n",
+    "logging.basicConfig(force=True, level=logging.INFO)"
+   ]
+  },
+  {
    "cell_type": "markdown",
+   "id": "5d0606ec",
    "metadata": {},
    "source": [
     "# Set parameters"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
+   "id": "dd7a2e97",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,16 +78,49 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
+   "id": "35aa5605",
+   "metadata": {},
+   "source": [
+    "Before you run the next section, please double-check that the model meta_annotation names match those specified in the mct export.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bf6f5c3",
    "metadata": {},
    "source": [
-    "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export."
+    "Depending on the model pack you have, please run either the LSTM model section or the BERT model section. <br>\n",
+    "If you are unsure which one applies, run the next cell to check the model type."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "2933f7e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For every meta model, read its config.json and report the model type it declares\n",
+    "for meta_model in meta_model_names:\n",
+    "    meta_model_dir = os.path.join(base_dir_meta_models, \"meta_\"+meta_model)\n",
+    "    config_file = os.path.join(meta_model_dir, \"config.json\")\n",
+    "    with open(config_file, 'r') as jfile:\n",
+    "        config_dict = json.load(jfile)\n",
+    "    model_name = config_dict['model']['model_name']\n",
+    "    print(f\"Model used for meta_{meta_model}:\", model_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "83701c19",
+   "metadata": {},
+   "source": [
+    "# For LSTM model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e1720aa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -90,30 +137,96 @@
     "    config = ConfigMetaCAT()\n",
     "    for key, value in config_dict.items():\n",
     "        setattr(config, key, value['py/state']['__dict__'])\n",
-    "        # Reset the config attributes. TODO: Talk to Mart about how his new config style has affected this and best practise going forward\n",
-    "\n",
+    "        \n",
     "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "    #Ideally this should replace the meta_models inside the modelpack\n",
     "\n",
     "    # Initialise and train meta_model\n",
     "    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
-    "    results = mc.train(mctrainer_export_path, save_dir_path=save_dir_path)\n",
-    "\n",
+    "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
+    "    \n",
     "    # Save results\n",
-    "    json.dump(results, open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
+    "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91ff4e28",
+   "metadata": {},
+   "source": [
+    "# For BERT model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e255dda2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train a BERT-based MetaCAT model for every name in meta_model_names\n",
+    "for meta_model in meta_model_names:\n",
+    "    # load and sort out the config\n",
+    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
+    "    with open(config_file, 'r') as jfile:\n",
+    "        config_dict = json.load(jfile)\n",
+    "    config = ConfigMetaCAT()\n",
+    "    for key, value in config_dict.items():\n",
+    "        setattr(config, key, value['py/state']['__dict__'])\n",
+    "\n",
+    "    # The tokenizer is loaded from the same meta model directory as the config\n",
+    "    tokenizer = TokenizerWrapperBERT.load(os.path.join(base_dir_meta_models,\"meta_\"+meta_model), \n",
+    "                                          config.model['model_variant'])\n",
+    "    \n",
+    "    # change model name if training BERT for the first time\n",
+    "    config.model['model_name'] = 'bert'\n",
+    "    \n",
+    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
+    "    #Ideally this should replace the meta_models inside the modelpack\n",
+    "\n",
+    "    # Initialise and train meta_model\n",
+    "    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
+    "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
+    "\n",
+    "    # Save results (the training report) in the save directory.\n",
+    "    # makedirs guards against the directory not existing; the context manager closes the file.\n",
+    "    os.makedirs(save_dir_path, exist_ok=True)\n",
+    "    with open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w') as jfile:\n",
+    "        json.dump(results['report'], jfile)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab23e424",
+   "metadata": {},
+   "source": [
+    "## If you don't have the model packs and are training from scratch"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "16231060",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Training from scratch: start from a default config instead of one loaded from a model pack\n",
+    "config = ConfigMetaCAT()\n",
+    "# make sure to change the following parameters:\n",
+    "# config.model['nclasses']\n",
+    "# config.general['category_name']\n",
+    "\n",
+    "# change model name if training BERT for the first time\n",
+    "config.model['model_name'] = 'bert'\n",
+    "\n",
+    "# No model pack directory is available in this scenario, hence the empty path argument\n",
+    "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
+    "\n",
+    "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
+    "#Ideally this should replace the meta_models inside the modelpack\n",
+    "\n",
+    "# Initialise and train meta_model\n",
+    "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
+    "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
+    "\n",
+    "# Save results. This cell has no `meta_model` loop variable, so the results file is named\n",
+    "# after the configured category; the context manager closes the file once it is written.\n",
+    "os.makedirs(save_dir_path, exist_ok=True)\n",
+    "results_file = os.path.join(save_dir_path, f\"meta_{config.general['category_name']}_results.json\")\n",
+    "with open(results_file, 'w') as jfile:\n",
+    "    json.dump(results['report'], jfile)"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "medcat",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -127,15 +240,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca"
-   }
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 5
 }