Skip to content

Commit

Permalink
Commit for documentation update for meta annotation (#16)
Browse files Browse the repository at this point in the history
* Commit for documentation update for meta annotation

Documentation update for meta annotation training to include training for bert and 2 phase learning

* Commit for tutorial on using oversampled data for training

* Update meta_annotation_training.ipynb

* Pushing base and advanced notebook setup

* Pushing change for save results and model type

---------

Co-authored-by: mart-r <[email protected]>
  • Loading branch information
shubham-s-agarwal and mart-r authored Aug 15, 2024
1 parent ebfaf25 commit cf0908e
Show file tree
Hide file tree
Showing 2 changed files with 2,862 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "d58c720d",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -12,21 +13,34 @@
"from medcat.cat import CAT\n",
"from medcat.meta_cat import MetaCAT\n",
"from medcat.config_meta_cat import ConfigMetaCAT\n",
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE\n",
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
"from tokenizers import ByteLevelBPETokenizer"
]
},
{
"attachments": {},
"cell_type": "code",
"execution_count": 2,
"id": "ca80af0e",
"metadata": {},
"outputs": [],
"source": [
"# Optional: switch the root logger to INFO level.\n",
"# force=True makes basicConfig replace any handlers already attached to the root logger.\n",
"import logging\n",
"\n",
"logging.basicConfig(force=True, level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"id": "5d0606ec",
"metadata": {},
"source": [
"# Set parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "dd7a2e97",
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -64,16 +78,49 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "35aa5605",
"metadata": {},
"source": [
"Before you run the next section, please double-check that the model's meta_annotation names match those specified in the mct export.\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "8bf6f5c3",
"metadata": {},
"source": [
"Before you run the next section, please double-check that the model's meta_annotation names match those specified in the mct export."
"Depending on the model pack you have, please run either the LSTM model section or the BERT model section. <br>\n",
"If you are unsure which one applies, run the next cell to check the model type."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2933f7e1",
"metadata": {},
"outputs": [],
"source": [
"# Print the model type stored in each meta model's config.json, so you can tell\n",
"# whether the LSTM or the BERT section applies to your model pack.\n",
"for meta_model in meta_model_names:\n",
"    config_file = os.path.join(base_dir_meta_models, \"meta_\" + meta_model, \"config.json\")\n",
"    with open(config_file, 'r') as jfile:\n",
"        config_dict = json.load(jfile)\n",
"\n",
"    model_section = config_dict['model']\n",
"    # The training cells read every config section through ['py/state']['__dict__'].\n",
"    # Unwrap that layer when it is present; otherwise use the section as-is.\n",
"    model_section = model_section.get('py/state', {}).get('__dict__', model_section)\n",
"\n",
"    print(f\"Model used for meta_{meta_model}:\", model_section['model_name'])"
]
},
{
"cell_type": "markdown",
"id": "83701c19",
"metadata": {},
"source": [
"# For LSTM model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e1720aa",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -90,30 +137,96 @@
" config = ConfigMetaCAT()\n",
" for key, value in config_dict.items():\n",
" setattr(config, key, value['py/state']['__dict__'])\n",
" # Reset the config attributes. TODO: Talk to Mart about how his new config style has affected this and best practise going forward\n",
"\n",
" \n",
" save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
" #Ideally this should replace the meta_models inside the modelpack\n",
"\n",
" # Initialise and train meta_model\n",
" mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
" results = mc.train(mctrainer_export_path, save_dir_path=save_dir_path)\n",
"\n",
" results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
" \n",
" # Save results\n",
" json.dump(results, open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
" json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
]
},
{
"cell_type": "markdown",
"id": "91ff4e28",
"metadata": {},
"source": [
"# For BERT model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e255dda2",
"metadata": {},
"outputs": [],
"source": [
"for meta_model in meta_model_names:\n",
"    meta_model_dir = os.path.join(base_dir_meta_models, \"meta_\" + meta_model)\n",
"\n",
"    # Load the saved config.json and copy each of its sections onto a fresh ConfigMetaCAT\n",
"    with open(os.path.join(meta_model_dir, \"config.json\"), 'r') as jfile:\n",
"        config_dict = json.load(jfile)\n",
"    config = ConfigMetaCAT()\n",
"    for key, value in config_dict.items():\n",
"        setattr(config, key, value['py/state']['__dict__'])\n",
"\n",
"    tokenizer = TokenizerWrapperBERT.load(meta_model_dir, config.model['model_variant'])\n",
"\n",
"    # change model name if training BERT for the first time\n",
"    config.model['model_name'] = 'bert'\n",
"\n",
"    save_dir_path = \"test_meta_\" + meta_model  # Where to save the meta_model and results.\n",
"    # Ideally this should replace the meta_models inside the modelpack\n",
"\n",
"    # Initialise and train meta_model\n",
"    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
"    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
"\n",
"    # Save results next to the trained model, as meta_<name>_results.json\n",
"    os.makedirs(save_dir_path, exist_ok=True)\n",
"    results_path = os.path.join(save_dir_path, 'meta_' + meta_model + '_results.json')\n",
"    with open(results_path, 'w') as results_file:\n",
"        json.dump(results['report'], results_file)"
]
},
{
"cell_type": "markdown",
"id": "ab23e424",
"metadata": {},
"source": [
"## If you don't have the model packs and are training from scratch"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16231060",
"metadata": {},
"outputs": [],
"source": []
"source": [
"config = ConfigMetaCAT()\n",
"# make sure to change the following parameters:\n",
"# config.model['nclasses']\n",
"# config.general['category_name']\n",
"\n",
"# change model name if training BERT for the first time\n",
"config.model['model_name'] = 'bert'\n",
"\n",
"tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
"\n",
"save_dir_path = \"test_meta\"  # Where to save the meta_model and results.\n",
"# Ideally this should replace the meta_models inside the modelpack\n",
"\n",
"# Initialise and train meta_model\n",
"mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
"results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
"\n",
"# Save results. The file is named after config.general['category_name'] rather than\n",
"# the `meta_model` loop variable, which this cell never defines itself.\n",
"os.makedirs(save_dir_path, exist_ok=True)\n",
"results_path = os.path.join(save_dir_path, f\"meta_{config.general['category_name']}_results.json\")\n",
"with open(results_path, 'w') as results_file:\n",
"    json.dump(results['report'], results_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "medcat",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -127,15 +240,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca"
}
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 5
}
Loading

0 comments on commit cf0908e

Please sign in to comment.