diff --git a/docs/source/content/examples/data_manaer.ipynb b/docs/source/content/examples/data_manaer.ipynb
new file mode 100644
index 0000000..bf02536
--- /dev/null
+++ b/docs/source/content/examples/data_manaer.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Manager\n",
+    "When doing active learning we have our Original Data (OD), Labeled Data (LD) and Unlabeled Data (UD),\n",
+    "where UD and LD are subsets of OD.\n",
+    "The active learner operates on UD and returns indices relative to it. We want to store those indices with respect\n",
+    "to OD, and sometimes see the subset of labels of LD. (The subset of labels of UD is empty.)\n",
+    "\n",
+    "That's a fancy way of saying there is a lot of bookkeeping to be done, and this class solves that by doing it for you.\n",
+    "\n",
+    "The main idea is that we store a mask (labeled_mask) of the indices that have been labeled and then expose UD, LD\n",
+    "and the labels by fancy indexing with that mask. The manager exposes an add_labels method which lets the\n",
+    "user add labels indexed with respect to UD, and it will adjust the indices so that they match OD.\n"
+   ]
+  },
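+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before diving in, here is a minimal sketch of the bookkeeping idea in plain NumPy (no modAL needed). The names below (`od`, `labeled_mask`) are made up for the illustration.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "od = np.array([10.0, 11.0, 12.0, 13.0, 14.0])  # Original Data (OD)\n",
+    "labeled_mask = np.zeros(od.shape[0], dtype=bool)\n",
+    "\n",
+    "labeled_mask[[1, 3]] = True  # pretend we labeled OD[1] and OD[3]\n",
+    "ld = od[labeled_mask]   # Labeled Data (LD) -> [11., 13.]\n",
+    "ud = od[~labeled_mask]  # Unlabeled Data (UD) -> [10., 12., 14.]\n",
+    "\n",
+    "# An index relative to UD maps back to OD via the positions where the mask is False:\n",
+    "unlabeled_indices = np.flatnonzero(~labeled_mask)\n",
+    "print(unlabeled_indices[1])  # UD index 1 is OD index 2\n"
+   ]
+  },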
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparation\n",
+    "In this part we prepare the data and learners, all normal stuff you've seen in other examples.\n",
+    "One difference is that we're working with text data.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "This example shows how to use the new data manager class.\n",
+    "For clarity, all the setup has been moved into functions and\n",
+    "the core is in the final section, which is commented.\n",
+    "\n",
+    "Also look at prepare_manager to see how a DataManager is instantiated.\n",
+    "\"\"\"\n",
+    "\n",
+    "from functools import partial\n",
+    "\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "from modAL.batch import uncertainty_batch_sampling\n",
+    "from modAL.datamanager import DataManager\n",
+    "from modAL.models import ActiveLearner\n",
+    "\n",
+    "RANDOM_STATE_SEED = 123\n",
+    "np.random.seed(RANDOM_STATE_SEED)\n",
+    "BATCH_SIZE = 5\n",
+    "N_QUERIES = 50\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define Utility Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_data():\n",
+    "    SKIP_SIZE = 50  # Skip to make the example go fast\n",
+    "    docs, original_labels = fetch_20newsgroups(return_X_y=True)\n",
+    "    docs_train = docs[::SKIP_SIZE]\n",
+    "    original_labels_train = original_labels[::SKIP_SIZE]\n",
+    "    docs_test = docs[1::SKIP_SIZE]  # Offset by one means no overlap\n",
+    "    original_labels_test = original_labels[\n",
+    "        1::SKIP_SIZE\n",
+    "    ]  # Offset by one means no overlap\n",
+    "    return docs_train, original_labels_train, docs_test, original_labels_test\n",
+    "\n",
+    "\n",
+    "def prepare_features(docs_train, docs_test):\n",
+    "    vectorizer = TfidfVectorizer(\n",
+    "        stop_words=\"english\", ngram_range=(1, 3), max_df=0.9, max_features=5000\n",
+    "    )\n",
+    "    vectors_train = vectorizer.fit_transform(docs_train).toarray()\n",
+    "    vectors_test = vectorizer.transform(docs_test).toarray()\n",
+    "    return vectors_train, vectors_test\n",
+    "\n",
+    "\n",
+    "def prepare_learner():\n",
+    "    estimator = RandomForestClassifier()\n",
+    "    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)\n",
+    "    learner = ActiveLearner(estimator=estimator, query_strategy=preset_batch)\n",
+    "    return learner\n",
+    "\n",
+    "\n",
+    "def make_pretty_summary_plot(performance_history):\n",
+    "    with plt.style.context(\"seaborn-white\"):\n",
+    "        fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)\n",
+    "\n",
+    "        ax.plot(performance_history)\n",
+    "        ax.scatter(range(len(performance_history)), performance_history, s=13)\n",
+    "\n",
+    "        ax.xaxis.set_major_locator(\n",
+    "            mpl.ticker.MaxNLocator(nbins=N_QUERIES + 3, integer=True)\n",
+    "        )\n",
+    "        ax.xaxis.grid(True)\n",
+    "\n",
+    "        ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10))\n",
+    "        ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1))\n",
+    "        ax.set_ylim(bottom=0, top=1)\n",
+    "        ax.yaxis.grid(True, linestyle=\"--\", alpha=1 / 2)\n",
+    "\n",
+    "        ax.set_title(\"Incremental classification accuracy\")\n",
+    "        ax.set_xlabel(\"Query iteration\")\n",
+    "        ax.set_ylabel(\"Classification Accuracy\")\n",
+    "\n",
+    "        plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Instantiate The Data Manager\n",
+    "Here we instantiate the manager. We pass it the feature vectors we'll be training on, as well as the original documents (so we can easily index them).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_manager(vectors_train, docs_train):\n",
+    "    manager = DataManager(vectors_train, sources=docs_train)\n",
+    "    return manager\n"
+   ]
+  },
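+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before running the full loop, here is a quick, illustrative check of the manager on a toy array (the values below are made up for the demo): `add_labels` takes indices relative to UD and the manager translates them to OD.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "toy = DataManager(np.arange(12).reshape(6, 2), sources=list(\"abcdef\"))\n",
+    "toy.add_labels((0, 1))  # label UD index 0, which right now is also OD index 0\n",
+    "# After that add, UD index 0 points at OD index 1:\n",
+    "print(toy.get_original_index_from_unlabeled_index(0))\n",
+    "toy.add_labels([(0, 0), (2, 1)])  # these UD indices map to OD indices 1 and 3\n",
+    "print(toy.labels)  # labels of LD, stored as float64 by default (no labels_dtype given)\n",
+    "print(toy.remaining_sources)  # sources that are still unlabeled\n"
+   ]
+  },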
\n", + " The manager makes this almost transparent\n", + " \"\"\"\n", + " '''\n", + " Map the index that is with respect to unlabeled data back to an index with respect to the \n", + " whole dataset\n", + " '''\n", + " original_ix = manager.get_original_index_from_unlabeled_index(ix)\n", + " #print(manager.sources[original_ix]) #Show the original data so we can decide what to label\n", + " # Now we can lookup the label in the original set of labels without any bookkeeping\n", + " y = original_labels_train[original_ix]\n", + " # We create a Label instance, a tuple of index and label\n", + " # The index should be with respect to the unlabeled data, the add_labels function will automatically\n", + " # calculate the offsets\n", + " label = (ix, y)\n", + " # append the labels to a list\n", + " labels.append(label)\n", + " # Insert them all at once.\n", + " manager.add_labels(labels)\n", + " # Note that if you need to add labels with indicies that repsect the original dataset you can do\n", + " # manager.add_labels(labels,offset_to_unlabeled=False)\n", + " # Now teach as usual\n", + " learner.teach(manager.labeled, manager.labels)\n", + " performance_history.append(learner.score(vectors_test, original_labels_test))\n", + "# Finnaly make a nice plot\n", + "make_pretty_summary_plot(performance_history)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/data_manager_and_text_classification.py b/examples/data_manager_and_text_classification.py new file mode 100644 index 0000000..bb9b089 --- /dev/null +++ b/examples/data_manager_and_text_classification.py @@ -0,0 +1,130 @@ +""" +This example shows how to use the new data manager class. +For clarity, all the setup has been moved into functions and +the core is in the __main__ section which is commented + +Also look at prepare_manager to see how a DataManager is instantiated + +""" + +from sklearn.datasets import fetch_20newsgroups +from sklearn.ensemble import RandomForestClassifier +from modAL.datamanager import DataManager +import numpy as np +import matplotlib as mpl +import matplotlib.pyplot as plt +from sklearn.feature_extraction.text import TfidfVectorizer +from functools import partial + + +from modAL.models import ActiveLearner +from modAL.batch import uncertainty_batch_sampling + +RANDOM_STATE_SEED = 123 +np.random.seed(RANDOM_STATE_SEED) +BATCH_SIZE = 5 +N_QUERIES = 50 + + +def prepare_data(): + SKIP_SIZE = 50 # Skip to make the example go fast. 
+
+from functools import partial
+
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from modAL.batch import uncertainty_batch_sampling
+from modAL.datamanager import DataManager
+from modAL.models import ActiveLearner
+
+RANDOM_STATE_SEED = 123
+np.random.seed(RANDOM_STATE_SEED)
+BATCH_SIZE = 5
+N_QUERIES = 50
+
+
+def prepare_data():
+    SKIP_SIZE = 50  # Skip to make the example go fast
+    docs, original_labels = fetch_20newsgroups(return_X_y=True)
+    docs_train = docs[::SKIP_SIZE]
+    original_labels_train = original_labels[::SKIP_SIZE]
+    docs_test = docs[1::SKIP_SIZE]  # Offset by one means no overlap
+    original_labels_test = original_labels[
+        1::SKIP_SIZE
+    ]  # Offset by one means no overlap
+    return docs_train, original_labels_train, docs_test, original_labels_test
+
+
+def prepare_features(docs_train, docs_test):
+    vectorizer = TfidfVectorizer(
+        stop_words="english", ngram_range=(1, 3), max_df=0.9, max_features=5000
+    )
+    vectors_train = vectorizer.fit_transform(docs_train).toarray()
+    vectors_test = vectorizer.transform(docs_test).toarray()
+    return vectors_train, vectors_test
+
+
+def prepare_manager(vectors_train, docs_train):
+    manager = DataManager(vectors_train, sources=docs_train)
+    return manager
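+
+
+# An aside (not part of the original example): DataManager allocates its internal
+# label store as np.empty(n, dtype=labels_dtype), which defaults to float64. If you
+# want integer class labels stored as ints, you could pass labels_dtype here, e.g.
+#     manager = DataManager(vectors_train, labels_dtype=int, sources=docs_train)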
+
+
+def prepare_learner():
+    estimator = RandomForestClassifier()
+    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)
+    learner = ActiveLearner(estimator=estimator, query_strategy=preset_batch)
+    return learner
+
+
+def make_pretty_summary_plot(performance_history):
+    with plt.style.context("seaborn-white"):
+        fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
+
+        ax.plot(performance_history)
+        ax.scatter(range(len(performance_history)), performance_history, s=13)
+
+        ax.xaxis.set_major_locator(
+            mpl.ticker.MaxNLocator(nbins=N_QUERIES + 3, integer=True)
+        )
+        ax.xaxis.grid(True)
+
+        ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10))
+        ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1))
+        ax.set_ylim(bottom=0, top=1)
+        ax.yaxis.grid(True, linestyle="--", alpha=1 / 2)
+
+        ax.set_title("Incremental classification accuracy")
+        ax.set_xlabel("Query iteration")
+        ax.set_ylabel("Classification Accuracy")
+
+        plt.show()
+
+
+if __name__ == "__main__":
+    docs_train, original_labels_train, docs_test, original_labels_test = prepare_data()
+    vectors_train, vectors_test = prepare_features(docs_train, docs_test)
+    manager = prepare_manager(vectors_train, docs_train)
+    learner = prepare_learner()
+    performance_history = []
+    # performance_history.append(learner.score(vectors_test, original_labels_test))
+
+    for i in range(N_QUERIES):
+        # Check if there are examples that are not labeled yet. If not, we're done
+        if manager.unlabeled.size == 0:
+            break
+
+        # Query the learner as usual. Since we are using a batch sampling strategy,
+        # indices_to_label is an array
+        indices_to_label, query_instance = learner.query(manager.unlabeled)
+        labels = []  # Hold a list of the new labels
+        for ix in indices_to_label:
+            # Here is the tricky part that the manager solves: the indices are relative
+            # to the unlabeled data, but we want to work with them with respect to the
+            # original data. The manager makes this almost transparent.
+            # Map the index that is relative to the unlabeled data back to an index
+            # relative to the whole dataset
+            original_ix = manager.get_original_index_from_unlabeled_index(ix)
+            # print(manager.sources[original_ix])  # Show the original data so we can decide what to label
+            # Now we can look up the label in the original set of labels without any bookkeeping
+            y = original_labels_train[original_ix]
+            # We create a Label instance, a tuple of (index, label).
+            # The index should be relative to the unlabeled data; add_labels will
+            # calculate the offsets automatically
+            label = (ix, y)
+            labels.append(label)
+        # Insert them all at once
+        manager.add_labels(labels)
+        # Note: if you need to add labels with indices relative to the original dataset, use
+        # manager.add_labels(labels, offset_to_unlabeled=False)
+        # Now teach as usual
+        learner.teach(manager.labeled, manager.labels)
+        performance_history.append(learner.score(vectors_test, original_labels_test))
+    # Finally, make a nice plot
+    make_pretty_summary_plot(performance_history)
diff --git a/modAL/datamanager.py b/modAL/datamanager.py
new file mode 100644
index 0000000..77b6cb3
--- /dev/null
+++ b/modAL/datamanager.py
@@ -0,0 +1,136 @@
+from typing import Any, List, Optional, Tuple, Union
+
+import numpy as np
+
+# A label is a tuple of (index, label value)
+Label = Tuple[int, Any]
+LabelList = List[Label]
+Sources = List[Any]
+
+
+class DataManager:
+    def __init__(
+        self,
+        features: np.ndarray,
+        labels_dtype: Optional[np.dtype] = None,
+        sources: Optional[Sources] = None,
+    ):
+        """
+        When doing active learning we have our Original Data (OD), Labeled Data (LD) and Unlabeled Data (UD),
+        where UD and LD are subsets of OD.
+        The active learner operates on UD and returns indices relative to it. We want to store those indices
+        with respect to OD, and sometimes see the subset of labels of LD. (The subset of labels of UD is empty.)
+
+        That's a fancy way of saying there is a lot of bookkeeping to be done, and this class solves that by
+        doing it for you.
+
+        The main idea is that we store a mask (labeled_mask) of the indices that have been labeled and then
+        expose UD, LD and the labels by fancy indexing with that mask. The manager exposes an add_labels
+        method which lets the user add labels indexed with respect to UD, and it will adjust the indices so
+        that they match OD.
+
+        :param features: An array of the features that will be used for AL.
+        :param labels_dtype: The dtype of the stored labels (float64 if not given).
+        :param sources: A list of the original data.
+        """
+        self.features = features
+
+        self._labels = np.empty(shape=self.features.shape[0], dtype=labels_dtype)
+        self.labeled_mask = np.zeros(self.features.shape[0], dtype=bool)
+        self.sources = np.array(sources if sources is not None else [])
+
+    @property
+    def labels(self):
+        """
+        :return: The labels, indexed with respect to LD.
+        """
+        return self._labels[self.labeled_mask]
+
+    @property
+    def unlabeled_mask(self):
+        """
+        :return: A mask which is True for all unlabeled points.
+        """
+        return np.logical_not(self.labeled_mask)
+
+    def _update_masks(self, labels: LabelList):
+        for label in labels:
+            self.labeled_mask[label[0]] = True
+
+    def _offset_new_labels(self, labels: LabelList) -> LabelList:
+        """
+        This is where the magic happens.
+        We take self.unlabeled_mask.nonzero()[0], which gives us an array of the indices that appear in the
+        unlabeled data. So if a new label arrived at position 0, we look up its "real" index in the
+        unlabeled_indices array to get its true position with respect to the original data.

+        :param labels: The labels, with indices relative to UD.
+        :return: The labels, with indices relative to OD.
+        """
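+        # Worked example (illustrative): if labeled_mask = [True, False, False, True, False],
+        # then unlabeled_indices = [1, 2, 4], so an incoming label (1, y), i.e. UD index 1,
+        # is corrected to (2, y), its position in OD.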
+        if not self.labeled_mask.any():
+            # Nothing has been labeled yet, so UD and OD indices coincide
+            return labels
+        corrected_labels: LabelList = []
+        unlabeled_indices = self.unlabeled_mask.nonzero()[0]
+
+        for label in labels:
+            new_index = unlabeled_indices[label[0]]
+            new_label: Label = (new_index, label[1])
+            corrected_labels.append(new_label)
+        return corrected_labels
+
+    def add_labels(self, labels: Union[Label, LabelList], offset_to_unlabeled=True):
+        if isinstance(labels, tuple):  # A single (index, label) example
+            labels = [labels]
+        elif not isinstance(labels, list):
+            raise TypeError(
+                "Malformed input. Please add either a tuple (ix, label) or a list [(ix, label), ...]"
+            )
+        if offset_to_unlabeled:
+            labels = self._offset_new_labels(labels)
+        self._update_masks(labels)
+        for label in labels:
+            self._labels[label[0]] = label[1]
+
+    @property
+    def unlabeled(self):
+        """
+        :return: UD, all of the unlabeled data points.
+        """
+        return self.features[self.unlabeled_mask]
+
+    @property
+    def labeled(self):
+        """
+        :return: LD, all of the labeled data points.
+        """
+        return self.features[self.labeled_mask]
+
+    @property
+    def remaining_sources(self):
+        """
+        :return: The original data, as opposed to the features, with respect to UD.
+        """
+        return self.sources[self.unlabeled_mask]
+
+    def get_original_index_from_unlabeled_index(self, ixs: Union[int, List[int]]):
+        """
+        Utility function that takes indices relative to the unlabeled subset and returns the equivalent
+        indices relative to the complete array.
+        Useful for testing purposes, where we have the existing labels and want to take them in the order
+        in which the active learner specifies.
+
+        :param ixs: A single index, or a list of indices, relative to UD.
+        :return: A list of the equivalent indices relative to OD.
+        """
+        unlabeled_indices = self.unlabeled_mask.nonzero()[0]
+        if isinstance(ixs, (int, np.integer)):
+            ixs = [ixs]
+        return [unlabeled_indices[ix] for ix in ixs]
+
+
+__all__ = ["Label", "LabelList", "DataManager"]
diff --git a/tests/datamanager_tests.py b/tests/datamanager_tests.py
new file mode 100644
index 0000000..57b1e36
--- /dev/null
+++ b/tests/datamanager_tests.py
@@ -0,0 +1,78 @@
+import unittest
+
+import numpy as np
+
+from modAL.datamanager import DataManager
+
+
+def first_true(ar: np.ndarray):
+    """Return the index of the first True value in a boolean array."""
+    return ar.nonzero()[0][0]
+
+
+class TestAddLabels(unittest.TestCase):
+    def test_that_when_the_first_add_is_at_0_it_updates_correctly(self):
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        self.assertEqual(features.shape, (10, 10))
+        manager = DataManager(features=features)
+        manager.add_labels([(0, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 0)
+        # The index of the first unlabeled example is one past the first labeled one
+        self.assertEqual(first_true(manager.unlabeled_mask), 1)
+
+    def test_add_to_first_continuously(self):
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        self.assertEqual(features.shape, (10, 10))
+        manager = DataManager(features=features)
+        manager.add_labels([(0, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 0)
+        self.assertEqual(first_true(manager.unlabeled_mask), 1)
+
+        manager.add_labels([(0, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 0)
+        self.assertEqual(first_true(manager.unlabeled_mask), 2)
+
+        manager.add_labels([(0, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 0)
+        self.assertEqual(first_true(manager.unlabeled_mask), 3)
+
+    def test_adding_in_the_middle(self):
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        self.assertEqual(features.shape, (10, 10))
+        manager = DataManager(features=features)
+        manager.add_labels([(2, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 2)
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)
+
+    def test_adding_two_in_the_middle(self):
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        self.assertEqual(features.shape, (10, 10))
+        manager = DataManager(features=features)
+        manager.add_labels([(2, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 2)
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)
+
+        manager.add_labels([(1, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 1)
+        # We still haven't labeled the one at 0
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)
+
+    def test_adding_two_in_the_middle_and_then_at_0(self):
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        self.assertEqual(features.shape, (10, 10))
+        manager = DataManager(features=features)
+        manager.add_labels([(2, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 2)
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)
+
+        manager.add_labels([(1, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 1)
+        # We still haven't labeled the one at 0
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)
+
+        manager.add_labels([(0, 1)])
+        self.assertEqual(first_true(manager.labeled_mask), 0)
+        # We labeled 0, 1 and 2, so the next unlabeled one should be 3
+        self.assertEqual(first_true(manager.unlabeled_mask), 3)
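+
+    def test_adding_with_indices_relative_to_the_original_data(self):
+        # An extra check, not in the original suite: with offset_to_unlabeled=False,
+        # add_labels treats indices as already relative to the original dataset
+        features = np.array([[x + y for x in range(10)] for y in range(10)])
+        manager = DataManager(features=features)
+        manager.add_labels([(2, 1)])
+        manager.add_labels([(5, 1)], offset_to_unlabeled=False)
+        self.assertTrue(manager.labeled_mask[5])
+        self.assertEqual(first_true(manager.unlabeled_mask), 0)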
+
+
+if __name__ == '__main__':
+    unittest.main()
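+
+# One way to run this suite (from the repository root; the exact invocation
+# depends on your setup):
+#     python -m unittest tests.datamanager_tests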