diff --git a/notebooks/local_outlier_factor.ipynb b/notebooks/local_outlier_factor.ipynb
new file mode 100644
index 0000000..546f894
--- /dev/null
+++ b/notebooks/local_outlier_factor.ipynb
@@ -0,0 +1,713 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Local outlier factor - Gabbar"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "%config InlineBackend.figure_format = 'retina'\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.externals import joblib\n",
+ "from sklearn.neighbors import NearestNeighbors\n",
+ "\n",
+ "from sklearn.metrics import classification_report\n",
+ "from sklearn.metrics import confusion_matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.precision', 2)\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "random_state = 5\n",
+ "cv = 10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id', 'feature_type']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "labelled_path = '../downloads/anomaly-detection/labelled/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2152, 15)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " changeset_id | \n",
+ " changeset_harmful | \n",
+ " feature_id | \n",
+ " feature_type | \n",
+ " action_create | \n",
+ " action_modify | \n",
+ " action_delete | \n",
+ " feature_version | \n",
+ " highway_tag_created | \n",
+ " highway_tag_deleted | \n",
+ " highway_value_difference | \n",
+ " primary_tags_difference | \n",
+ " area_of_feature_bbox | \n",
+ " length_of_longest_segment | \n",
+ " feature_name_touched | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 819 | \n",
+ " 47509417 | \n",
+ " 1 | \n",
+ " 173213864 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3074.38 | \n",
+ " 8.08e-02 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1957 | \n",
+ " 46428224 | \n",
+ " 1 | \n",
+ " 28084835 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 81860.36 | \n",
+ " 7.76e-02 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 48793277 | \n",
+ " 1 | \n",
+ " 493973632 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 83641.75 | \n",
+ " 2.29e-01 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 616 | \n",
+ " 47531648 | \n",
+ " 1 | \n",
+ " 4409709 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9303.95 | \n",
+ " 1.25e-01 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1563 | \n",
+ " 47303259 | \n",
+ " 1 | \n",
+ " 39560764 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1216.48 | \n",
+ " 7.95e-02 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1340 | \n",
+ " 47375444 | \n",
+ " 1 | \n",
+ " 147200518 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5954.32 | \n",
+ " 0.00e+00 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1645 | \n",
+ " 47271076 | \n",
+ " 1 | \n",
+ " 235175103 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 27.42 | \n",
+ " 8.60e-03 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1507 | \n",
+ " 47307997 | \n",
+ " 1 | \n",
+ " 332091899 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37047.15 | \n",
+ " 2.74e-01 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 816 | \n",
+ " 47509435 | \n",
+ " 1 | \n",
+ " 54378535 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 160485.22 | \n",
+ " 1.91e-01 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 930 | \n",
+ " 47480390 | \n",
+ " 1 | \n",
+ " 39473445 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 616.93 | \n",
+ " 2.54e-02 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " changeset_id changeset_harmful feature_id feature_type action_create \\\n",
+ "819 47509417 1 173213864 way 0 \n",
+ "1957 46428224 1 28084835 way 0 \n",
+ "22 48793277 1 493973632 way 0 \n",
+ "616 47531648 1 4409709 way 0 \n",
+ "1563 47303259 1 39560764 way 0 \n",
+ "1340 47375444 1 147200518 way 0 \n",
+ "1645 47271076 1 235175103 way 0 \n",
+ "1507 47307997 1 332091899 way 0 \n",
+ "816 47509435 1 54378535 way 0 \n",
+ "930 47480390 1 39473445 way 0 \n",
+ "\n",
+ " action_modify action_delete feature_version highway_tag_created \\\n",
+ "819 1 0 5 0 \n",
+ "1957 1 0 7 0 \n",
+ "22 1 0 2 0 \n",
+ "616 1 0 9 0 \n",
+ "1563 1 0 5 0 \n",
+ "1340 1 0 3 0 \n",
+ "1645 1 0 6 0 \n",
+ "1507 1 0 2 0 \n",
+ "816 1 0 7 0 \n",
+ "930 1 0 6 0 \n",
+ "\n",
+ " highway_tag_deleted highway_value_difference primary_tags_difference \\\n",
+ "819 0 0 0 \n",
+ "1957 0 0 0 \n",
+ "22 0 0 0 \n",
+ "616 0 0 0 \n",
+ "1563 0 0 0 \n",
+ "1340 0 0 0 \n",
+ "1645 0 0 0 \n",
+ "1507 0 0 0 \n",
+ "816 0 0 0 \n",
+ "930 0 0 0 \n",
+ "\n",
+ " area_of_feature_bbox length_of_longest_segment feature_name_touched \n",
+ "819 3074.38 8.08e-02 0 \n",
+ "1957 81860.36 7.76e-02 0 \n",
+ "22 83641.75 2.29e-01 0 \n",
+ "616 9303.95 1.25e-01 0 \n",
+ "1563 1216.48 7.95e-02 0 \n",
+ "1340 5954.32 0.00e+00 0 \n",
+ "1645 27.42 8.60e-03 0 \n",
+ "1507 37047.15 2.74e-01 0 \n",
+ "816 160485.22 1.91e-01 0 \n",
+ "930 616.93 2.54e-02 0 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labelled = pd.read_csv(labelled_path + 'attributes.csv')\n",
+ "print(labelled.shape)\n",
+ "\n",
+ "# Shuffle the rows into a random but reproducible order (seeded by random_state).\n",
+ "labelled = labelled.sample(labelled.shape[0], random_state=random_state)\n",
+ "labelled.sample(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Shape before dropping duplicates: (2152, 15)\n",
+ "Shape after dropping duplicates: (2152, 15)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Drop rows that repeat a (changeset_id, feature_id) pair, keeping the first occurrence.\n",
+ "print('Shape before dropping duplicates: {}'.format(labelled.shape))\n",
+ "labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])\n",
+ "print('Shape after dropping duplicates: {}'.format(labelled.shape))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total inliers: (2099, 15)\n",
+ "Total outliers: (53, 15)\n"
+ ]
+ }
+ ],
+ "source": [
+ "inliers = labelled[labelled['changeset_harmful'] == 1]\n",
+ "print('Total inliers: {}'.format(inliers.shape))\n",
+ "\n",
+ "outliers = labelled[labelled['changeset_harmful'] == -1]\n",
+ "print('Total outliers: {}'.format(outliers.shape))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n",
+ " metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = NearestNeighbors()\n",
+ "model.fit(labelled.drop(non_training_attributes, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Outliers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " changeset_id | \n",
+ " changeset_harmful | \n",
+ " feature_id | \n",
+ " feature_type | \n",
+ " action_create | \n",
+ " action_modify | \n",
+ " action_delete | \n",
+ " feature_version | \n",
+ " highway_tag_created | \n",
+ " highway_tag_deleted | \n",
+ " highway_value_difference | \n",
+ " primary_tags_difference | \n",
+ " area_of_feature_bbox | \n",
+ " length_of_longest_segment | \n",
+ " feature_name_touched | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2140 | \n",
+ " 44929925 | \n",
+ " -1 | \n",
+ " 21868906 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 16 | \n",
+ " 0 | \n",
+ " 1.08e+03 | \n",
+ " 0.14 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 126 | \n",
+ " 48255854 | \n",
+ " -1 | \n",
+ " 490323405 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 9.48e+04 | \n",
+ " 0.05 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2130 | \n",
+ " 45018087 | \n",
+ " -1 | \n",
+ " 247378462 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 15 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 11 | \n",
+ " 0 | \n",
+ " 5.19e+06 | \n",
+ " 0.08 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2131 | \n",
+ " 45017819 | \n",
+ " -1 | \n",
+ " 456458523 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1.01e+07 | \n",
+ " 0.13 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " 48388526 | \n",
+ " -1 | \n",
+ " 491268465 | \n",
+ " way | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " 0 | \n",
+ " 1.72e+04 | \n",
+ " 0.13 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " changeset_id changeset_harmful feature_id feature_type action_create \\\n",
+ "2140 44929925 -1 21868906 way 0 \n",
+ "126 48255854 -1 490323405 way 0 \n",
+ "2130 45018087 -1 247378462 way 0 \n",
+ "2131 45017819 -1 456458523 way 0 \n",
+ "73 48388526 -1 491268465 way 0 \n",
+ "\n",
+ " action_modify action_delete feature_version highway_tag_created \\\n",
+ "2140 1 0 5 0 \n",
+ "126 1 0 2 0 \n",
+ "2130 1 0 15 0 \n",
+ "2131 1 0 4 0 \n",
+ "73 1 0 2 0 \n",
+ "\n",
+ " highway_tag_deleted highway_value_difference primary_tags_difference \\\n",
+ "2140 0 16 0 \n",
+ "126 0 0 1 \n",
+ "2130 0 11 0 \n",
+ "2131 0 1 0 \n",
+ "73 0 -1 0 \n",
+ "\n",
+ " area_of_feature_bbox length_of_longest_segment feature_name_touched \n",
+ "2140 1.08e+03 0.14 0 \n",
+ "126 9.48e+04 0.05 0 \n",
+ "2130 5.19e+06 0.08 0 \n",
+ "2131 1.01e+07 0.13 0 \n",
+ "73 1.72e+04 0.13 0 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labelled[labelled.changeset_harmful == -1].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([[ 0. , 16.01992315, 16.05116334, 17.69844641,\n",
+ " 18.80077688]]), array([[ 114, 1447, 1441, 1986, 1661]]))"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.kneighbors(labelled[labelled.changeset_id == 44929925].drop(non_training_attributes, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "changeset_id 47432011\n",
+ "changeset_harmful 1\n",
+ "feature_id 343328885\n",
+ "feature_type way\n",
+ "action_create 0\n",
+ "action_modify 1\n",
+ "action_delete 0\n",
+ "feature_version 3\n",
+ "highway_tag_created 0\n",
+ "highway_tag_deleted 0\n",
+ "highway_value_difference 0\n",
+ "primary_tags_difference 0\n",
+ "area_of_feature_bbox 1.1e+03\n",
+ "length_of_longest_segment 0.038\n",
+ "feature_name_touched 0\n",
+ "Name: 1071, dtype: object"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labelled.iloc[1661]"
+ ]
+ }
+ ],
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python [conda root]",
+ "language": "python",
+ "name": "conda-root-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}