diff --git a/notebooks/local_outlier_factor.ipynb b/notebooks/local_outlier_factor.ipynb new file mode 100644 index 0000000..546f894 --- /dev/null +++ b/notebooks/local_outlier_factor.ipynb @@ -0,0 +1,713 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Local outlier factor - Gabbar" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "%config InlineBackend.figure_format = 'retina'\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.externals import joblib\n", + "from sklearn.neighbors import NearestNeighbors\n", + "\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pd.set_option('display.precision', 2)\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "random_state = 5\n", + "cv = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id', 'feature_type']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + 
"outputs": [], + "source": [ + "labelled_path = '../downloads/anomaly-detection/labelled/'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2152, 15)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
changeset_idchangeset_harmfulfeature_idfeature_typeaction_createaction_modifyaction_deletefeature_versionhighway_tag_createdhighway_tag_deletedhighway_value_differenceprimary_tags_differencearea_of_feature_bboxlength_of_longest_segmentfeature_name_touched
819475094171173213864way010500003074.388.08e-020
195746428224128084835way0107000081860.367.76e-020
22487932771493973632way0102000083641.752.29e-010
6164753164814409709way010900009303.951.25e-010
156347303259139560764way010500001216.487.95e-020
1340473754441147200518way010300005954.320.00e+000
1645472710761235175103way0106000027.428.60e-030
1507473079971332091899way0102000037047.152.74e-010
81647509435154378535way01070000160485.221.91e-010
93047480390139473445way01060000616.932.54e-020
\n", + "
" + ], + "text/plain": [ + " changeset_id changeset_harmful feature_id feature_type action_create \\\n", + "819 47509417 1 173213864 way 0 \n", + "1957 46428224 1 28084835 way 0 \n", + "22 48793277 1 493973632 way 0 \n", + "616 47531648 1 4409709 way 0 \n", + "1563 47303259 1 39560764 way 0 \n", + "1340 47375444 1 147200518 way 0 \n", + "1645 47271076 1 235175103 way 0 \n", + "1507 47307997 1 332091899 way 0 \n", + "816 47509435 1 54378535 way 0 \n", + "930 47480390 1 39473445 way 0 \n", + "\n", + " action_modify action_delete feature_version highway_tag_created \\\n", + "819 1 0 5 0 \n", + "1957 1 0 7 0 \n", + "22 1 0 2 0 \n", + "616 1 0 9 0 \n", + "1563 1 0 5 0 \n", + "1340 1 0 3 0 \n", + "1645 1 0 6 0 \n", + "1507 1 0 2 0 \n", + "816 1 0 7 0 \n", + "930 1 0 6 0 \n", + "\n", + " highway_tag_deleted highway_value_difference primary_tags_difference \\\n", + "819 0 0 0 \n", + "1957 0 0 0 \n", + "22 0 0 0 \n", + "616 0 0 0 \n", + "1563 0 0 0 \n", + "1340 0 0 0 \n", + "1645 0 0 0 \n", + "1507 0 0 0 \n", + "816 0 0 0 \n", + "930 0 0 0 \n", + "\n", + " area_of_feature_bbox length_of_longest_segment feature_name_touched \n", + "819 3074.38 8.08e-02 0 \n", + "1957 81860.36 7.76e-02 0 \n", + "22 83641.75 2.29e-01 0 \n", + "616 9303.95 1.25e-01 0 \n", + "1563 1216.48 7.95e-02 0 \n", + "1340 5954.32 0.00e+00 0 \n", + "1645 27.42 8.60e-03 0 \n", + "1507 37047.15 2.74e-01 0 \n", + "816 160485.22 1.91e-01 0 \n", + "930 616.93 2.54e-02 0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labelled = pd.read_csv(labelled_path + 'attributes.csv')\n", + "print(labelled.shape)\n", + "\n", + "# Sort the dataset randomly.\n", + "labelled = labelled.sample(labelled.shape[0], random_state=random_state)\n", + "labelled.sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape before dropping duplicates: (2152, 
15)\n", + "Shape after dropping duplicates: (2152, 15)\n" + ] + } + ], + "source": [ + "# Drop all duplicate samples.\n", + "print('Shape before dropping duplicates: {}'.format(labelled.shape))\n", + "labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])\n", + "print('Shape after dropping duplicates: {}'.format(labelled.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total inliers: (2099, 15)\n", + "Total outliers: (53, 15)\n" + ] + } + ], + "source": [ + "inliers = labelled[labelled['changeset_harmful'] == 1]\n", + "print('Total inliers: {}'.format(inliers.shape))\n", + "\n", + "outliers = labelled[labelled['changeset_harmful'] == -1]\n", + "print('Total outliers: {}'.format(outliers.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model training" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n", + " metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = NearestNeighbors()\n", + "model.fit(labelled.drop(non_training_attributes, axis=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
changeset_idchangeset_harmfulfeature_idfeature_typeaction_createaction_modifyaction_deletefeature_versionhighway_tag_createdhighway_tag_deletedhighway_value_differenceprimary_tags_differencearea_of_feature_bboxlength_of_longest_segmentfeature_name_touched
214044929925-121868906way0105001601.08e+030.140
12648255854-1490323405way010200019.48e+040.050
213045018087-1247378462way01015001105.19e+060.080
213145017819-1456458523way010400101.01e+070.130
7348388526-1491268465way010200-101.72e+040.130
\n", + "
" + ], + "text/plain": [ + " changeset_id changeset_harmful feature_id feature_type action_create \\\n", + "2140 44929925 -1 21868906 way 0 \n", + "126 48255854 -1 490323405 way 0 \n", + "2130 45018087 -1 247378462 way 0 \n", + "2131 45017819 -1 456458523 way 0 \n", + "73 48388526 -1 491268465 way 0 \n", + "\n", + " action_modify action_delete feature_version highway_tag_created \\\n", + "2140 1 0 5 0 \n", + "126 1 0 2 0 \n", + "2130 1 0 15 0 \n", + "2131 1 0 4 0 \n", + "73 1 0 2 0 \n", + "\n", + " highway_tag_deleted highway_value_difference primary_tags_difference \\\n", + "2140 0 16 0 \n", + "126 0 0 1 \n", + "2130 0 11 0 \n", + "2131 0 1 0 \n", + "73 0 -1 0 \n", + "\n", + " area_of_feature_bbox length_of_longest_segment feature_name_touched \n", + "2140 1.08e+03 0.14 0 \n", + "126 9.48e+04 0.05 0 \n", + "2130 5.19e+06 0.08 0 \n", + "2131 1.01e+07 0.13 0 \n", + "73 1.72e+04 0.13 0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labelled[labelled.changeset_harmful == -1].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[ 0. 
, 16.01992315, 16.05116334, 17.69844641,\n", + " 18.80077688]]), array([[ 114, 1447, 1441, 1986, 1661]]))" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.kneighbors(labelled[labelled.changeset_id == 44929925].drop(non_training_attributes, axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "changeset_id 47432011\n", + "changeset_harmful 1\n", + "feature_id 343328885\n", + "feature_type way\n", + "action_create 0\n", + "action_modify 1\n", + "action_delete 0\n", + "feature_version 3\n", + "highway_tag_created 0\n", + "highway_tag_deleted 0\n", + "highway_value_difference 0\n", + "primary_tags_difference 0\n", + "area_of_feature_bbox 1.1e+03\n", + "length_of_longest_segment 0.038\n", + "feature_name_touched 0\n", + "Name: 1071, dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labelled.iloc[1661]" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}