Skip to content

Commit

Permalink
Merge pull request #89 from mapbox/anomaly-detection
Browse files Browse the repository at this point in the history
Anomaly detection
  • Loading branch information
bkowshik authored Jul 10, 2017
2 parents 7d1e637 + 57f5194 commit b304270
Show file tree
Hide file tree
Showing 40 changed files with 66,789 additions and 8,145 deletions.
103 changes: 16 additions & 87 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,101 +40,30 @@ npm install


```bash
# Get a prediction for a changeset.
gabbar 47734592
# A prediction of "-1" represents that this feature is an anomaly (outlier).
gabbar 49172351
[
{
"attributes": {
"action_create": 0,
"action_delete": 0,
"action_modify": 1,
"feature_version": 15,
"geometry_kinks": 0,
"geometry_line_distance": 0.619,
"geometry_type_node": 0,
"geometry_type_relation": 0,
"geometry_type_way": 1,
"new_barrier=yes": 0,
"new_bicycle=no": 0,
"new_bicycle=yes": 0,
"new_bridge=yes": 0,
"new_construction=motorway": 0,
"new_foot=yes": 0,
"new_footway=sidewalk": 0,
"new_highway=coastline": 0,
"new_highway=footway": 0,
"new_highway=living_street": 0,
"new_highway=motorway": 0,
"new_highway=path": 0,
"new_highway=primary": 1,
"new_highway=road": 0,
"new_highway=secondary": 0,
"new_highway=service": 0,
"new_highway=tertiary": 0,
"new_highway=track": 0,
"new_highway=unclassified": 0,
"new_horse=no": 0,
"new_horse=yes": 0,
"new_landuse=cemetery": 0,
"new_landuse=footway": 0,
"new_landuse=forest": 0,
"new_landuse=grass": 0,
"new_landuse=recreation_ground": 0,
"new_landuse_1=park": 0,
"new_landuse_1=recreation_ground": 0,
"new_landuse_2=festival area": 0,
"new_landuse_3=dog park": 0,
"new_landuse_3=recreation": 0,
"new_landuse_4=recreation_ground": 0,
"new_landuse_5=water_park": 0,
"new_lanes=2": 0,
"new_layer=1": 0,
"new_leisure=park": 0,
"new_lit=yes": 0,
"new_maxspeed=50": 0,
"new_maxspeed=8": 0,
"new_natural=footway": 0,
"new_natural=tree_row": 0,
"new_noname=yes": 0,
"new_oneway=no": 0,
"new_park=yes": 0,
"new_surface=asphalt": 0,
"new_surface=dirt": 0,
"new_surface=gravel": 0,
"new_surface=unpaved": 0,
"new_surface_1=asphalt": 0,
"new_surface_1=ground": 0,
"new_surface_2=unpaved": 0,
"new_surface_2=wood": 0,
"new_tracktype=grade3": 0,
"new_user_mapping_days": 0,
"new_waterway=river": 0,
"new_waterway=stream": 0,
"old_construction=path": 0,
"old_embankment=yes": 0,
"old_highway=construction": 0,
"old_highway=footway": 0,
"old_highway=path": 0,
"old_highway=pedestrian": 0,
"old_highway=primary": 0,
"old_highway=residential": 0,
"old_highway=service": 0,
"old_highway=tertiary": 0,
"old_highway=unclassified": 0,
"old_lit=no": 0,
"old_maxspeed=30": 0,
"old_natural=coastline": 0,
"old_oneway=yes": 0,
"old_park=paseo": 0,
"old_user_mapping_days": 0,
"old_width=0": 0
"area_of_feature_bbox": 109591.9146,
"feature_name_touched": 0,
"feature_version": 17,
"highway_tag_created": 41,
"highway_tag_deleted": 0,
"highway_value_difference": 0,
"length_of_longest_segment": 0.1577,
"primary_tags_difference": 1
},
"changeset_id": "49626684",
"feature_id": "17166500",
"changeset_id": "49172351",
"feature_id": "124863896",
"feature_type": "way",
"prediction": 0,
"timestamp": "2017-06-30 16:18:07.965246",
"version": 0.5
"prediction": -1,
"score": -0.1493,
"timestamp": "2017-07-10 10:33:02.925012",
"version": "0.6.2"
}
]
```
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.5.1
0.6.2
5 changes: 2 additions & 3 deletions data/feature-classifier/download-osmcha.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ if (argv.help) {
}

// URL to download checked changesets with one feature modification.
let url = 'https://osmcha.mapbox.com/api/v1/changesets/?page_size=500&checked=1&create__gte=0&create__lte=0&modify__gte=1&modify__lte=1&delete__gte=0&delete__lte=0&format=json'

let url = 'https://osmcha.mapbox.com/api/v1/changesets/?page_size=500&format=json&checked=1&date__gte=2017-01-01'

function download(url, callback) {
process.stderr.write(url + '\n');
Expand All @@ -37,7 +36,7 @@ function download(url, callback) {

let q = queue(1);
console.log('changeset_id,harmful');
for (var i = 1; i < 100; i++) {
for (var i = 1; i < 1000; i++) {
let pageURL = url + '&page=' + i;
q.defer(download, pageURL);
}
Expand Down
27 changes: 23 additions & 4 deletions datasets/analyze-predictions.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ console.log('Total predictions: ' + predictions.length);

let harmfuls = [], goods = [];
for (let prediction of predictions) {
if (prediction === '.DS_Store') continue;

prediction = JSON.parse(fs.readFileSync(path.join(argv.predictionsDir, prediction)));

if (prediction == 1) harmfuls.push(prediction)
else goods.push(prediction)
if (prediction.prediction === -1) harmfuls.push(prediction);
else goods.push(prediction);
}
console.log('Features predicted good: ' + goods.length);
console.log('Features predicted harmful: ' + harmfuls.length);
Expand All @@ -46,14 +48,31 @@ console.log('\nChangesets predicted good ...');
let goodChangesets = new Set([]);
while (true) {
// To review a sample of 25 changesets.
if (goodChangesets.size > 25) break;
if ((goodChangesets.size >= goods.length) || (goodChangesets.size > 25)) break;

// Randomly select a good changeset.
let good = goods[getRandomIntInclusive(0, goods.length)];
let good = goods[getRandomIntInclusive(0, goods.length - 1)];

// If changeset is already seen, skip.
if (goodChangesets.has(good.changeset_id)) continue;

console.log(OSMCHA_URL + good.changeset_id + '/' + '\t' + getFeatureHash(good) + '.json');
goodChangesets.add(good.changeset_id);
}


console.log('\nChangesets predicted harmful ...');
const harmfulChangesets = new Set([]);

// Visit the harmful features in random order instead of drawing random indexes
// forever. Several features can belong to the same changeset, so the number of
// unique changesets may never reach harmfuls.length; drawing with replacement
// until it does would loop without end. A shuffled pass always terminates.
// NOTE(review): relies on getRandomIntInclusive(min, max) returning an integer
// in [min, max], as its use elsewhere in this script suggests.
const shuffledHarmfuls = harmfuls.slice();
for (let i = shuffledHarmfuls.length - 1; i > 0; i--) {
    const j = getRandomIntInclusive(0, i);
    const swapped = shuffledHarmfuls[i];
    shuffledHarmfuls[i] = shuffledHarmfuls[j];
    shuffledHarmfuls[j] = swapped;
}

for (const harmful of shuffledHarmfuls) {
    // Stop once more than 25 changesets have been listed for review
    // (same cap as the loop over good changesets).
    if (harmfulChangesets.size > 25) break;

    // If changeset is already seen, skip.
    if (harmfulChangesets.has(harmful.changeset_id)) continue;

    console.log(OSMCHA_URL + harmful.changeset_id + '/' + '\t' + getFeatureHash(harmful) + '.json');
    harmfulChangesets.add(harmful.changeset_id);
}
154 changes: 154 additions & 0 deletions datasets/anomaly-detection.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
'use strict';

const argv = require('minimist')(process.argv.slice(2));
const fs = require('fs');
const path = require('path');
const csv = require('csv');
const turf = require('@turf/turf');
const realChangesetToChangeset = require('../gabbar/utilities/real-changeset').realChangesetToChangeset;
const getSamples = require('../gabbar/filters/highway').getSamples;
const simpleStatistics = require('simple-statistics');

const featureAttributes = require('../gabbar/attributes/feature');
const highwayAttributes = require('../gabbar/attributes/highway');
const userAttributes = require('../gabbar/attributes/user');
const userDatasources = require('../gabbar/datasources/user');


// Print usage and stop when any of the three required options is missing.
// The usage line names this script (anomaly-detection.js); it previously
// showed the name of a different script.
if (!argv.changesets || !argv.realChangesetsDir || !argv.userDetailsDir) {
    console.log('');
    console.log('USAGE: node anomaly-detection.js OPTIONS');
    console.log('');
    console.log(' OPTIONS');
    console.log(' --changesets changesets.csv Dump of changesets from osmcha');
    console.log(' --realChangesetsDir real-changesets/ Directory with real changesets');
    console.log(' --userDetailsDir user-details/ Directory with user details');
    console.log('');
    process.exit(0);
}

// Build the anomaly-detection dataset: for every changeset listed in the
// --changesets CSV (column 0: changeset ID, column 1: harmful flag), load its
// real changeset from --realChangesetsDir, take the samples returned by
// getSamples(), and print one CSV row of attributes per sample to stdout.
csv.parse(fs.readFileSync(argv.changesets), (error, rows) => {
    // Surface the parser's own error; otherwise `rows` is undefined and the
    // loop below fails with an unrelated "not iterable" TypeError.
    if (error) throw error;

    const header = [
        'changeset_id',
        'changeset_harmful',
        'feature_id',
        'feature_type',
        'action_create',
        'action_modify',
        'action_delete',
        'feature_version',
        'highway_tag_created',
        'highway_tag_deleted',
        'highway_value_difference',
        'primary_tags_difference',
        'area_of_feature_bbox',
        'length_of_longest_segment',
        'feature_name_touched',
        // 'geometry_distance_between_versions',
        // 'old_geometry_line_distance',
        // 'old_geometry_number_of_nodes',
        // 'old_geometry_kinks',
        // 'old_geometry_area',
        // 'new_geometry_line_distance',
        // 'new_geometry_number_of_nodes',
        // 'new_geometry_kinks',
        // 'new_geometry_area',
        // 'old_user_mapping_days',
        // 'new_user_mapping_days',
        // 'difference_user_mapping_days',
        // 'ratio_user_mapping_days',
        // 'old_number_of_tags',
        // 'new_number_of_tags',
        // 'difference_number_of_tags',
        // 'ratio_number_of_tags',
        // 'old_tags',
        // 'new_tags',
    ];
    // for (let item of featureAttributes.getPrimaryTags()) header.push(item);

    // First row is the header; every later row must list its values in the
    // same order as `header`.
    const attributes = [];
    attributes.push(header);

    const seenChangesets = new Set([]);
    for (const row of rows) {
        const changesetID = row[0];

        // Checking for duplicate changesets.
        if (seenChangesets.has(changesetID)) continue;
        seenChangesets.add(changesetID);

        let realChangeset;
        try {
            realChangeset = JSON.parse(fs.readFileSync(path.join(argv.realChangesetsDir, changesetID + '.json')));
        } catch (readError) {
            // A missing real changeset file is expected and skipped silently.
            // Any other failure (unreadable file, malformed JSON) is still
            // skipped, but reported on stderr so it does not go unnoticed;
            // stdout carries only the CSV.
            if (readError.code !== 'ENOENT') {
                process.stderr.write('Skipping changeset ' + changesetID + ': ' + readError.message + '\n');
            }
            continue;
        }
        const changeset = realChangesetToChangeset(realChangeset);

        let harmful = row[1];
        // Inliers are labeled 1, while outliers are labeled -1.
        // Anything other than 'true' / 'false' is left unlabeled (empty).
        if (harmful === 'true') harmful = -1;
        else if (harmful === 'false') harmful = 1;
        else harmful = '';

        const samples = getSamples(changeset);
        for (const sample of samples) {
            const newVersion = sample[0];
            const oldVersion = sample[1];

            // NOTE(review): the user details are only consumed by the
            // commented-out user attributes below; the lookups are kept so
            // the script's behavior is unchanged and re-enabling those
            // attributes keeps working.
            const newUsername = featureAttributes.getUsername(newVersion);
            const newUserDetails = userDatasources.getUserDetails(newUsername, argv.userDetailsDir);

            const oldUsername = featureAttributes.getUsername(oldVersion);
            const oldUserDetails = userDatasources.getUserDetails(oldUsername, argv.userDetailsDir);

            // Skipping changesets from user labelled due to user_block.
            if (newUsername === 'chinakz') continue;
            if (oldUsername === 'chinakz') continue;

            // Looked up once and expanded into three 0/1 indicator columns.
            const action = featureAttributes.getAction(newVersion);

            const sampleAttributes = [
                changesetID,
                harmful,
                featureAttributes.getFeatureID(newVersion),
                featureAttributes.getGeometryType(newVersion),
                action === 'create' ? 1 : 0,
                action === 'modify' ? 1 : 0,
                action === 'delete' ? 1 : 0,
                featureAttributes.getFeatureVersion(newVersion),
                highwayAttributes.isHighwayTagCreated(newVersion, oldVersion),
                highwayAttributes.isHighwayTagDeleted(newVersion, oldVersion),
                highwayAttributes.getHighwayValueDifference(newVersion, oldVersion),
                simpleStatistics.sumSimple(featureAttributes.getPrimaryTagCount(newVersion)) - simpleStatistics.sumSimple(featureAttributes.getPrimaryTagCount(oldVersion)),
                featureAttributes.getBBOXArea(newVersion),
                featureAttributes.getLengthOfLongestSegment(newVersion),
                featureAttributes.isNameTouched(newVersion, oldVersion),
                // featureAttributes.getDistanceBetweenVersions(newVersion, oldVersion),
                // featureAttributes.getLineDistance(oldVersion),
                // featureAttributes.getNumberOfNodes(oldVersion),
                // featureAttributes.getKinks(oldVersion).length,
                // featureAttributes.getArea(oldVersion),
                // featureAttributes.getLineDistance(newVersion),
                // featureAttributes.getNumberOfNodes(newVersion),
                // featureAttributes.getKinks(newVersion).length,
                // featureAttributes.getArea(newVersion),
                // userAttributes.getMappingDays(oldUserDetails),
                // userAttributes.getMappingDays(newUserDetails),
                // userAttributes.getMappingDays(newUserDetails) - userAttributes.getMappingDays(oldUserDetails),
                // userAttributes.getMappingDays(oldUserDetails) ? userAttributes.getMappingDays(newUserDetails) / userAttributes.getMappingDays(oldUserDetails) : 0,
                // featureAttributes.getNumberOfTags(oldVersion),
                // featureAttributes.getNumberOfTags(newVersion),
                // featureAttributes.getNumberOfTags(newVersion) - featureAttributes.getNumberOfTags(oldVersion),
                // featureAttributes.getNumberOfTags(oldVersion) ? featureAttributes.getNumberOfTags(newVersion) / featureAttributes.getNumberOfTags(oldVersion) : 0,
                // highwayAttributes.tagsToString(oldVersion, newVersion),
                // highwayAttributes.tagsToString(newVersion, oldVersion),
            ];
            // for (let item of featureAttributes.getPrimaryTagCount(newVersion)) sampleAttributes.push(item);
            attributes.push(sampleAttributes);
        }
    }
    csv.stringify(attributes, (stringifyError, asString) => {
        // Without this check a serialization failure would print "undefined".
        if (stringifyError) throw stringifyError;
        console.log(asString);
    });
});
38 changes: 38 additions & 0 deletions datasets/extract-feature.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
'use strict';

const argv = require('minimist')(process.argv.slice(2));
const fs = require('fs');
const path = require('path');
const turf = require('@turf/turf');

const rcUtilities = require('../gabbar/utilities/real-changeset');
const cUtilities = require('../gabbar/utilities/changeset');


// Print usage and stop when any of the four required options is missing.
// The usage line names this script (extract-feature.js); it previously
// showed the name of a different script.
if (!argv.realChangesetsDir || !argv.changesetID || !argv.featureType || !argv.featureID) {
    console.log('');
    console.log('USAGE: node extract-feature.js OPTIONS');
    console.log('');
    console.log(' OPTIONS');
    console.log(' --realChangesetsDir real-changesets/');
    console.log(' --changesetID ID of changeset (Ex: 135233)');
    console.log(' --featureType node | way | relation');
    console.log(' --featureID ID of feature (Ex: 234334)');
    console.log('');
    process.exit(0);
}

// Load the requested real changeset and convert it with
// realChangesetToChangeset() before searching it.
const filepath = path.join(argv.realChangesetsDir, argv.changesetID + '.json');
const realChangeset = JSON.parse(fs.readFileSync(filepath));
const changeset = rcUtilities.realChangesetToChangeset(realChangeset);

// minimist converts numeric-looking option values to numbers, so both sides
// are compared as strings to match a feature whether its id is stored as a
// number or as a string.
const wantedType = String(argv.featureType);
const wantedID = String(argv.featureID);

// Each entry is a [newVersion, oldVersion] pair; the lookup is done on the
// new version's properties.
const versions = cUtilities.getAllFeatures(changeset);
for (const version of versions) {
    const newVersion = version[0];

    if ((String(newVersion.properties.type) === wantedType) && (String(newVersion.properties.id) === wantedID)) {
        // Print both versions of the first match as one FeatureCollection and stop.
        console.log(JSON.stringify(turf.featureCollection(version), null, 4));
        process.exit(0);
    }
}

// Nothing matched: say so on stderr instead of ending without any output.
// stdout and the exit status are left as they were.
process.stderr.write('Feature ' + wantedType + '/' + wantedID + ' not found in changeset ' + argv.changesetID + '\n');
Loading

0 comments on commit b304270

Please sign in to comment.