Commit 96e74da

aiegorov and ahans committed
Switch to grid-based neighbor search and parallelize
This changes the neighbor search to use a grid-based indexing scheme instead of the nanoflann-based kd-tree. We reorder the input points into eps-by-eps cells. This way we can directly access the points of such a cell and only have to query a cell and its 8-neighborhood to find all of a point's potential neighbors. For most datasets this is faster than first building up the kd-tree and then querying it. We further change the algorithm to require only a single pass over the data: for each point we find its eps-neighborhood and, if it has enough neighbors, record it as a potential core point for every point in that neighborhood. We remember up to three potential core points per point. In the next step, every point that has at least one potential core point is labeled with its own index (it is guaranteed to be part of a cluster). In the last step, we merge the clusters iteratively until convergence.

Co-authored-by: Alexander Hans <[email protected]>
1 parent 21d81d1 commit 96e74da
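To make the scheme easier to follow, here is a minimal, self-contained sketch of the binning idea described in the commit message: points are bucketed into eps-by-eps grid cells with a counting sort, and a point's eps-neighborhood is then gathered from the 3x3 block of cells around its own cell. This is not the committed implementation; the names (neighbors_of, cell_of, Point) and the single-query structure are illustrative only.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

using Point = std::array<float, 2>;

// Return the indices of all points within eps of pts[query] (the query point itself excluded).
std::vector<std::uint32_t> neighbors_of(std::size_t const query, std::vector<Point> const& pts, float const eps)
{
    if (pts.empty()) {
        return {};
    }

    // Bounding box of the input defines the grid extent.
    Point lo = pts[0];
    Point hi = pts[0];
    for (auto const& p : pts) {
        lo[0] = std::min(lo[0], p[0]);
        lo[1] = std::min(lo[1], p[1]);
        hi[0] = std::max(hi[0], p[0]);
        hi[1] = std::max(hi[1], p[1]);
    }
    auto const nx = static_cast<std::int32_t>(std::ceil((hi[0] - lo[0]) / eps)) + 1;
    auto const ny = static_cast<std::int32_t>(std::ceil((hi[1] - lo[1]) / eps)) + 1;
    auto const cell_of = [&](Point const& p) {
        return std::pair{static_cast<std::int32_t>((p[0] - lo[0]) / eps),
                         static_cast<std::int32_t>((p[1] - lo[1]) / eps)};
    };

    // Counting sort: per-cell counts, exclusive prefix sum, then scatter the point ids.
    std::vector<std::uint32_t> counts(static_cast<std::size_t>(nx) * ny, 0);
    for (auto const& p : pts) {
        auto const [cx, cy] = cell_of(p);
        ++counts[cy * nx + cx];
    }
    std::vector<std::uint32_t> offsets(counts.size());
    std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0U);
    std::vector<std::uint32_t> order(pts.size());
    auto cursor = offsets;
    for (std::uint32_t i = 0; i < pts.size(); ++i) {
        auto const [cx, cy] = cell_of(pts[i]);
        order[cursor[cy * nx + cx]++] = i;
    }

    // Only the 3x3 block of cells around the query's cell can contain eps-neighbors.
    auto const [qx, qy] = cell_of(pts[query]);
    std::vector<std::uint32_t> result;
    for (std::int32_t cy = qy - 1; cy <= qy + 1; ++cy) {
        for (std::int32_t cx = qx - 1; cx <= qx + 1; ++cx) {
            if (cx < 0 || cy < 0 || cx >= nx || cy >= ny) {
                continue;
            }
            auto const cell = static_cast<std::size_t>(cy) * nx + cx;
            for (std::uint32_t k = 0; k < counts[cell]; ++k) {
                auto const j = order[offsets[cell] + k];
                if (j == query) {
                    continue;
                }
                float const dx = pts[j][0] - pts[query][0];
                float const dy = pts[j][1] - pts[query][1];
                if (dx * dx + dy * dy < eps * eps) {
                    result.push_back(j);
                }
            }
        }
    }
    return result;
}

In the committed version the counting sort is performed once for all points, the 3x3 cell scan runs inside an OpenMP-parallel loop, and the resulting neighbor lists feed the per-point core-point bookkeeping and the label-merging passes described above.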

File tree

6 files changed: +149 -104 lines changed


cpp/BUILD.bazel

Lines changed: 2 additions & 3 deletions
@@ -10,10 +10,9 @@ cc_library(
     name = "dbscan",
     srcs = ["dbscan.cpp"],
     hdrs = ["dbscan.hpp"],
+    copts = ["-fopenmp"],
+    linkopts = ["-fopenmp"],
     visibility = ["//visibility:public"],
-    deps = [
-        "@nanoflann",
-    ],
 )
 
 cc_binary(

cpp/dbscan.cpp

Lines changed: 136 additions & 88 deletions
@@ -1,133 +1,181 @@
 #include "cpp/dbscan.hpp"
 
-#include <nanoflann.hpp>
+#include <cmath>
+#include <numeric>
+#include <climits>
+#include <unordered_map>
 
 namespace dbscan {
 
-namespace {
-
-class PointsVectorAdaptor
-{
-public:
-    explicit PointsVectorAdaptor(std::vector<Dbscan::Point> const& points)
-        : points_{points}
-    {
-    }
-
-    std::size_t kdtree_get_point_count() const
-    {
-        return std::size(points_);
-    }
-
-    float kdtree_get_pt(std::size_t idx, int dim) const
-    {
-        return points_[idx][dim];
-    }
-
-    template <typename Bbox>
-    bool kdtree_get_bbox(Bbox&) const
-    {
-        return false;
-    }
-
-private:
-    std::vector<Dbscan::Point> const& points_;
-};
-
-}  // namespace
-
 Dbscan::Dbscan(float const eps, std::uint32_t const min_samples, std::size_t const num_points_hint)
-    : eps_squared_{eps * eps}
+    : eps_{eps}
+    , eps_squared_{square(eps)}
     , min_samples_{min_samples}
 {
     if (num_points_hint > 0) {
         labels_.reserve(num_points_hint);
         neighbors_.reserve(num_points_hint);
         visited_.reserve(num_points_hint);
         to_visit_.reserve(num_points_hint);
+        counts_.reserve(num_points_hint);
+        offsets_.reserve(num_points_hint);
     }
 }
 
 auto Dbscan::fit_predict(std::vector<Dbscan::Point> const& points) -> std::vector<Dbscan::Label>
 {
-    PointsVectorAdaptor adapter{points};
+    labels_.assign(std::size(points), undefined);
+    visited_.assign(std::size(points), false);
 
-    constexpr auto num_dims{2};
-    constexpr auto leaf_size{32};
-    nanoflann::
-        KDTreeSingleIndexAdaptor<nanoflann::L2_Adaptor<float, PointsVectorAdaptor>, PointsVectorAdaptor, num_dims>
-            points_kd_tree{num_dims, adapter, nanoflann::KDTreeSingleIndexAdaptorParams{leaf_size}};
-    points_kd_tree.buildIndex();
+    if (std::size(points) <= 1) {
+        return labels_;
+    }
 
-    nanoflann::SearchParams params{};
-    params.sorted = false;
+    // calculate min_max of the current point cloud
+    Dbscan::Point min{points[0]};
+    Dbscan::Point max{points[0]};
+    for (auto const& pt : points) {
+        min[0] = std::min(min[0], pt[0]);
+        min[1] = std::min(min[1], pt[1]);
+        max[0] = std::max(max[0], pt[0]);
+        max[1] = std::max(max[1], pt[1]);
+    }
 
-    labels_.assign(std::size(points), undefined);
+    // derive num_bins out of it
+    float const range_x{max[0] - min[0]};
+    float const range_y{max[1] - min[1]};
+    auto const num_bins_x{static_cast<std::uint32_t>(std::ceil(range_x / eps_))};
+    auto const num_bins_y{static_cast<std::uint32_t>(std::ceil(range_y / eps_))};
+
+    // count number of points in every bin
+    counts_.assign(num_bins_x * num_bins_y, 0);
+
+    // FIRST PASS OVER THE POINTS
+    for (auto const& pt : points) {
+        auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))};
+        auto const index{bin_y * num_bins_x + bin_x};
+        counts_[index] += 1;
+    }
 
-    Label cluster_count{0};
-    std::vector<Label> clusters{};
+    // calculate the offsets for each cell (bin)
+    offsets_.clear();
+    std::exclusive_scan(std::cbegin(counts_), std::cend(counts_), std::back_inserter(offsets_), 0);
+
+    // re-sorting the points (calculating index mapping) based on the bin indices
+    auto scratch = offsets_;
+    std::vector<Point> new_points(std::size(points));
+    std::vector<std::uint32_t> new_point_to_point_index_map(std::size(points));
+    std::uint32_t i{0};
+    for (auto const& pt : points) {
+        auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))};
+        auto const index{bin_y * num_bins_x + bin_x};
+        auto const new_pt_index{scratch[index]};
+        scratch[index] += 1;
+        new_points[new_pt_index] = pt;
+        new_point_to_point_index_map[new_pt_index] = i++;
+    }
 
-    for (auto i{0UL}; i < std::size(points); ++i) {
-        // skip point if it has already been processed
-        if (labels_[i] != undefined) {
-            continue;
-        }
+    std::vector<std::uint32_t> num_neighbors(std::size(new_points), 0);
 
-        // find number of neighbors of current point
-        if (points_kd_tree.radiusSearch(points[i].data(), eps_squared_, neighbors_, params) < min_samples_) {
-            labels_[i] = noise;
-            continue;
-        }
+    constexpr auto num_core_points_entries{3U};
+    std::vector<std::array<std::int32_t, num_core_points_entries>> core_points_ids;
+    core_points_ids.assign(new_points.size(), {-1, -1, -1});
 
-        // This point has at least min_samples_ in its eps neighborhood, so it's considered a core point. Time to
-        // start a new cluster.
+#pragma omp parallel for
+    for (auto i = 0UL; i < std::size(new_points); ++i) {
+        auto const pt{new_points[i]};
+        auto const bin_x{static_cast<std::int32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::int32_t>(std::floor((pt[1] - min[1]) / eps_))};
 
-        auto const current_cluster_id{cluster_count++};
-        labels_[i] = current_cluster_id;
+        std::vector<std::uint32_t> local_neighbors;
 
-        to_visit_.clear();
-        visited_.assign(std::size(points), false);
+        constexpr std::array<int, 9> dx = {-1, +0, +1, -1, +0, +1, -1, +0, +1};
+        constexpr std::array<int, 9> dy = {-1, -1, -1, +0, +0, +0, +1, +1, +1};
 
-        for (auto const& n : neighbors_) {
-            if (!visited_[n.first]) {
-                to_visit_.push_back(n.first);
+        for (auto ni{0}; ni < 9; ++ni) {
+            auto const nx{bin_x + dx[ni]};
+            auto const ny{bin_y + dy[ni]};
+            if (nx < 0 || ny < 0 || nx >= static_cast<std::int32_t>(num_bins_x) ||
+                ny >= static_cast<std::int32_t>(num_bins_y)) {
+                continue;
             }
-            visited_[n.first] = true;
-        }
-
-        for (auto j{0UL}; j < std::size(to_visit_); ++j) {
-            auto const neighbor{to_visit_[j]};
+            auto const neighbor_bin{ny * num_bins_x + nx};
 
-            if (labels_[neighbor] == noise) {
-                // This was considered as a seed before, but didn't have enough points in its eps neighborhood.
-                // Since it's in the current seed's neighborhood, we label it as belonging to this label, but it
-                // won't be used as a seed again.
-                labels_[neighbor] = current_cluster_id;
-                continue;
+            for (auto j{0U}; j < counts_[neighbor_bin]; ++j) {
+                auto const neighbor_pt_index{offsets_[neighbor_bin] + j};
+                if (neighbor_pt_index == i) {
+                    continue;
+                }
+                auto const& neighbor_pt{new_points[neighbor_pt_index]};
+                if ((square(neighbor_pt[0] - pt[0]) + square(neighbor_pt[1] - pt[1])) < eps_squared_) {
+                    local_neighbors.push_back(neighbor_pt_index);
+                }
             }
+        }
 
-            if (labels_[neighbor] != undefined) {
-                // Point belongs already to a cluster: skip it.
-                continue;
+        if (std::size(local_neighbors) > min_samples_) {
+            for (auto const n : local_neighbors) {
+                auto& cps{core_points_ids[n]};
+                for (auto cp_id{0U}; cp_id < num_core_points_entries; ++cp_id) {
+                    if (cps[cp_id] == -1) {
+                        cps[cp_id] = i;
+                        break;
+                    }
+                }
             }
+        }
+    }
 
-            // assign the current cluster's label to the neighbor
-            labels_[neighbor] = current_cluster_id;
+    for (auto i{0UL}; i < std::size(new_points); ++i) {
+        if (core_points_ids[i][0] >= 0) {
+            labels_[i] = static_cast<Label>(i);
+        } else {
+            labels_[i] = noise;
+        }
+    }
 
-            // and query its neighborhood to see if it also to be considered as a core point
-            if (points_kd_tree.radiusSearch(points[neighbor].data(), eps_squared_, neighbors_, params) < min_samples_) {
+    bool converged{false};
+    while (!converged) {
+        converged = true;
+        for (auto i{0UL}; i < std::size(new_points); ++i) {
+            if (labels_[i] == -1) {
                 continue;
            }
-            for (auto const& n : neighbors_) {
-                if (!visited_[n.first]) {
-                    to_visit_.push_back(n.first);
+            for (auto const current_core_idx : core_points_ids[i]) {
+                if (current_core_idx == -1) {
+                    continue;
+                }
+                if (labels_[i] < labels_[current_core_idx]) {
+                    labels_[current_core_idx] = labels_[i];
+                    converged = false;
+                } else if (labels_[i] > labels_[current_core_idx]) {
+                    labels_[i] = labels_[current_core_idx];
+                    converged = false;
                 }
-                visited_[n.first] = true;
             }
        }
    }
 
-    return labels_;
+    std::unordered_map<Label, Label> labels_map;
+    labels_map.reserve(labels_.size());
+
+    Label num_labels{0};
+    labels_map[noise] = noise;
+    for (auto const l : labels_) {
+        if (labels_map.find(l) == labels_map.end()) {
+            labels_map[l] = num_labels;
+            num_labels++;
+        }
+    }
+
+    std::vector<Label> labels(std::size(labels_));
+    for (auto i{0U}; i < std::size(labels_); ++i) {
+        labels[new_point_to_point_index_map[i]] = labels_map[labels_[i]];
+    }
+
+    return labels;
 }
 
 }  // namespace dbscan

cpp/dbscan.hpp

Lines changed: 9 additions & 0 deletions
@@ -6,6 +6,12 @@
 
 namespace dbscan {
 
+template <typename T>
+inline T square(T const v)
+{
+    return v * v;
+}
+
 class Dbscan
 {
 public:
@@ -20,6 +26,7 @@ class Dbscan
     [[nodiscard]] std::vector<Label> fit_predict(std::vector<Point> const& points);
 
 private:
+    float eps_;
     float eps_squared_;
     std::uint32_t min_samples_;
 
@@ -28,6 +35,8 @@
     std::vector<std::pair<std::uint32_t, float>> neighbors_;
     std::vector<bool> visited_;
    std::vector<std::uint32_t> to_visit_;
+    std::vector<std::uint32_t> counts_;
+    std::vector<std::uint32_t> offsets_{};
 };
 
 }  // namespace dbscan

python/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def benchmark():
            ("cpp", py_dbscan.DBSCAN(test.eps, test.min_samples)),
        ]:
            runtime = timeit.timeit(
-                "y_pred = algorithm.fit_predict(test.X)", number=10, globals=locals()
+                "y_pred = algorithm.fit_predict(test.X)", number=200, globals=locals()
            )
            print(f" {name:8}: {(runtime * 1000):>8.2f} ms")

python/dbscan_test.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 def test_moon():
     """Basic test using two moons sample."""
-    X, _ = datasets.make_moons(n_samples=1000)
+    X, _ = datasets.make_moons(n_samples=1000, random_state=42)
     dbscan = py_dbscan.DBSCAN(0.05, 10)
     y_pred = dbscan.fit_predict(X)

repositories.bzl

Lines changed: 0 additions & 11 deletions
@@ -12,17 +12,6 @@ def dbscan_dependencies():
        url = "https://github.com/bazelbuild/rules_python/releases/download/0.20.0/rules_python-0.20.0.tar.gz",
    )
 
-    maybe(
-        http_archive,
-        name = "nanoflann",
-        build_file = "@dbscan//third_party:nanoflann.BUILD.bazel",
-        sha256 = "cbcecf22bec528a8673a113ee9b0e134f91f1f96be57e913fa1f74e98e4449fa",
-        strip_prefix = "nanoflann-1.4.3",
-        urls = [
-            "https://github.com/jlblancoc/nanoflann/archive/refs/tags/v1.4.3.tar.gz",
-        ],
-    )
-
    maybe(
        http_archive,
        name = "nanobench",
