Commit 96e74da

aiegorov and ahans committed
Switch to grid-based neighbor search and parallelize
This changes the neighbor search to use a grid-based indexing scheme instead of the nanoflann-based kd-tree. We reorder the input points into eps-by-eps cells. This way we can directly access the points of such a cell and only have to query a cell and its 8-neighborhood to find all of a point's potential neighbors. For most datasets this is faster than first building up the kd-tree and then querying it. We further change the algorithm to require only a single pass over the data: for each point we find its eps-neighborhood and, if it has enough neighbors, record it as a potential core point for every point in that neighborhood. We remember up to three potential core points per point. In the next step, every point that has at least one potential core point is labeled with its own index (it is guaranteed to be part of a cluster). In the last step, we merge the clusters iteratively until convergence.

Co-authored-by: Alexander Hans <[email protected]>
1 parent 21d81d1 commit 96e74da
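To make the scheme easier to follow, here is a minimal, self-contained sketch of the binning idea described in the commit message: points are bucketed into eps-by-eps grid cells with a counting sort, and a point's eps-neighborhood is then gathered from the 3x3 block of cells around its own cell. This is not the committed implementation; the names (neighbors_of, cell_of, Point) and the single-query structure are illustrative only.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

using Point = std::array<float, 2>;

// Return the indices of all points within eps of pts[query] (the query point itself excluded).
std::vector<std::uint32_t> neighbors_of(std::size_t const query, std::vector<Point> const& pts, float const eps)
{
    if (pts.empty()) {
        return {};
    }

    // Bounding box of the input defines the grid extent.
    Point lo = pts[0];
    Point hi = pts[0];
    for (auto const& p : pts) {
        lo[0] = std::min(lo[0], p[0]);
        lo[1] = std::min(lo[1], p[1]);
        hi[0] = std::max(hi[0], p[0]);
        hi[1] = std::max(hi[1], p[1]);
    }
    auto const nx = static_cast<std::int32_t>(std::ceil((hi[0] - lo[0]) / eps)) + 1;
    auto const ny = static_cast<std::int32_t>(std::ceil((hi[1] - lo[1]) / eps)) + 1;
    auto const cell_of = [&](Point const& p) {
        return std::pair{static_cast<std::int32_t>((p[0] - lo[0]) / eps),
                         static_cast<std::int32_t>((p[1] - lo[1]) / eps)};
    };

    // Counting sort: per-cell counts, exclusive prefix sum, then scatter the point ids.
    std::vector<std::uint32_t> counts(static_cast<std::size_t>(nx) * ny, 0);
    for (auto const& p : pts) {
        auto const [cx, cy] = cell_of(p);
        ++counts[cy * nx + cx];
    }
    std::vector<std::uint32_t> offsets(counts.size());
    std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0U);
    std::vector<std::uint32_t> order(pts.size());
    auto cursor = offsets;
    for (std::uint32_t i = 0; i < pts.size(); ++i) {
        auto const [cx, cy] = cell_of(pts[i]);
        order[cursor[cy * nx + cx]++] = i;
    }

    // Only the 3x3 block of cells around the query's cell can contain eps-neighbors.
    auto const [qx, qy] = cell_of(pts[query]);
    std::vector<std::uint32_t> result;
    for (std::int32_t cy = qy - 1; cy <= qy + 1; ++cy) {
        for (std::int32_t cx = qx - 1; cx <= qx + 1; ++cx) {
            if (cx < 0 || cy < 0 || cx >= nx || cy >= ny) {
                continue;
            }
            auto const cell = static_cast<std::size_t>(cy) * nx + cx;
            for (std::uint32_t k = 0; k < counts[cell]; ++k) {
                auto const j = order[offsets[cell] + k];
                if (j == query) {
                    continue;
                }
                float const dx = pts[j][0] - pts[query][0];
                float const dy = pts[j][1] - pts[query][1];
                if (dx * dx + dy * dy < eps * eps) {
                    result.push_back(j);
                }
            }
        }
    }
    return result;
}

In the committed version the counting sort is performed once for all points, the 3x3 cell scan runs inside an OpenMP-parallel loop, and the resulting neighbor lists feed the per-point core-point bookkeeping and the label-merging passes described above.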

File tree

6 files changed: +149 -104 lines changed


cpp/BUILD.bazel

Lines changed: 2 additions & 3 deletions
@@ -10,10 +10,9 @@ cc_library(
     name = "dbscan",
     srcs = ["dbscan.cpp"],
     hdrs = ["dbscan.hpp"],
+    copts = ["-fopenmp"],
+    linkopts = ["-fopenmp"],
     visibility = ["//visibility:public"],
-    deps = [
-        "@nanoflann",
-    ],
 )
 
 cc_binary(

cpp/dbscan.cpp

Lines changed: 136 additions & 88 deletions
@@ -1,133 +1,181 @@
 #include "cpp/dbscan.hpp"
 
-#include <nanoflann.hpp>
+#include <cmath>
+#include <numeric>
+#include <climits>
+#include <unordered_map>
 
 namespace dbscan {
 
-namespace {
-
-class PointsVectorAdaptor
-{
-public:
-    explicit PointsVectorAdaptor(std::vector<Dbscan::Point> const& points)
-        : points_{points}
-    {
-    }
-
-    std::size_t kdtree_get_point_count() const
-    {
-        return std::size(points_);
-    }
-
-    float kdtree_get_pt(std::size_t idx, int dim) const
-    {
-        return points_[idx][dim];
-    }
-
-    template <typename Bbox>
-    bool kdtree_get_bbox(Bbox&) const
-    {
-        return false;
-    }
-
-private:
-    std::vector<Dbscan::Point> const& points_;
-};
-
-}  // namespace
-
 Dbscan::Dbscan(float const eps, std::uint32_t const min_samples, std::size_t const num_points_hint)
-    : eps_squared_{eps * eps}
+    : eps_{eps}
+    , eps_squared_{square(eps)}
     , min_samples_{min_samples}
 {
     if (num_points_hint > 0) {
         labels_.reserve(num_points_hint);
         neighbors_.reserve(num_points_hint);
         visited_.reserve(num_points_hint);
         to_visit_.reserve(num_points_hint);
+        counts_.reserve(num_points_hint);
+        offsets_.reserve(num_points_hint);
     }
 }
 
 auto Dbscan::fit_predict(std::vector<Dbscan::Point> const& points) -> std::vector<Dbscan::Label>
 {
-    PointsVectorAdaptor adapter{points};
+    labels_.assign(std::size(points), undefined);
+    visited_.assign(std::size(points), false);
 
-    constexpr auto num_dims{2};
-    constexpr auto leaf_size{32};
-    nanoflann::
-        KDTreeSingleIndexAdaptor<nanoflann::L2_Adaptor<float, PointsVectorAdaptor>, PointsVectorAdaptor, num_dims>
-            points_kd_tree{num_dims, adapter, nanoflann::KDTreeSingleIndexAdaptorParams{leaf_size}};
-    points_kd_tree.buildIndex();
+    if (std::size(points) <= 1) {
+        return labels_;
+    }
 
-    nanoflann::SearchParams params{};
-    params.sorted = false;
+    // calculate min_max of the current point cloud
+    Dbscan::Point min{points[0]};
+    Dbscan::Point max{points[0]};
+    for (auto const& pt : points) {
+        min[0] = std::min(min[0], pt[0]);
+        min[1] = std::min(min[1], pt[1]);
+        max[0] = std::max(max[0], pt[0]);
+        max[1] = std::max(max[1], pt[1]);
+    }
 
-    labels_.assign(std::size(points), undefined);
+    // derive num_bins out of it
+    float const range_x{max[0] - min[0]};
+    float const range_y{max[1] - min[1]};
+    auto const num_bins_x{static_cast<std::uint32_t>(std::ceil(range_x / eps_))};
+    auto const num_bins_y{static_cast<std::uint32_t>(std::ceil(range_y / eps_))};
+
+    // count number of points in every bin
+    counts_.assign(num_bins_x * num_bins_y, 0);
+
+    // FIRST PASS OVER THE POINTS
+    for (auto const& pt : points) {
+        auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))};
+        auto const index{bin_y * num_bins_x + bin_x};
+        counts_[index] += 1;
+    }
 
-    Label cluster_count{0};
-    std::vector<Label> clusters{};
+    // calculate the offsets for each cell (bin)
+    offsets_.clear();
+    std::exclusive_scan(std::cbegin(counts_), std::cend(counts_), std::back_inserter(offsets_), 0);
+
+    // re-sorting the points (calculating index mapping) based on the bin indices
+    auto scratch = offsets_;
+    std::vector<Point> new_points(std::size(points));
+    std::vector<std::uint32_t> new_point_to_point_index_map(std::size(points));
+    std::uint32_t i{0};
+    for (auto const& pt : points) {
+        auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))};
+        auto const index{bin_y * num_bins_x + bin_x};
+        auto const new_pt_index{scratch[index]};
+        scratch[index] += 1;
+        new_points[new_pt_index] = pt;
+        new_point_to_point_index_map[new_pt_index] = i++;
+    }
 
-    for (auto i{0UL}; i < std::size(points); ++i) {
-        // skip point if it has already been processed
-        if (labels_[i] != undefined) {
-            continue;
-        }
+    std::vector<std::uint32_t> num_neighbors(std::size(new_points), 0);
 
-        // find number of neighbors of current point
-        if (points_kd_tree.radiusSearch(points[i].data(), eps_squared_, neighbors_, params) < min_samples_) {
-            labels_[i] = noise;
-            continue;
-        }
+    constexpr auto num_core_points_entries{3U};
+    std::vector<std::array<std::int32_t, num_core_points_entries>> core_points_ids;
+    core_points_ids.assign(new_points.size(), {-1, -1, -1});
 
-        // This point has at least min_samples_ in its eps neighborhood, so it's considered a core point. Time to
-        // start a new cluster.
+#pragma omp parallel for
+    for (auto i = 0UL; i < std::size(new_points); ++i) {
+        auto const pt{new_points[i]};
+        auto const bin_x{static_cast<std::int32_t>(std::floor((pt[0] - min[0]) / eps_))};
+        auto const bin_y{static_cast<std::int32_t>(std::floor((pt[1] - min[1]) / eps_))};
 
-        auto const current_cluster_id{cluster_count++};
-        labels_[i] = current_cluster_id;
+        std::vector<std::uint32_t> local_neighbors;
 
-        to_visit_.clear();
-        visited_.assign(std::size(points), false);
+        constexpr std::array<int, 9> dx = {-1, +0, +1, -1, +0, +1, -1, +0, +1};
+        constexpr std::array<int, 9> dy = {-1, -1, -1, +0, +0, +0, +1, +1, +1};
 
-        for (auto const& n : neighbors_) {
-            if (!visited_[n.first]) {
-                to_visit_.push_back(n.first);
+        for (auto ni{0}; ni < 9; ++ni) {
+            auto const nx{bin_x + dx[ni]};
+            auto const ny{bin_y + dy[ni]};
+            if (nx < 0 || ny < 0 || nx >= static_cast<std::int32_t>(num_bins_x) ||
+                ny >= static_cast<std::int32_t>(num_bins_y)) {
+                continue;
             }
-            visited_[n.first] = true;
-        }
-
-        for (auto j{0UL}; j < std::size(to_visit_); ++j) {
-            auto const neighbor{to_visit_[j]};
+            auto const neighbor_bin{ny * num_bins_x + nx};
 
-            if (labels_[neighbor] == noise) {
-                // This was considered as a seed before, but didn't have enough points in its eps neighborhood.
-                // Since it's in the current seed's neighborhood, we label it as belonging to this label, but it
-                // won't be used as a seed again.
-                labels_[neighbor] = current_cluster_id;
-                continue;
+            for (auto j{0U}; j < counts_[neighbor_bin]; ++j) {
+                auto const neighbor_pt_index{offsets_[neighbor_bin] + j};
+                if (neighbor_pt_index == i) {
+                    continue;
+                }
+                auto const& neighbor_pt{new_points[neighbor_pt_index]};
+                if ((square(neighbor_pt[0] - pt[0]) + square(neighbor_pt[1] - pt[1])) < eps_squared_) {
+                    local_neighbors.push_back(neighbor_pt_index);
+                }
             }
+        }
 
-            if (labels_[neighbor] != undefined) {
-                // Point belongs already to a cluster: skip it.
-                continue;
+        if (std::size(local_neighbors) > min_samples_) {
+            for (auto const n : local_neighbors) {
+                auto& cps{core_points_ids[n]};
+                for (auto cp_id{0U}; cp_id < num_core_points_entries; ++cp_id) {
+                    if (cps[cp_id] == -1) {
+                        cps[cp_id] = i;
+                        break;
+                    }
+                }
             }
+        }
+    }
 
-            // assign the current cluster's label to the neighbor
-            labels_[neighbor] = current_cluster_id;
+    for (auto i{0UL}; i < std::size(new_points); ++i) {
+        if (core_points_ids[i][0] >= 0) {
+            labels_[i] = static_cast<Label>(i);
+        } else {
+            labels_[i] = noise;
+        }
+    }
 
-            // and query its neighborhood to see if it also to be considered as a core point
-            if (points_kd_tree.radiusSearch(points[neighbor].data(), eps_squared_, neighbors_, params) < min_samples_) {
+    bool converged{false};
+    while (!converged) {
+        converged = true;
+        for (auto i{0UL}; i < std::size(new_points); ++i) {
+            if (labels_[i] == -1) {
                 continue;
            }
-            for (auto const& n : neighbors_) {
-                if (!visited_[n.first]) {
-                    to_visit_.push_back(n.first);
+            for (auto const current_core_idx : core_points_ids[i]) {
+                if (current_core_idx == -1) {
+                    continue;
+                }
+                if (labels_[i] < labels_[current_core_idx]) {
+                    labels_[current_core_idx] = labels_[i];
+                    converged = false;
+                } else if (labels_[i] > labels_[current_core_idx]) {
+                    labels_[i] = labels_[current_core_idx];
+                    converged = false;
                 }
-                visited_[n.first] = true;
             }
        }
    }
 
-    return labels_;
+    std::unordered_map<Label, Label> labels_map;
+    labels_map.reserve(labels_.size());
+
+    Label num_labels{0};
+    labels_map[noise] = noise;
+    for (auto const l : labels_) {
+        if (labels_map.find(l) == labels_map.end()) {
+            labels_map[l] = num_labels;
+            num_labels++;
+        }
+    }
+
+    std::vector<Label> labels(std::size(labels_));
+    for (auto i{0U}; i < std::size(labels_); ++i) {
+        labels[new_point_to_point_index_map[i]] = labels_map[labels_[i]];
+    }
+
+    return labels;
 }
 
 }  // namespace dbscan

cpp/dbscan.hpp

Lines changed: 9 additions & 0 deletions
@@ -6,6 +6,12 @@
 
 namespace dbscan {
 
+template <typename T>
+inline T square(T const v)
+{
+    return v * v;
+}
+
 class Dbscan
 {
 public:
@@ -20,6 +26,7 @@ class Dbscan
     [[nodiscard]] std::vector<Label> fit_predict(std::vector<Point> const& points);
 
 private:
+    float eps_;
     float eps_squared_;
     std::uint32_t min_samples_;
 
@@ -28,6 +35,8 @@
     std::vector<std::pair<std::uint32_t, float>> neighbors_;
     std::vector<bool> visited_;
    std::vector<std::uint32_t> to_visit_;
+    std::vector<std::uint32_t> counts_;
+    std::vector<std::uint32_t> offsets_{};
 };
 
 }  // namespace dbscan

python/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def benchmark():
            ("cpp", py_dbscan.DBSCAN(test.eps, test.min_samples)),
        ]:
            runtime = timeit.timeit(
-                "y_pred = algorithm.fit_predict(test.X)", number=10, globals=locals()
+                "y_pred = algorithm.fit_predict(test.X)", number=200, globals=locals()
            )
            print(f" {name:8}: {(runtime * 1000):>8.2f} ms")

python/dbscan_test.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 def test_moon():
     """Basic test using two moons sample."""
-    X, _ = datasets.make_moons(n_samples=1000)
+    X, _ = datasets.make_moons(n_samples=1000, random_state=42)
     dbscan = py_dbscan.DBSCAN(0.05, 10)
     y_pred = dbscan.fit_predict(X)

repositories.bzl

Lines changed: 0 additions & 11 deletions
@@ -12,17 +12,6 @@ def dbscan_dependencies():
        url = "https://github.com/bazelbuild/rules_python/releases/download/0.20.0/rules_python-0.20.0.tar.gz",
    )
 
-    maybe(
-        http_archive,
-        name = "nanoflann",
-        build_file = "@dbscan//third_party:nanoflann.BUILD.bazel",
-        sha256 = "cbcecf22bec528a8673a113ee9b0e134f91f1f96be57e913fa1f74e98e4449fa",
-        strip_prefix = "nanoflann-1.4.3",
-        urls = [
-            "https://github.com/jlblancoc/nanoflann/archive/refs/tags/v1.4.3.tar.gz",
-        ],
-    )
-
    maybe(
        http_archive,
        name = "nanobench",
