|
1 | 1 | #include "cpp/dbscan.hpp" |
2 | 2 |
|
3 | | -#include <nanoflann.hpp> |
| 3 | +#include <cmath> |
| 4 | +#include <numeric> |
| 5 | +#include <climits> |
| 6 | +#include <unordered_map> |
4 | 7 |
|
5 | 8 | namespace dbscan { |
6 | 9 |
|
7 | | -namespace { |
8 | | - |
9 | | -class PointsVectorAdaptor |
10 | | -{ |
11 | | -public: |
12 | | - explicit PointsVectorAdaptor(std::vector<Dbscan::Point> const& points) |
13 | | - : points_{points} |
14 | | - { |
15 | | - } |
16 | | - |
17 | | - std::size_t kdtree_get_point_count() const |
18 | | - { |
19 | | - return std::size(points_); |
20 | | - } |
21 | | - |
22 | | - float kdtree_get_pt(std::size_t idx, int dim) const |
23 | | - { |
24 | | - return points_[idx][dim]; |
25 | | - } |
26 | | - |
27 | | - template <typename Bbox> |
28 | | - bool kdtree_get_bbox(Bbox&) const |
29 | | - { |
30 | | - return false; |
31 | | - } |
32 | | - |
33 | | -private: |
34 | | - std::vector<Dbscan::Point> const& points_; |
35 | | -}; |
36 | | - |
37 | | -} // namespace |
38 | | - |
39 | 10 | Dbscan::Dbscan(float const eps, std::uint32_t const min_samples, std::size_t const num_points_hint) |
40 | | - : eps_squared_{eps * eps} |
| 11 | + : eps_{eps} |
| 12 | + , eps_squared_{square(eps)} |
41 | 13 | , min_samples_{min_samples} |
42 | 14 | { |
43 | 15 | if (num_points_hint > 0) { |
44 | 16 | labels_.reserve(num_points_hint); |
45 | 17 | neighbors_.reserve(num_points_hint); |
46 | 18 | visited_.reserve(num_points_hint); |
47 | 19 | to_visit_.reserve(num_points_hint); |
| 20 | + counts_.reserve(num_points_hint); |
| 21 | + offsets_.reserve(num_points_hint); |
48 | 22 | } |
49 | 23 | } |
50 | 24 |
|
51 | 25 | auto Dbscan::fit_predict(std::vector<Dbscan::Point> const& points) -> std::vector<Dbscan::Label> |
52 | 26 | { |
53 | | - PointsVectorAdaptor adapter{points}; |
| 27 | + labels_.assign(std::size(points), undefined); |
| 28 | + visited_.assign(std::size(points), false); |
54 | 29 |
|
55 | | - constexpr auto num_dims{2}; |
56 | | - constexpr auto leaf_size{32}; |
57 | | - nanoflann:: |
58 | | - KDTreeSingleIndexAdaptor<nanoflann::L2_Adaptor<float, PointsVectorAdaptor>, PointsVectorAdaptor, num_dims> |
59 | | - points_kd_tree{num_dims, adapter, nanoflann::KDTreeSingleIndexAdaptorParams{leaf_size}}; |
60 | | - points_kd_tree.buildIndex(); |
| 30 | + if (std::size(points) <= 1) { |
| 31 | + return labels_; |
| 32 | + } |
61 | 33 |
|
62 | | - nanoflann::SearchParams params{}; |
63 | | - params.sorted = false; |
| 34 | + // calculate min_max of the current point cloud |
| 35 | + Dbscan::Point min{points[0]}; |
| 36 | + Dbscan::Point max{points[0]}; |
| 37 | + for (auto const& pt : points) { |
| 38 | + min[0] = std::min(min[0], pt[0]); |
| 39 | + min[1] = std::min(min[1], pt[1]); |
| 40 | + max[0] = std::max(max[0], pt[0]); |
| 41 | + max[1] = std::max(max[1], pt[1]); |
| 42 | + } |
64 | 43 |
|
65 | | - labels_.assign(std::size(points), undefined); |
| 44 | + // derive num_bins out of it |
| 45 | + float const range_x{max[0] - min[0]}; |
| 46 | + float const range_y{max[1] - min[1]}; |
| 47 | + auto const num_bins_x{static_cast<std::uint32_t>(std::ceil(range_x / eps_))}; |
| 48 | + auto const num_bins_y{static_cast<std::uint32_t>(std::ceil(range_y / eps_))}; |
| 49 | + |
| 50 | + // count number of points in every bin |
| 51 | + counts_.assign(num_bins_x * num_bins_y, 0); |
| 52 | + |
| 53 | + // FIRST PASS OVER THE POINTS |
| 54 | + for (auto const& pt : points) { |
| 55 | + auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))}; |
| 56 | + auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))}; |
| 57 | + auto const index{bin_y * num_bins_x + bin_x}; |
| 58 | + counts_[index] += 1; |
| 59 | + } |
66 | 60 |
|
67 | | - Label cluster_count{0}; |
68 | | - std::vector<Label> clusters{}; |
| 61 | + // calculate the offsets for each cell (bin) |
| 62 | + offsets_.clear(); |
| 63 | + std::exclusive_scan(std::cbegin(counts_), std::cend(counts_), std::back_inserter(offsets_), 0); |
| 64 | + |
| 65 | + // re-sorting the points (calculating index mapping) based on the bin indices |
| 66 | + auto scratch = offsets_; |
| 67 | + std::vector<Point> new_points(std::size(points)); |
| 68 | + std::vector<std::uint32_t> new_point_to_point_index_map(std::size(points)); |
| 69 | + std::uint32_t i{0}; |
| 70 | + for (auto const& pt : points) { |
| 71 | + auto const bin_x{static_cast<std::uint32_t>(std::floor((pt[0] - min[0]) / eps_))}; |
| 72 | + auto const bin_y{static_cast<std::uint32_t>(std::floor((pt[1] - min[1]) / eps_))}; |
| 73 | + auto const index{bin_y * num_bins_x + bin_x}; |
| 74 | + auto const new_pt_index{scratch[index]}; |
| 75 | + scratch[index] += 1; |
| 76 | + new_points[new_pt_index] = pt; |
| 77 | + new_point_to_point_index_map[new_pt_index] = i++; |
| 78 | + } |
69 | 79 |
|
70 | | - for (auto i{0UL}; i < std::size(points); ++i) { |
71 | | - // skip point if it has already been processed |
72 | | - if (labels_[i] != undefined) { |
73 | | - continue; |
74 | | - } |
| 80 | + std::vector<std::uint32_t> num_neighbors(std::size(new_points), 0); |
75 | 81 |
|
76 | | - // find number of neighbors of current point |
77 | | - if (points_kd_tree.radiusSearch(points[i].data(), eps_squared_, neighbors_, params) < min_samples_) { |
78 | | - labels_[i] = noise; |
79 | | - continue; |
80 | | - } |
| 82 | + constexpr auto num_core_points_entries{3U}; |
| 83 | + std::vector<std::array<std::int32_t, num_core_points_entries>> core_points_ids; |
| 84 | + core_points_ids.assign(new_points.size(), {-1, -1, -1}); |
81 | 85 |
|
82 | | - // This point has at least min_samples_ in its eps neighborhood, so it's considered a core point. Time to |
83 | | - // start a new cluster. |
| 86 | +#pragma omp parallel for |
| 87 | + for (auto i = 0UL; i < std::size(new_points); ++i) { |
| 88 | + auto const pt{new_points[i]}; |
| 89 | + auto const bin_x{static_cast<std::int32_t>(std::floor((pt[0] - min[0]) / eps_))}; |
| 90 | + auto const bin_y{static_cast<std::int32_t>(std::floor((pt[1] - min[1]) / eps_))}; |
84 | 91 |
|
85 | | - auto const current_cluster_id{cluster_count++}; |
86 | | - labels_[i] = current_cluster_id; |
| 92 | + std::vector<std::uint32_t> local_neighbors; |
87 | 93 |
|
88 | | - to_visit_.clear(); |
89 | | - visited_.assign(std::size(points), false); |
| 94 | + constexpr std::array<int, 9> dx = {-1, +0, +1, -1, +0, +1, -1, +0, +1}; |
| 95 | + constexpr std::array<int, 9> dy = {-1, -1, -1, +0, +0, +0, +1, +1, +1}; |
90 | 96 |
|
91 | | - for (auto const& n : neighbors_) { |
92 | | - if (!visited_[n.first]) { |
93 | | - to_visit_.push_back(n.first); |
| 97 | + for (auto ni{0}; ni < 9; ++ni) { |
| 98 | + auto const nx{bin_x + dx[ni]}; |
| 99 | + auto const ny{bin_y + dy[ni]}; |
| 100 | + if (nx < 0 || ny < 0 || nx >= static_cast<std::int32_t>(num_bins_x) || |
| 101 | + ny >= static_cast<std::int32_t>(num_bins_y)) { |
| 102 | + continue; |
94 | 103 | } |
95 | | - visited_[n.first] = true; |
96 | | - } |
97 | | - |
98 | | - for (auto j{0UL}; j < std::size(to_visit_); ++j) { |
99 | | - auto const neighbor{to_visit_[j]}; |
| 104 | + auto const neighbor_bin{ny * num_bins_x + nx}; |
100 | 105 |
|
101 | | - if (labels_[neighbor] == noise) { |
102 | | - // This was considered as a seed before, but didn't have enough points in its eps neighborhood. |
103 | | - // Since it's in the current seed's neighborhood, we label it as belonging to this label, but it |
104 | | - // won't be used as a seed again. |
105 | | - labels_[neighbor] = current_cluster_id; |
106 | | - continue; |
| 106 | + for (auto j{0U}; j < counts_[neighbor_bin]; ++j) { |
| 107 | + auto const neighbor_pt_index{offsets_[neighbor_bin] + j}; |
| 108 | + if (neighbor_pt_index == i) { |
| 109 | + continue; |
| 110 | + } |
| 111 | + auto const& neighbor_pt{new_points[neighbor_pt_index]}; |
| 112 | + if ((square(neighbor_pt[0] - pt[0]) + square(neighbor_pt[1] - pt[1])) < eps_squared_) { |
| 113 | + local_neighbors.push_back(neighbor_pt_index); |
| 114 | + } |
107 | 115 | } |
| 116 | + } |
108 | 117 |
|
109 | | - if (labels_[neighbor] != undefined) { |
110 | | - // Point belongs already to a cluster: skip it. |
111 | | - continue; |
| 118 | + if (std::size(local_neighbors) > min_samples_) { |
| 119 | + for (auto const n : local_neighbors) { |
| 120 | + auto& cps{core_points_ids[n]}; |
| 121 | + for (auto cp_id{0U}; cp_id < num_core_points_entries; ++cp_id) { |
| 122 | + if (cps[cp_id] == -1) { |
| 123 | + cps[cp_id] = i; |
| 124 | + break; |
| 125 | + } |
| 126 | + } |
112 | 127 | } |
| 128 | + } |
| 129 | + } |
113 | 130 |
|
114 | | - // assign the current cluster's label to the neighbor |
115 | | - labels_[neighbor] = current_cluster_id; |
| 131 | + for (auto i{0UL}; i < std::size(new_points); ++i) { |
| 132 | + if (core_points_ids[i][0] >= 0) { |
| 133 | + labels_[i] = static_cast<Label>(i); |
| 134 | + } else { |
| 135 | + labels_[i] = noise; |
| 136 | + } |
| 137 | + } |
116 | 138 |
|
117 | | - // and query its neighborhood to see if it also to be considered as a core point |
118 | | - if (points_kd_tree.radiusSearch(points[neighbor].data(), eps_squared_, neighbors_, params) < min_samples_) { |
| 139 | + bool converged{false}; |
| 140 | + while (!converged) { |
| 141 | + converged = true; |
| 142 | + for (auto i{0UL}; i < std::size(new_points); ++i) { |
| 143 | + if (labels_[i] == -1) { |
119 | 144 | continue; |
120 | 145 | } |
121 | | - for (auto const& n : neighbors_) { |
122 | | - if (!visited_[n.first]) { |
123 | | - to_visit_.push_back(n.first); |
| 146 | + for (auto const current_core_idx : core_points_ids[i]) { |
| 147 | + if (current_core_idx == -1) { |
| 148 | + continue; |
| 149 | + } |
| 150 | + if (labels_[i] < labels_[current_core_idx]) { |
| 151 | + labels_[current_core_idx] = labels_[i]; |
| 152 | + converged = false; |
| 153 | + } else if (labels_[i] > labels_[current_core_idx]) { |
| 154 | + labels_[i] = labels_[current_core_idx]; |
| 155 | + converged = false; |
124 | 156 | } |
125 | | - visited_[n.first] = true; |
126 | 157 | } |
127 | 158 | } |
128 | 159 | } |
129 | 160 |
|
130 | | - return labels_; |
| 161 | + std::unordered_map<Label, Label> labels_map; |
| 162 | + labels_map.reserve(labels_.size()); |
| 163 | + |
| 164 | + Label num_labels{0}; |
| 165 | + labels_map[noise] = noise; |
| 166 | + for (auto const l : labels_) { |
| 167 | + if (labels_map.find(l) == labels_map.end()) { |
| 168 | + labels_map[l] = num_labels; |
| 169 | + num_labels++; |
| 170 | + } |
| 171 | + } |
| 172 | + |
| 173 | + std::vector<Label> labels(std::size(labels_)); |
| 174 | + for (auto i{0U}; i < std::size(labels_); ++i) { |
| 175 | + labels[new_point_to_point_index_map[i]] = labels_map[labels_[i]]; |
| 176 | + } |
| 177 | + |
| 178 | + return labels; |
131 | 179 | } |
132 | 180 |
|
133 | 181 | } // namespace dbscan |
0 commit comments