pyg-team
diff --git a/‎benchmark/classes/hash_map.py
+12-5 b/‎benchmark/classes/hash_map.py
+12-5
diff --git a/‎pyg_lib/__init__.py
-1 b/‎pyg_lib/__init__.py
-1
diff --git a/‎pyg_lib/classes/__init__.py
-18 b/‎pyg_lib/classes/__init__.py
-18
diff --git a/‎pyg_lib/csrc/classes/cpu/hash_map.cpp
+151 b/‎pyg_lib/csrc/classes/cpu/hash_map.cpp
+151
diff --git a/‎pyg_lib/csrc/classes/cpu/hash_map_impl.h
-68 b/‎pyg_lib/csrc/classes/cpu/hash_map_impl.h
-68
diff --git a/‎pyg_lib/csrc/classes/cuda/hash_map_impl.cu ‎pyg_lib/csrc/classes/cuda/hash_map.cu b/‎pyg_lib/csrc/classes/cuda/hash_map_impl.cu ‎pyg_lib/csrc/classes/cuda/hash_map.cu
diff --git a/‎pyg_lib/csrc/classes/hash_map.cpp
-47 b/‎pyg_lib/csrc/classes/hash_map.cpp
-47
diff --git a/‎pyg_lib/csrc/classes/hash_map.h
-19 b/‎pyg_lib/csrc/classes/hash_map.h
-19
diff --git a/‎pyg_lib/csrc/classes/hash_map_impl.h
-14 b/‎pyg_lib/csrc/classes/hash_map_impl.h
-14
diff --git a/‎test/csrc/classes/test_hash_map.cpp
+3-2 b/‎test/csrc/classes/test_hash_map.cpp
+3-2
@@ -4,7 +4,7 @@
 import pandas as pd
 import torch
 
-from pyg_lib.classes import HashMap
+import pyg_lib  # noqa
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -30,6 +30,13 @@
     query2 = torch.randperm(args.num_queries, device=args.device)
     query2 = query2[:args.num_queries]
 
+    if key1.is_cpu:
+        HashMap = torch.classes.pyg.CPUHashMap
+    elif key1.is_cuda:
+        HashMap = torch.classes.pyg.CUDAHashMap
+    else:
+        raise NotImplementedError(f"Unsupported device '{args.device}'")
+
     t_init = t_get = 0
     for i in range(num_warmups + num_steps):
         torch.cuda.synchronize()
@@ -55,7 +62,7 @@
         t_start = time.perf_counter()
         hash_map = torch.full((args.num_keys, ), fill_value=-1,
                               dtype=torch.long, device=args.device)
-        hash_map[key2] = torch.arange(args.num_keys)
+        hash_map[key2] = torch.arange(args.num_keys, device=args.device)
         torch.cuda.synchronize()
         if i >= num_warmups:
             t_init += time.perf_counter() - t_start
@@ -85,7 +92,7 @@
             if i >= num_warmups:
                 t_get += time.perf_counter() - t_start
 
-    print(f' Pandas Init: {t_init / num_steps:.4f}s')
-    print(f' Pandas  Get: {t_get / num_steps:.4f}s')
+        print(f' Pandas Init: {t_init / num_steps:.4f}s')
+        print(f' Pandas  Get: {t_get / num_steps:.4f}s')
 
-    assert out1.equal(torch.tensor(out3))
+        assert out1.equal(torch.tensor(out3))
@@ -34,7 +34,6 @@ def load_library(lib_name: str) -> None:
 load_library('libpyg')
 
 import pyg_lib.ops  # noqa
-import pyg_lib.classes  # noqa
 import pyg_lib.partition  # noqa
 import pyg_lib.sampler  # noqa
 
 
@@ -0,0 +1,151 @@
+#include <ATen/ATen.h>
+#include <ATen/Parallel.h>
+#include <parallel_hashmap/phmap.h>
+#include <torch/library.h>
+
+namespace pyg {
+namespace classes {
+
+namespace {
+
+// Dispatch cases for the integral key dtypes handled in this file:
+// Short, Int and Long.
+#define DISPATCH_CASE_KEY(...)                         \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+
+// Switches on the runtime dtype `TYPE` and runs the given body for one of the
+// key dtypes listed above. `NAME` is forwarded to AT_DISPATCH_SWITCH
+// (presumably used in its error message for unsupported dtypes -- confirm
+// against the ATen dispatch documentation).
+#define DISPATCH_KEY(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_KEY(__VA_ARGS__))
+
+// Abstract interface implemented by the key-typed hash map below. It lets the
+// owning class hold a single pointer regardless of the concrete key type.
+struct HashMapImpl {
+  // Virtual so that deleting through a `HashMapImpl` pointer destroys the
+  // derived object correctly.
+  virtual ~HashMapImpl() = default;
+  // Looks up every element of `query` and returns the results as a tensor.
+  virtual at::Tensor get(const at::Tensor& query) = 0;
+  // Returns the keys stored in the map as a tensor.
+  virtual at::Tensor keys() = 0;
+};
+
+// Hash map from keys of type `KeyType` to the position (int64) at which each
+// key appeared in the tensor handed to the constructor.
+template <typename KeyType>
+struct CPUHashMapImpl : HashMapImpl {
+ public:
+  using ValueType = int64_t;
+
+  // Inserts `key[i] -> i` for every element of `key`.
+  //
+  // The data is read linearly through `data_ptr`, so `key` is expected to be
+  // contiguous with element type `KeyType` (the `CPUHashMap` wrapper checks
+  // contiguity before dispatching here).
+  // Fails a TORCH_CHECK if the same key occurs more than once.
+  CPUHashMapImpl(const at::Tensor& key) {
+    map_.reserve(key.numel());
+
+    const auto key_data = key.data_ptr<KeyType>();
+
+    // Chunk size: ceil(numel / num_threads), but never below
+    // `at::internal::GRAIN_SIZE`.
+    const auto num_threads = at::get_num_threads();
+    const auto grain_size =
+        std::max((key.numel() + num_threads - 1) / num_threads,
+                 at::internal::GRAIN_SIZE);
+
+    at::parallel_for(0, key.numel(), grain_size, [&](int64_t beg, int64_t end) {
+      for (int64_t i = beg; i < end; ++i) {
+        const auto [iterator, inserted] = map_.insert({key_data[i], i});
+        TORCH_CHECK(inserted, "Found duplicated key in 'HashMap'.");
+      }
+    });
+  }
+
+  // Returns a 1-D int64 tensor with `query.numel()` entries: the stored
+  // position of each queried key, or -1 when the key is not in the map.
+  // All other tensor options are taken over from `query`.
+  at::Tensor get(const at::Tensor& query) override {
+    const auto options =
+        query.options().dtype(c10::CppTypeToScalarType<ValueType>::value);
+    const auto out = at::empty({query.numel()}, options);
+    const auto query_data = query.data_ptr<KeyType>();
+    const auto out_data = out.data_ptr<ValueType>();
+
+    // Same chunking rule as in the constructor.
+    const auto num_threads = at::get_num_threads();
+    const auto grain_size =
+        std::max((query.numel() + num_threads - 1) / num_threads,
+                 at::internal::GRAIN_SIZE);
+
+    at::parallel_for(0, query.numel(), grain_size, [&](int64_t b, int64_t e) {
+      for (int64_t i = b; i < e; ++i) {
+        const auto it = map_.find(query_data[i]);
+        out_data[i] = (it != map_.end()) ? it->second : -1;
+      }
+    });
+
+    return out;
+  }
+
+  // Returns the stored keys as a 1-D tensor of dtype `KeyType`, with each key
+  // written back to the position it was inserted from.
+  at::Tensor keys() override {
+    const auto size = static_cast<int64_t>(map_.size());
+
+    // Derive the dtype from `KeyType` the same way `get()` derives its output
+    // dtype, rather than enumerating the supported key types by hand with a
+    // silent fallback to Long.
+    const auto options =
+        at::TensorOptions().dtype(c10::CppTypeToScalarType<KeyType>::value);
+    at::Tensor key = at::empty({size}, options);
+    const auto key_data = key.data_ptr<KeyType>();
+
+    // The constructor stored the distinct positions 0..size-1 as values, so
+    // every slot of `key` is written exactly once.
+    for (const auto& pair : map_) {  // No efficient multi-threading possible :(
+      key_data[pair.second] = pair.first;
+    }
+
+    return key;
+  }
+
+ private:
+  // NOTE(review): the trailing `12, std::mutex` template arguments presumably
+  // select 2^12 internally locked submaps, which is what the concurrent
+  // inserts in the constructor rely on -- confirm against the
+  // parallel-hashmap documentation.
+  phmap::parallel_flat_hash_map<
+      KeyType,
+      ValueType,
+      phmap::priv::hash_default_hash<KeyType>,
+      phmap::priv::hash_default_eq<KeyType>,
+      phmap::priv::Allocator<std::pair<const KeyType, ValueType>>,
+      12,
+      std::mutex>
+      map_;
+};
+
+// Custom-class wrapper around a key-typed `CPUHashMapImpl`. It validates its
+// tensor arguments and forwards the work to the implementation chosen from
+// the key dtype at construction time.
+struct CPUHashMap : torch::CustomClassHolder {
+ public:
+  // Builds the map from `key`, which must be a contiguous 1-D CPU tensor of
+  // dtype Short, Int or Long.
+  CPUHashMap(const at::Tensor& key) {
+    at::CheckedFrom origin{"CPUHashMap.init"};
+    at::TensorArg checked_key{key, "key", 0};
+
+    at::checkDeviceType(origin, key, at::DeviceType::CPU);
+    at::checkDim(origin, checked_key, 1);
+    at::checkContiguous(origin, checked_key);
+
+    // Instantiate the implementation matching the runtime dtype of `key`.
+    DISPATCH_KEY(key.scalar_type(), "cpu_hash_map_init", [&] {
+      map_ = std::make_unique<CPUHashMapImpl<scalar_t>>(key);
+    });
+  }
+
+  // Validates that `query` is a contiguous 1-D CPU tensor, then returns the
+  // lookup result produced by the underlying implementation.
+  at::Tensor get(const at::Tensor& query) {
+    at::CheckedFrom origin{"CPUHashMap.get"};
+    at::TensorArg checked_query{query, "query", 0};
+
+    at::checkDeviceType(origin, query, at::DeviceType::CPU);
+    at::checkDim(origin, checked_query, 1);
+    at::checkContiguous(origin, checked_query);
+
+    return map_->get(query);
+  }
+
+  // Returns the keys held by the underlying implementation.
+  at::Tensor keys() {
+    return map_->keys();
+  }
+
+ private:
+  // Type-erased implementation; set once in the constructor.
+  std::unique_ptr<HashMapImpl> map_;
+};
+
+}  // namespace
+
+// Registers `CPUHashMap` as a custom class named "CPUHashMap" in the `pyg`
+// library, exposing its constructor, `get`, `keys` and pickling support.
+TORCH_LIBRARY_FRAGMENT(pyg, m) {
+  m.class_<CPUHashMap>("CPUHashMap")
+      .def(torch::init<at::Tensor&>())
+      .def("get", &CPUHashMap::get)
+      .def("keys", &CPUHashMap::keys)
+      // The pickled state is just the key tensor; restoring rebuilds the map
+      // by constructing a new `CPUHashMap` from it.
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<CPUHashMap>& self) -> at::Tensor {
+            return self->keys();
+          },
+          // __setstate__
+          [](const at::Tensor& state) -> c10::intrusive_ptr<CPUHashMap> {
+            return c10::make_intrusive<CPUHashMap>(state);
+          });
+}
+
+}  // namespace classes
+}  // namespace pyg
@@ -1,13 +1,14 @@
 #include <ATen/ATen.h>
 #include <gtest/gtest.h>
 
-#include "pyg_lib/csrc/classes/hash_map.h"
+#include "pyg_lib/csrc/classes/cpu/hash_map.cpp"
 
 TEST(HashMapTest, BasicAssertions) {
   auto options = at::TensorOptions().dtype(at::kLong);
   auto key = at::tensor({0, 10, 30, 20}, options);
 
-  auto map = pyg::classes::HashMap(key);
+  auto map = pyg::classes::CPUHashMap(key);
+  EXPECT_TRUE(at::equal(map.keys(), key));
 
   auto query = at::tensor({30, 10, 20, 40}, options);
   auto expected = at::tensor({2, 1, 3, -1}, options);