diff --git a/src/shogun/labels/BinaryLabelEncoder.h b/src/shogun/labels/BinaryLabelEncoder.h
new file mode 100644
index 00000000000..78cffa107d0
--- /dev/null
+++ b/src/shogun/labels/BinaryLabelEncoder.h
@@ -0,0 +1,143 @@
+/*
+ * This software is distributed under BSD 3-clause license (see LICENSE file).
+ *
+ * Authors: Yuhui Liu
+ *
+ */
+#ifndef _BINARYLABELENCODER__H__
+#define _BINARYLABELENCODER__H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+namespace shogun
+{
+    /** @brief Implements a reversible mapping from
+     * any form of labels to binary labels (+1, -1).
+     */
+    class BinaryLabelEncoder : public LabelEncoder
+    {
+    public:
+        BinaryLabelEncoder() = default;
+
+        ~BinaryLabelEncoder() = default;
+
+        /** Learn the two distinct label values present in labs.
+         *
+         * @param labs labels holding exactly two distinct values
+         * @return the distinct labels in ascending order
+         */
+        SGVector fit(const std::shared_ptr& labs) override
+        {
+            const auto result_vector = labs->as()->get_labels();
+            check_is_valid(result_vector);
+            if (!can_convert_float_to_int(result_vector))
+            {
+                std::set s(
+                    result_vector.begin(), result_vector.end());
+                // The largest element is *s.rbegin(); s.end() is the
+                // past-the-end iterator and must not be dereferenced.
+                io::warn(
+                    "({}, {}) have been converted to (-1, 1).", *s.begin(),
+                    *s.rbegin());
+            }
+            return fit_impl(result_vector);
+        }
+
+        /** Encode labels as -1 / +1.
+         *
+         * @param labs labels holding exactly two distinct values
+         * @return labels in which the smaller fitted value became -1 and
+         * the larger became +1
+         */
+        std::shared_ptr
+        transform(const std::shared_ptr& labs) override
+        {
+            const auto result_vector = labs->as()->get_labels();
+            check_is_valid(result_vector);
+            auto transformed_vec = transform_impl(result_vector);
+
+            // transform_impl yields the indices 0 and 1; move 0 to -1.
+            std::transform(
+                transformed_vec.begin(), transformed_vec.end(),
+                transformed_vec.begin(), [](float64_t e) {
+                    return Math::fequals(
+                               e, 0.0,
+                               std::numeric_limits::epsilon())
+                               ? -1.0
+                               : e;
+                });
+            return std::make_shared(transformed_vec);
+        }
+
+        /** Decode -1 / +1 labels back to the fitted values.
+         *
+         * @param labs labels to decode; ensure_valid() is called on them
+         * @return labels carrying the original values
+         */
+        std::shared_ptr
+        inverse_transform(const std::shared_ptr& labs) override
+        {
+            auto normalized_labels = labs->as();
+            normalized_labels->ensure_valid();
+            auto normalized_vector = normalized_labels->get_labels();
+            // Undo the -1 remapping so the values match the indices
+            // stored in inverse_mapping.
+            std::transform(
+                normalized_vector.begin(), normalized_vector.end(),
+                normalized_vector.begin(), [](float64_t e) {
+                    return Math::fequals(
+                               e, -1.0,
+                               std::numeric_limits::epsilon())
+                               ? 0.0
+                               : e;
+                });
+            auto origin_vec = inverse_transform_impl(normalized_vector);
+            SGVector result_vev(origin_vec.vlen);
+            std::transform(
+                origin_vec.begin(), origin_vec.end(), result_vev.begin(),
+                [](auto&& e) { return static_cast(e); });
+            return std::make_shared(result_vev);
+        }
+
+        /** Fit the encoder on labs and return the encoded labels.
+         *
+         * @param labs labels holding exactly two distinct values
+         * @return the same result as fit(labs) followed by
+         * transform(labs)
+         */
+        std::shared_ptr
+        fit_transform(const std::shared_ptr& labs) override
+        {
+            // Encode the full label vector: fit_impl returns only the
+            // distinct labels, so feeding its result to transform_impl
+            // would drop the per-sample labels and skip the -1 remap.
+            fit(labs);
+            return transform(labs);
+        }
+
+        const char* get_name() const override
+        {
+            return "BinaryLabelEncoder";
+        }
+
+    private:
+        /** Require that vec holds exactly two distinct values. */
+        void check_is_valid(const SGVector& vec)
+        {
+            const auto unique_set =
+                std::unordered_set(vec.begin(), vec.end());
+            require(
+                unique_set.size() == 2,
+                "Cannot interpret ({}) as binary labels, need exactly two "
+                "classes.",
+                fmt::join(unique_set, ", "));
+        }
+    };
+} // namespace shogun
+
+#endif
\ No newline at end of file
diff --git a/src/shogun/labels/LabelEncoder.h b/src/shogun/labels/LabelEncoder.h
new file mode 100644
index 00000000000..c5ae4b6a346
--- /dev/null
+++ b/src/shogun/labels/LabelEncoder.h
@@ -0,0 +1,216 @@
+/*
+ * This software is distributed under BSD 3-clause license (see LICENSE file).
+ *
+ * Authors: Yuhui Liu
+ *
+ */
+
+#ifndef _LABELENCODER__H__
+#define _LABELENCODER__H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+namespace shogun
+{
+    /** @brief Implements a reversible mapping from any
+     * form of labels to one of Shogun's target label spaces
+     * (binary, multi-class, etc).
+     */
+    class LabelEncoder : public SGObject
+    {
+    public:
+        LabelEncoder() = default;
+
+        virtual ~LabelEncoder() = default;
+
+        /** Fit label encoder
+         *
+         * @param labs Target values.
+         * @return SGVector which contains unique labels.
+         */
+        virtual SGVector
+        fit(const std::shared_ptr& labs) = 0;
+        /** Transform labels to normalized encoding.
+         *
+         * @param labs Target values to be transformed.
+         * @return Labels transformed to be normalized.
+         */
+        virtual std::shared_ptr
+        transform(const std::shared_ptr& labs) = 0;
+        /** Transform labels back to original encoding.
+         *
+         * @param labs normalized encoding labels
+         * @return original encoding labels
+         */
+        virtual std::shared_ptr
+        inverse_transform(const std::shared_ptr& labs) = 0;
+
+        /** Fit label encoder and return encoded labels.
+         *
+         * @param labs Target values.
+         * @return Labels transformed to be normalized.
+         */
+        virtual std::shared_ptr
+        fit_transform(const std::shared_ptr& labs) = 0;
+
+        virtual const char* get_name() const
+        {
+            return "LabelEncoder";
+        }
+
+    protected:
+        /** Report whether the fitted labels can be used as they are.
+         *
+         * The base implementation always answers false; a subclass may
+         * override it so that transform_impl passes labels through.
+         *
+         * @param vec labels handed to fit_impl
+         * @return true when no re-mapping is needed
+         */
+        virtual bool check_is_contiguous(const SGVector& vec)
+        {
+            return false;
+        }
+
+        /** Build the label <-> index lookup tables.
+         *
+         * Each label is assigned its position within unique_labels.
+         *
+         * @param origin_vector labels that were inserted into
+         * unique_labels
+         */
+        void create_mapping(const SGVector& origin_vector)
+        {
+            std::for_each(
+                origin_vector.begin(), origin_vector.end(),
+                [this](const auto& old_label) {
+                    auto new_label = std::distance(
+                        unique_labels.begin(), unique_labels.find(old_label));
+                    inverse_mapping[new_label] = old_label;
+                    mapping[old_label] = new_label;
+                });
+        }
+
+        /** Record the distinct labels and rebuild the lookup tables.
+         *
+         * @param origin_vector labels to learn from
+         * @return the contents of unique_labels
+         */
+        SGVector fit_impl(const SGVector& origin_vector)
+        {
+            // Start from a clean slate so that fitting again does not
+            // keep labels and mappings from an earlier fit.
+            unique_labels.clear();
+            mapping.clear();
+            inverse_mapping.clear();
+            std::copy(
+                origin_vector.begin(), origin_vector.end(),
+                std::inserter(unique_labels, unique_labels.begin()));
+            // Labels that need no re-mapping leave is_fitted unset;
+            // transform_impl then returns its input unchanged.
+            is_fitted = !check_is_contiguous(origin_vector);
+            create_mapping(origin_vector);
+            return SGVector(
+                unique_labels.begin(), unique_labels.end());
+        }
+
+        /** Map every label to its index.
+         *
+         * Fails through require() for a label that fit_impl did not see.
+         *
+         * @param result_vector labels to encode
+         * @return encoded labels, or result_vector itself when fit_impl
+         * found that no re-mapping is needed
+         */
+        SGVector
+        transform_impl(const SGVector& result_vector)
+        {
+            if (!is_fitted && unique_labels.size())
+                return result_vector;
+            require(is_fitted, "Transform expect to be called after fit.");
+            SGVector converted(result_vector.vlen);
+            std::transform(
+                result_vector.begin(), result_vector.end(), converted.begin(),
+                [& mapping = mapping](const auto& old_label) {
+                    // operator[] would silently insert unseen labels.
+                    const auto found = mapping.find(old_label);
+                    require(
+                        found != mapping.end(),
+                        "Label {} was not seen during fit.", old_label);
+                    return found->second;
+                });
+            return converted;
+        }
+
+        /** Map every index back to its original label.
+         *
+         * Fails through require() for an index that fit_impl did not
+         * assign.
+         *
+         * @param result_vector encoded labels
+         * @return original labels, or result_vector itself when fit_impl
+         * found that no re-mapping is needed
+         */
+        SGVector
+        inverse_transform_impl(const SGVector& result_vector)
+        {
+            if (!is_fitted && unique_labels.size())
+            {
+                return result_vector;
+            }
+            require(
+                is_fitted, "Inverse transform expect to be called after fit.");
+            SGVector original_vector(result_vector.vlen);
+            std::transform(
+                result_vector.begin(), result_vector.end(),
+                original_vector.begin(),
+                [& inverse_mapping = inverse_mapping](const auto& e) {
+                    // operator[] would silently insert unknown indices.
+                    const auto found = inverse_mapping.find(e);
+                    require(
+                        found != inverse_mapping.end(),
+                        "Encoded label {} was not produced by fit.", e);
+                    return found->second;
+                });
+            return original_vector;
+        }
+
+        /** Check whether every value in vec compares equal (within
+         * eps) to its converted copy.
+         *
+         * @param vec values to inspect
+         * @return true when the conversion changes no value
+         */
+        bool can_convert_float_to_int(const SGVector& vec) const
+        {
+            SGVector converted(vec.vlen);
+            std::transform(
+                vec.begin(), vec.end(), converted.begin(),
+                [](auto&& e) { return static_cast(e); });
+            return std::equal(
+                vec.begin(), vec.end(), converted.begin(),
+                [&](auto&& e1, auto&& e2) {
+                    return Math::fequals(e1, static_cast(e2), eps);
+                });
+        }
+
+        /** Distinct labels collected by fit_impl. */
+        std::set unique_labels;
+
+        /** Lookup from original label to encoded index. */
+        std::unordered_map mapping;
+        /** Lookup from encoded index to original label. */
+        std::unordered_map inverse_mapping;
+        /** Tolerance used for floating point comparisons. */
+        static constexpr float64_t eps =
+            std::numeric_limits::epsilon();
+        /** True once fit_impl built a mapping that must be applied. */
+        bool is_fitted = false;
+    };
+} // namespace shogun
+
+#endif
\ No newline at end of file
diff --git a/src/shogun/labels/MulticlassLabelsEncoder.h b/src/shogun/labels/MulticlassLabelsEncoder.h
new file mode 100644
index 00000000000..9948b625cd2
--- /dev/null
+++ b/src/shogun/labels/MulticlassLabelsEncoder.h
@@ -0,0 +1,125 @@
+/*
+ * This software is distributed under BSD 3-clause license (see LICENSE file).
+ *
+ * Authors: Yuhui Liu
+ *
+ */
+#ifndef _MulticlassLabelsEncoder__H__
+#define _MulticlassLabelsEncoder__H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace shogun
+{
+    /** @brief Implements a reversible mapping from
+     * any form of labels to multi-class labels.
+     */
+    class MulticlassLabelsEncoder : public LabelEncoder
+    {
+    public:
+        MulticlassLabelsEncoder() = default;
+
+        ~MulticlassLabelsEncoder() = default;
+
+        /** Learn the distinct label values present in labs.
+         *
+         * @param labs labels to learn from
+         * @return the distinct labels in ascending order
+         */
+        SGVector fit(const std::shared_ptr& labs) override
+        {
+            const auto result_vector = labs->as()->get_labels();
+            if (!can_convert_float_to_int(result_vector))
+            {
+                std::set s(
+                    result_vector.begin(), result_vector.end());
+                // The encoded range ends at (number of distinct labels - 1),
+                // not at (number of samples - 1).
+                io::warn(
+                    "({}) have been converted to (0...{})", fmt::join(s, ", "),
+                    s.size() - 1);
+            }
+            return fit_impl(result_vector);
+        }
+
+        /** Encode labels as indices 0..k-1.
+         *
+         * @param labs labels to encode
+         * @return encoded labels
+         */
+        std::shared_ptr
+        transform(const std::shared_ptr& labs) override
+        {
+            const auto result_vector = labs->as()->get_labels();
+            return std::make_shared(
+                transform_impl(result_vector));
+        }
+
+        /** Decode indices back to the fitted label values.
+         *
+         * @param labs encoded labels
+         * @return labels carrying the original values
+         */
+        std::shared_ptr
+        inverse_transform(const std::shared_ptr& labs) override
+        {
+            auto normalized_vector = labs->as()->get_labels();
+            return std::make_shared(
+                inverse_transform_impl(normalized_vector));
+        }
+
+        /** Fit the encoder on labs and return the encoded labels.
+         *
+         * @param labs labels to learn from and encode
+         * @return the same result as fit(labs) followed by
+         * transform(labs)
+         */
+        std::shared_ptr
+        fit_transform(const std::shared_ptr& labs) override
+        {
+            // Encode the full label vector: fit_impl returns only the
+            // distinct labels, so feeding its result to transform_impl
+            // would drop the per-sample labels.
+            fit(labs);
+            return transform(labs);
+        }
+
+        const char* get_name() const override
+        {
+            return "MulticlassLabelsEncoder";
+        }
+
+    protected:
+        /** Report whether the fitted labels are exactly 0, 1, ..., k-1.
+         *
+         * @param vec labels handed to fit_impl (not inspected; the check
+         * runs on unique_labels)
+         * @return true when no re-mapping is needed
+         */
+        bool check_is_contiguous(const SGVector& vec) override
+        {
+            if (unique_labels.empty())
+            {
+                return false;
+            }
+            // unique_labels is sorted, so the i-th element must equal i.
+            float64_t expected = 0.0;
+            for (const auto& label : unique_labels)
+            {
+                if (!Math::fequals(label, expected, eps))
+                {
+                    return false;
+                }
+                expected += 1.0;
+            }
+            return true;
+        }
+    };
+} // namespace shogun
+
+#endif
\ No newline at end of file
diff --git a/tests/unit/labels/LabelsEncoder_unittest.cc b/tests/unit/labels/LabelsEncoder_unittest.cc
new file mode 100644
index 00000000000..c2e50a81dd2
--- /dev/null
+++ b/tests/unit/labels/LabelsEncoder_unittest.cc
@@ -0,0 +1,198 @@
+/*
+ * This software is distributed under BSD 3-clause license (see LICENSE file).
+ *
+ * Authors: Yuhui Liu
+ */
+
+#include
+#include
+#include
+#include
+#include
+using namespace shogun;
+
+TEST(BinaryLabelEncoder, fit_transform)
+{
+    auto label_encoder = std::make_shared();
+    SGVector vec{-1, -1, 1, -1, 1};
+    auto origin_labels = std::make_shared(vec);
+    auto unique_vec = label_encoder->fit(origin_labels);
+    EXPECT_EQ(-1, unique_vec[0]);
+    EXPECT_EQ(1, unique_vec[1]);
+
+    auto result_labels = label_encoder->transform(origin_labels);
+    auto result_vec = result_labels->as()->get_labels();
+
+    SGVector expected_res{-1, -1, 1, -1, 1};
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_EQ(expected_res[i], result_vec[i]);
+    }
+
+    auto inv_result = label_encoder->inverse_transform(result_labels)
+                          ->as()
+                          ->get_labels();
+
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_EQ(vec[i], inv_result[i]);
+    }
+}
+
+TEST(BinaryLabelEncoder, labels_not_neg1_or_1)
+{
+    auto label_encoder = std::make_shared();
+    SGVector vec{-100, 200, -100, 200, -100};
+    auto origin_labels = std::make_shared(vec);
+    auto unique_vec = label_encoder->fit(origin_labels);
+    EXPECT_EQ(-100, unique_vec[0]);
+    EXPECT_EQ(200, unique_vec[1]);
+
+    auto result_labels = label_encoder->transform(origin_labels);
+    auto result_vec = result_labels->as()->get_labels();
+    SGVector expected_vec{-1, 1, -1, 1, -1};
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_EQ(expected_vec[i], result_vec[i]);
+    }
+
+    auto inv_result = label_encoder->inverse_transform(result_labels)
+                          ->as()
+                          ->get_labels();
+
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_EQ(vec[i], inv_result[i]);
+    }
+
+    SGVector test_vec{-1, -1, -1, -1, -1, 1};
+    auto test_labels = std::make_shared(test_vec);
+    auto inv_test = label_encoder->inverse_transform(test_labels)
+                        ->as()
+                        ->get_labels();
+    SGVector expected_inv_test{-100, -100, -100, -100, -100, 200};
+    for (int i = 0; i < 6; i++)
+    {
+        EXPECT_EQ(expected_inv_test[i], inv_test[i]);
+    }
+}
+
+TEST(BinaryLabelEncoder, more_than_two_labels)
+{
+    auto label_encoder = std::make_shared();
+    SGVector vec{-100, 200, -100, 200, -100, 42};
+    auto origin_labels = std::make_shared(vec);
+
+    EXPECT_THROW(label_encoder->fit(origin_labels), ShogunException);
+
+    EXPECT_THROW(label_encoder->transform(origin_labels), ShogunException);
+
+    SGVector vec2{-1, -1, 1, 0};
+    auto result_labels = std::make_shared(vec2);
+    EXPECT_THROW(
+        label_encoder->inverse_transform(result_labels), ShogunException);
+
+    SGVector vec3{0, 1, 1, 0};
+    auto result_labels2 = std::make_shared(vec3);
+    EXPECT_THROW(
+        label_encoder->inverse_transform(result_labels2), ShogunException);
+}
+
+TEST(MulticlassLabelsEncoder, fit_transform)
+{
+    auto eps = std::numeric_limits::epsilon();
+    auto label_encoder = std::make_shared();
+    SGVector vec{1.0, 2.0, 2.0, 6.0};
+    auto origin_labels = std::make_shared(vec);
+    auto unique_vec = label_encoder->fit(origin_labels);
+    EXPECT_NEAR(1, unique_vec[0], eps);
+    EXPECT_NEAR(2, unique_vec[1], eps);
+    EXPECT_NEAR(6, unique_vec[2], eps);
+
+    auto result_labels = label_encoder->transform(origin_labels);
+    auto result_vec = result_labels->as()->get_labels();
+    SGVector expected_res{0, 1, 1, 2};
+    for (int i = 0; i < 4; i++)
+    {
+        EXPECT_NEAR(expected_res[i], result_vec[i], eps);
+    }
+
+    auto inv_result = label_encoder->inverse_transform(result_labels)
+                          ->as()
+                          ->get_labels();
+
+    for (int i = 0; i < 4; i++)
+    {
+        EXPECT_NEAR(vec[i], inv_result[i], eps);
+    }
+}
+
+TEST(MulticlassLabelsEncoder, negative_labels)
+{
+    auto eps = std::numeric_limits::epsilon();
+    auto label_encoder = std::make_shared();
+    SGVector vec{-100.1, 200.4, -2.868, 6.98, -2.868};
+    auto origin_labels = std::make_shared(vec);
+    auto unique_vec = label_encoder->fit(origin_labels);
+    EXPECT_NEAR(-100.1, unique_vec[0], eps);
+    EXPECT_NEAR(-2.868, unique_vec[1], eps);
+    EXPECT_NEAR(6.98, unique_vec[2], eps);
+    EXPECT_NEAR(200.4, unique_vec[3], eps);
+
+    auto result_labels = label_encoder->transform(origin_labels);
+    auto result_vec = result_labels->as()->get_labels();
+    SGVector expected_res{0, 3, 1, 2, 1};
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_NEAR(expected_res[i], result_vec[i], eps);
+    }
+
+    auto inv_result = label_encoder->inverse_transform(result_labels)
+                          ->as()
+                          ->get_labels();
+
+    for (int i = 0; i < 5; i++)
+    {
+        EXPECT_NEAR(vec[i], inv_result[i], eps);
+    }
+
+    SGVector test_vec{0, 1, 2, 3, 1, 3};
+    auto test_labels = std::make_shared(test_vec);
+    auto inv_test = label_encoder->inverse_transform(test_labels)
+                        ->as()
+                        ->get_labels();
+    SGVector expected_inv{-100.1, -2.868, 6.98,
+                          200.4, -2.868, 200.4};
+    for (int i = 0; i < 6; i++)
+    {
+        EXPECT_NEAR(expected_inv[i], inv_test[i], eps);
+    }
+}
+
+TEST(MulticlassLabelsEncoder, contiguous_labels)
+{
+    auto eps = std::numeric_limits::epsilon();
+    auto label_encoder = std::make_shared();
+    SGVector vec{0, 1, 2, 3, 4, 5};
+    auto origin_labels = std::make_shared(vec);
+    auto unique_vec = label_encoder->fit(origin_labels);
+    for (int i = 0; i < 6; i++)
+    {
+        EXPECT_NEAR(vec[i], unique_vec[i], eps);
+    }
+
+    auto result_labels = label_encoder->transform(origin_labels);
+    auto result_vec = result_labels->as()->get_labels();
+    for (int i = 0; i < 6; i++)
+    {
+        EXPECT_NEAR(vec[i], result_vec[i], eps);
+    }
+
+    auto inv_labels = label_encoder->inverse_transform(result_labels);
+    auto inv_vec = inv_labels->as()->get_labels();
+
+    for (int i = 0; i < 6; i++)
+    {
+        EXPECT_NEAR(vec[i], inv_vec[i], eps);
+    }
+}
\ No newline at end of file