From 4fd918a4e66d6175fa1c4791940bcd11aaa61b2b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 18 Dec 2024 12:31:31 +0100 Subject: [PATCH] GH-45190: [C++][Compute] Add rank_percentile function --- cpp/src/arrow/compute/api_vector.cc | 13 + cpp/src/arrow/compute/api_vector.h | 24 ++ cpp/src/arrow/compute/kernels/vector_rank.cc | 307 ++++++++++++------ .../compute/kernels/vector_sort_internal.h | 7 + .../arrow/compute/kernels/vector_sort_test.cc | 158 ++++++++- docs/source/cpp/compute.rst | 38 ++- 6 files changed, 428 insertions(+), 119 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 22ecf1cc87844..54e04298b6aa6 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -48,6 +48,7 @@ using compute::DictionaryEncodeOptions; using compute::FilterOptions; using compute::NullPlacement; using compute::RankOptions; +using compute::RankPercentileOptions; template <> struct EnumTraits @@ -151,6 +152,10 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); +static auto kRankPercentileOptionsType = GetFunctionOptionsType( + DataMember("sort_keys", &RankPercentileOptions::sort_keys), + DataMember("null_placement", &RankPercentileOptions::null_placement), + DataMember("factor", &RankPercentileOptions::factor)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); static auto kListFlattenOptionsType = GetFunctionOptionsType( @@ -228,6 +233,14 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; +RankPercentileOptions::RankPercentileOptions(std::vector sort_keys, + NullPlacement null_placement, double factor) + : FunctionOptions(internal::kRankPercentileOptionsType), + sort_keys(std::move(sort_keys)), + null_placement(null_placement), + factor(factor) {} +constexpr char RankPercentileOptions::kTypeName[]; + PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index ada1665b3ec7c..5fd8241e9bd81 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -195,6 +195,30 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker; }; +/// \brief Percentile rank options +class ARROW_EXPORT RankPercentileOptions : public FunctionOptions { + public: + explicit RankPercentileOptions(std::vector sort_keys = {}, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0); + /// Convenience constructor for array inputs + explicit RankPercentileOptions(SortOrder order, + NullPlacement null_placement = NullPlacement::AtEnd, + double factor = 1.0) + : RankPercentileOptions({SortKey("", order)}, null_placement, factor) {} + + static constexpr char const kTypeName[] = "RankPercentileOptions"; + static RankPercentileOptions Defaults() { return RankPercentileOptions(); } + + /// Column key(s) to order by and how to order by these sort keys. + std::vector sort_keys; + /// Whether nulls and NaNs are placed at the start or at the end + NullPlacement null_placement; + /// Factor to apply to the output. + /// Use 1.0 for results in (0, 1), 100.0 for percentages, etc. + double factor; +}; + /// \brief Partitioning options for NthToIndices class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { public: diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 4fdc83788c6f4..50af9c6d599e2 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,9 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include +#include + #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/util/logging.h" namespace arrow::compute::internal { @@ -32,10 +36,6 @@ namespace { // is the same as the value at the previous sort index. constexpr uint64_t kDuplicateMask = 1ULL << 63; -constexpr bool NeedsDuplicates(RankOptions::Tiebreaker tiebreaker) { - return tiebreaker != RankOptions::First; -} - template void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_selector) { using T = decltype(value_selector(int64_t{})); @@ -63,81 +63,145 @@ void MarkDuplicates(const NullPartitionResult& sorted, ValueSelector&& value_sel } } -Result CreateRankings(ExecContext* ctx, const NullPartitionResult& sorted, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker) { - auto length = sorted.overall_end() - sorted.overall_begin(); - ARROW_ASSIGN_OR_RAISE(auto rankings, - MakeMutableUInt64Array(length, ctx->memory_pool())); - auto out_begin = rankings->GetMutableValues(1); - uint64_t rank; - - auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; - auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; - - switch (tiebreaker) { - case RankOptions::Dense: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - ++rank; - } - out_begin[original_index(*it)] = rank; +struct RankingsEmitter { + virtual ~RankingsEmitter() = default; + virtual bool NeedsDuplicates() = 0; + virtual Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) = 0; +}; + +// A helper class that emits rankings for the "rank_percentile" function +struct PercentileRankingsEmitter : public RankingsEmitter { + explicit PercentileRankingsEmitter(double factor) : factor_(factor) {} + + bool NeedsDuplicates() override { return true; } + + Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) override { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableFloat64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + // The count of values strictly less than the value being considered + int64_t cum_freq = 0; + auto it = sorted.overall_begin(); + + while (it < sorted.overall_end()) { + // Look for a run of duplicate values + DCHECK(!is_duplicate(*it)); + auto run_end = it; + while (++run_end < sorted.overall_end() && is_duplicate(*run_end)) { + } + // The run length, i.e. the frequency of the current value + int64_t freq = run_end - it; + double percentile = (cum_freq + 0.5 * freq) * factor_ / static_cast(length); + // Output percentile rank values + for (; it < run_end; ++it) { + out_begin[original_index(*it)] = percentile; } - break; + cum_freq += freq; } + DCHECK_EQ(cum_freq, length); + return Datum(rankings); + } + + private: + const double factor_; +}; + +// A helper class that emits rankings for the "rank" function +struct OrdinalRankingsEmitter : public RankingsEmitter { + explicit OrdinalRankingsEmitter(RankOptions::Tiebreaker tiebreaker) + : tiebreaker_(tiebreaker) {} + + bool NeedsDuplicates() override { return tiebreaker_ != RankOptions::First; } - case RankOptions::First: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { - // No duplicate marks expected for RankOptions::First - DCHECK(!is_duplicate(*it)); - out_begin[*it] = ++rank; + Result CreateRankings(ExecContext* ctx, + const NullPartitionResult& sorted) override { + const int64_t length = sorted.overall_end() - sorted.overall_begin(); + ARROW_ASSIGN_OR_RAISE(auto rankings, + MakeMutableUInt64Array(length, ctx->memory_pool())); + auto out_begin = rankings->GetMutableValues(1); + uint64_t rank; + + auto is_duplicate = [](uint64_t index) { return (index & kDuplicateMask) != 0; }; + auto original_index = [](uint64_t index) { return index & ~kDuplicateMask; }; + + switch (tiebreaker_) { + case RankOptions::Dense: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + ++rank; + } + out_begin[original_index(*it)] = rank; + } + break; } - break; - } - case RankOptions::Min: { - rank = 0; - for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { - if (!is_duplicate(*it)) { - rank = (it - sorted.overall_begin()) + 1; + case RankOptions::First: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) { + // No duplicate marks expected for RankOptions::First + DCHECK(!is_duplicate(*it)); + out_begin[*it] = ++rank; } - out_begin[original_index(*it)] = rank; + break; } - break; - } - case RankOptions::Max: { - rank = length; - for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { - out_begin[original_index(*it)] = rank; - // If the current index isn't marked as duplicate, then it's the last - // tie in a row (since we iterate in reverse order), so update rank - // for the next row of ties. - if (!is_duplicate(*it)) { - rank = it - sorted.overall_begin(); + case RankOptions::Min: { + rank = 0; + for (auto it = sorted.overall_begin(); it < sorted.overall_end(); ++it) { + if (!is_duplicate(*it)) { + rank = (it - sorted.overall_begin()) + 1; + } + out_begin[original_index(*it)] = rank; } + break; + } + + case RankOptions::Max: { + rank = length; + for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); --it) { + out_begin[original_index(*it)] = rank; + // If the current index isn't marked as duplicate, then it's the last + // tie in a row (since we iterate in reverse order), so update rank + // for the next row of ties. + if (!is_duplicate(*it)) { + rank = it - sorted.overall_begin(); + } + } + break; } - break; } + + return Datum(rankings); } - return Datum(rankings); -} + private: + const RankOptions::Tiebreaker tiebreaker_; +}; const RankOptions* GetDefaultRankOptions() { static const auto kDefaultRankOptions = RankOptions::Defaults(); return &kDefaultRankOptions; } +const RankPercentileOptions* GetDefaultPercentileRankOptions() { + static const auto kDefaultPercentileRankOptions = RankPercentileOptions::Defaults(); + return &kDefaultPercentileRankOptions; +} + template class RankerMixin : public TypeVisitor { public: RankerMixin(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end, const InputType& input, const SortOrder order, - const NullPlacement null_placement, - const RankOptions::Tiebreaker tiebreaker, Datum* output) + const NullPlacement null_placement, RankingsEmitter* emitter) : TypeVisitor(), ctx_(ctx), indices_begin_(indices_begin), @@ -145,15 +209,17 @@ class RankerMixin : public TypeVisitor { input_(input), order_(order), null_placement_(null_placement), - tiebreaker_(tiebreaker), physical_type_(GetPhysicalType(input.type())), - output_(output) {} + emitter_(emitter) {} - Status Run() { return physical_type_->Accept(this); } + Result Run() { + RETURN_NOT_OK(physical_type_->Accept(this)); + return emitter_->CreateRankings(ctx_, sorted_); + } -#define VISIT(TYPE) \ - Status Visit(const TYPE& type) { \ - return static_cast(this)->template RankInternal(); \ +#define VISIT(TYPE) \ + Status Visit(const TYPE& type) { \ + return static_cast(this)->template SortAndMarkDuplicates(); \ } VISIT_SORTABLE_PHYSICAL_TYPES(VISIT) @@ -167,9 +233,9 @@ class RankerMixin : public TypeVisitor { const InputType& input_; const SortOrder order_; const NullPlacement null_placement_; - const RankOptions::Tiebreaker tiebreaker_; const std::shared_ptr physical_type_; - Datum* output_; + RankingsEmitter* emitter_; + NullPartitionResult sorted_{}; }; template @@ -181,26 +247,23 @@ class Ranker : public RankerMixin> { using RankerMixin::RankerMixin; template - Status RankInternal() { + Status SortAndMarkDuplicates() { using GetView = GetViewType; using ArrayType = typename TypeTraits::ArrayType; ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_)); ArrayType array(input_.data()); - ARROW_ASSIGN_OR_RAISE(NullPartitionResult sorted, + ARROW_ASSIGN_OR_RAISE(sorted_, array_sorter(indices_begin_, indices_end_, array, 0, ArraySortOptions(order_, null_placement_), ctx_)); - if (NeedsDuplicates(tiebreaker_)) { + if (emitter_->NeedsDuplicates()) { auto value_selector = [&array](int64_t index) { return GetView::LogicalValue(array.GetView(index)); }; - MarkDuplicates(sorted, value_selector); + MarkDuplicates(sorted_, value_selector); } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); - return Status::OK(); } }; @@ -214,26 +277,21 @@ class Ranker : public RankerMixin - Status RankInternal() { + Status SortAndMarkDuplicates() { if (physical_chunks_.empty()) { return Status::OK(); } - ARROW_ASSIGN_OR_RAISE( - NullPartitionResult sorted, - SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, - physical_chunks_, order_, null_placement_)); - - if (NeedsDuplicates(tiebreaker_)) { + sorted_, SortChunkedArray(ctx_, indices_begin_, indices_end_, physical_type_, + physical_chunks_, order_, null_placement_)); + if (emitter_->NeedsDuplicates()) { const auto arrays = GetArrayPointers(physical_chunks_); auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) { return resolver.Resolve(index).Value(); }; - MarkDuplicates(sorted, value_selector); + MarkDuplicates(sorted_, value_selector); } - ARROW_ASSIGN_OR_RAISE(*output_, - CreateRankings(ctx_, sorted, null_placement_, tiebreaker_)); return Status::OK(); } @@ -242,7 +300,7 @@ class Ranker : public RankerMixin ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { - const auto& rank_options = checked_cast(*options); switch (args[0].kind()) { case Datum::ARRAY: { - return Rank(*args[0].make_array(), rank_options, ctx); + return Rank(*args[0].make_array(), *options, ctx); } break; case Datum::CHUNKED_ARRAY: { - return Rank(*args[0].chunked_array(), rank_options, ctx); + return Rank(*args[0].chunked_array(), *options, ctx); } break; default: break; @@ -278,14 +347,19 @@ class RankMetaFunction : public MetaFunction { args[0].ToString()); } - private: + protected: + struct UnpackedOptions { + SortOrder order{SortOrder::Ascending}; + NullPlacement null_placement; + std::unique_ptr emitter; + }; + + virtual UnpackedOptions UnpackOptions(const FunctionOptions&) const = 0; + template - static Result Rank(const T& input, const RankOptions& options, - ExecContext* ctx) { - SortOrder order = SortOrder::Ascending; - if (!options.sort_keys.empty()) { - order = options.sort_keys[0].order; - } + Result Rank(const T& input, const FunctionOptions& function_options, + ExecContext* ctx) const { + auto options = UnpackOptions(function_options); int64_t length = input.length(); ARROW_ASSIGN_OR_RAISE(auto indices, @@ -294,11 +368,45 @@ class RankMetaFunction : public MetaFunction { auto* indices_end = indices_begin + length; std::iota(indices_begin, indices_end, 0); - Datum output; - Ranker ranker(ctx, indices_begin, indices_end, input, order, - options.null_placement, options.tiebreaker, &output); - ARROW_RETURN_NOT_OK(ranker.Run()); - return output; + Ranker ranker(ctx, indices_begin, indices_end, input, options.order, + options.null_placement, options.emitter.get()); + return ranker.Run(); + } +}; + +class RankMetaFunction : public RankMetaFunctionBase { + public: + RankMetaFunction() + : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {} + + protected: + UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { + const auto& options = checked_cast(function_options); + UnpackedOptions unpacked{ + SortOrder::Ascending, options.null_placement, + std::make_unique(options.tiebreaker)}; + if (!options.sort_keys.empty()) { + unpacked.order = options.sort_keys[0].order; + } + return unpacked; + } +}; + +class RankPercentileMetaFunction : public RankMetaFunctionBase { + public: + RankPercentileMetaFunction() + : RankMetaFunctionBase("rank_percentile", Arity::Unary(), rank_percentile_doc, + GetDefaultPercentileRankOptions()) {} + + protected: + UnpackedOptions UnpackOptions(const FunctionOptions& function_options) const override { + const auto& options = checked_cast(function_options); + UnpackedOptions unpacked{SortOrder::Ascending, options.null_placement, + std::make_unique(options.factor)}; + if (!options.sort_keys.empty()) { + unpacked.order = options.sort_keys[0].order; + } + return unpacked; } }; @@ -306,6 +414,7 @@ class RankMetaFunction : public MetaFunction { void RegisterVectorRank(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunction(std::make_shared())); } } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index cc6b7834a3021..6288aa26eaa9a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -806,4 +806,11 @@ inline Result> MakeMutableUInt64Array( return ArrayData::Make(uint64(), length, {nullptr, std::move(data)}, /*null_count=*/0); } +inline Result> MakeMutableFloat64Array( + int64_t length, MemoryPool* memory_pool) { + auto buffer_size = length * sizeof(double); + ARROW_ASSIGN_OR_RAISE(auto data, AllocateBuffer(buffer_size, memory_pool)); + return ArrayData::Make(float64(), length, {nullptr, std::move(data)}, /*null_count=*/0); +} + } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 7f0ef641f6ceb..15ee4c013f791 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -2205,9 +2205,9 @@ TEST_F(TestNestedSortIndices, SortRecordBatch) { TestSort(GetRecordBatch()); } TEST_F(TestNestedSortIndices, SortTable) { TestSort(GetTable()); } // ---------------------------------------------------------------------- -// Tests for Rank +// Tests for Rank and Percentile Rank -class TestRank : public ::testing::Test { +class BaseTestRank : public ::testing::Test { protected: // Create several test datums from `array`. One of which is the unmodified Array // while the rest are chunked variants based on it. @@ -2236,6 +2236,11 @@ class TestRank : public ::testing::Test { datums_ = {chunked_array}; } + DatumVector datums_; +}; + +class TestRank : public BaseTestRank { + protected: static void AssertRank(const DatumVector& datums, SortOrder order, NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker, const std::shared_ptr& expected) { @@ -2310,8 +2315,6 @@ class TestRank : public ::testing::Test { AssertRank(SortOrder::Descending, NullPlacement::AtStart, RankOptions::Dense, ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]")); } - - DatumVector datums_; }; TEST_F(TestRank, Real) { @@ -2466,5 +2469,152 @@ TEST_F(TestRank, EmptyChunks) { } } +class TestRankPercentile : public BaseTestRank { + public: + void AssertRankPercentile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { + const std::vector sort_keys{SortKey("foo", order)}; + RankPercentileOptions options(sort_keys, null_placement, factor); + ARROW_SCOPED_TRACE("options = ", options.ToString()); + for (const auto& datum : datums) { + ASSERT_OK_AND_ASSIGN(auto actual, + CallFunction("rank_percentile", {datum}, &options)); + ValidateOutput(actual); + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } + } + + void AssertRankPercentile(const DatumVector& datums, SortOrder order, + NullPlacement null_placement, double factor, + const std::string& expected) { + AssertRankPercentile(datums, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); + } + + void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, + const std::shared_ptr& expected) { + AssertRankPercentile(datums_, order, null_placement, factor, expected); + } + + void AssertRankPercentile(SortOrder order, NullPlacement null_placement, double factor, + const std::string& expected) { + AssertRankPercentile(datums_, order, null_placement, factor, + ArrayFromJSON(float64(), expected)); + } + + void AssertRankPercentileEmpty(std::shared_ptr type) { + for (auto null_placement : AllNullPlacements()) { + for (auto order : AllOrders()) { + AssertRankPercentile({ArrayFromJSON(type, "[]")}, order, null_placement, + /*factor=*/1.0, "[]"); + AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/1.0, "[0.5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null]")}, order, null_placement, + /*factor=*/10.0, "[5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/1.0, "[0.5, 0.5, 0.5]"); + AssertRankPercentile({ArrayFromJSON(type, "[null, null, null]")}, order, + null_placement, /*factor=*/100.0, "[50, 50, 50]"); + } + } + } + + // Expecting an input ordered like [1, 2, 1, 2, 1] + void AssertRankPercentile_12121() { + for (auto null_placement : AllNullPlacements()) { + AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, + "[30.0, 80.0, 30.0, 80.0, 30.0]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, + "[70.0, 20.0, 70.0, 20.0, 70.0]"); + } + } + + // Expecting an input ordered like [null, 1, null, 2, null] + void AssertRankPercentile_N1N2N() { + AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtStart, 1.0, + "[0.3, 0.7, 0.3, 0.9, 0.3]"); + AssertRankPercentile(SortOrder::Ascending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.1, 0.7, 0.3, 0.7]"); + AssertRankPercentile(SortOrder::Descending, NullPlacement::AtStart, 1.0, + "[0.3, 0.9, 0.3, 0.7, 0.3]"); + AssertRankPercentile(SortOrder::Descending, NullPlacement::AtEnd, 1.0, + "[0.7, 0.3, 0.7, 0.1, 0.7]"); + } + + void AssertRankPercentileNumeric(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankPercentileEmpty(type); + + // Reproduce the example from https://en.wikipedia.org/wiki/Percentile_rank + SetInput(ArrayFromJSON(type, "[7, 5, 5, 4, 4, 3, 3, 3, 2, 1]")); + for (auto null_placement : AllNullPlacements()) { + AssertRankPercentile(SortOrder::Ascending, null_placement, 10.0, + "[9.5, 8.0, 8.0, 6.0, 6.0, 3.5, 3.5, 3.5, 1.5, 0.5]"); + AssertRankPercentile(SortOrder::Ascending, null_placement, 100.0, + "[95, 80, 80, 60, 60, 35, 35, 35, 15, 5]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 10.0, + "[0.5, 2.0, 2.0, 4.0, 4.0, 6.5, 6.5, 6.5, 8.5, 9.5]"); + AssertRankPercentile(SortOrder::Descending, null_placement, 100.0, + "[5, 20, 20, 40, 40, 65, 65, 65, 85, 95]"); + } + + // With nulls + SetInput(ArrayFromJSON(type, "[null, 1, null, 2, null]")); + AssertRankPercentile_N1N2N(); + } + + void AssertRankPercentileBinaryLike(std::shared_ptr type) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + AssertRankPercentileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["", "ab", "", "ab", ""])")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "", null, "ab", null])")); + AssertRankPercentile_N1N2N(); + } +}; + +TEST_F(TestRankPercentile, Real) { + for (auto type : ::arrow::FloatingPointTypes()) { + AssertRankPercentileNumeric(type); + } +} + +TEST_F(TestRankPercentile, Integral) { + for (auto type : ::arrow::IntTypes()) { + AssertRankPercentileNumeric(type); + } +} + +TEST_F(TestRankPercentile, Boolean) { + auto type = boolean(); + AssertRankPercentileEmpty(type); + + SetInput(ArrayFromJSON(type, "[false, true, false, true, false]")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, "[null, false, null, true, null]")); + AssertRankPercentile_N1N2N(); +} + +TEST_F(TestRankPercentile, BinaryLike) { + for (auto type : BaseBinaryTypes()) { + AssertRankPercentileBinaryLike(type); + } +} + +TEST_F(TestRankPercentile, FixedSizeBinary) { + auto type = fixed_size_binary(3); + AssertRankPercentileEmpty(type); + + SetInput(ArrayFromJSON(type, R"(["abc", "def", "abc", "def", "abc"])")); + AssertRankPercentile_12121(); + // With nulls + SetInput(ArrayFromJSON(type, R"([null, "abc", null, "def", null])")); + AssertRankPercentile_N1N2N(); +} + } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 92f3e44039147..64fda25851dee 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1796,19 +1796,21 @@ in the respective option classes. Binary- and String-like inputs are ordered lexicographically as bytestrings, even for String types. -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=======================+============+=========================================================+===================+================================+================+ -| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(5) \(6) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ -| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(5) | -+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================================================+===================+=================================+================+ +| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| rank_percentile | Unary | Boolean, Numeric, Temporal, Binary- and String-like | Float64 | :struct:`RankPercentileOptions` | \(5) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(6) \(7) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ +| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(6) | ++-----------------------+------------+---------------------------------------------------------+-------------------+---------------------------------+----------------+ * \(1) The output is an array of indices into the input, that define a @@ -1823,13 +1825,17 @@ in the respective option classes. :func:`std::nth_element`). *N* is given in :member:`PartitionNthOptions::pivot`. -* \(4) The output is a one-based numerical array of ranks +* \(4) The output is a one-based numerical array of ranks. -* \(5) The input can be an array, chunked array, record batch or +* \(5) The output is an array of quantiles between 0 and a constant *factor*. + The *factor* can be configured in :class:`RankPercentileOptions` + (use 100.0 for a percentile rank). + +* \(6) The input can be an array, chunked array, record batch or table. If the input is a record batch or table, one or more sort keys must be specified. -* \(6) The output is an array of indices into the input, that define a +* \(7) The output is an array of indices into the input, that define a non-stable sort of the input. .. _cpp-compute-vector-structural-transforms: