From 9ae88e87e686b6b24d70e0d5430dd9f3ef57f4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sat, 23 May 2026 21:13:16 +0000 Subject: [PATCH 01/11] feat(format): draft arrow.range canonical extension type Add a canonical extension type for bounded ranges (mathematical intervals), distinct from Arrow's calendar Interval (duration) type. - Spec: docs/source/format/CanonicalExtensions.rst adds the Range section. Storage is Struct with both bounds nullable (null = +/-infinity, treated as exclusive). A closed parameter (left/right/both/neither, pandas vocabulary) is carried as JSON extension metadata; the subtype is read from storage. Disambiguates from the calendar Interval type per DB convention (INTERVAL = duration, RANGE/PERIOD = bounded set). - C++ reference impl: cpp/src/arrow/extension/range.{h,cc} (RangeType/RangeArray) with serialize/deserialize, storage validation, registration in the global registry, tests, and CMake/meson wiring. --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 3 +- cpp/src/arrow/extension/meson.build | 2 + cpp/src/arrow/extension/range.cc | 206 +++++++++++++++ cpp/src/arrow/extension/range.h | 104 ++++++++ cpp/src/arrow/extension/range_test.cc | 290 +++++++++++++++++++++ cpp/src/arrow/extension_type.cc | 2 + cpp/src/arrow/meson.build | 1 + docs/source/format/CanonicalExtensions.rst | 96 +++++++ 9 files changed, 704 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/extension/range.cc create mode 100644 cpp/src/arrow/extension/range.h create mode 100644 cpp/src/arrow/extension/range_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 45cd7e838121..0d5adf587bba 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1022,6 +1022,7 @@ if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc extension/opaque.cc + extension/range.cc extension/tensor_internal.cc 
extension/variable_shape_tensor.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index ae52bc32a998..966c927bf2ce 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -18,7 +18,8 @@ set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) - list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc) + list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc + range_test.cc) endif() add_arrow_test(test diff --git a/cpp/src/arrow/extension/meson.build b/cpp/src/arrow/extension/meson.build index 84dafe4bbe32..8be6b1321a1c 100644 --- a/cpp/src/arrow/extension/meson.build +++ b/cpp/src/arrow/extension/meson.build @@ -21,6 +21,7 @@ if needs_json canonical_extension_tests += [ 'tensor_extension_array_test.cc', 'opaque_test.cc', + 'range_test.cc', ] endif @@ -38,6 +39,7 @@ install_headers( 'json.h', 'opaque.h', 'parquet_variant.h', + 'range.h', 'uuid.h', 'variable_shape_tensor.h', ], diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc new file mode 100644 index 000000000000..2d5d35b2f438 --- /dev/null +++ b/cpp/src/arrow/extension/range.cc @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/range.h" + +#include +#include + +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/util/logging_internal.h" + +#include +#include +#include + +namespace arrow::extension { + +namespace { + +/// Map RangeClosed -> the JSON string value used in serialization. +std::string_view ClosedToString(RangeClosed closed) { + switch (closed) { + case RangeClosed::Left: + return "left"; + case RangeClosed::Right: + return "right"; + case RangeClosed::Both: + return "both"; + case RangeClosed::Neither: + return "neither"; + } + // unreachable + return "right"; +} + +/// Parse the JSON "closed" string into a RangeClosed enum. +/// Returns an error if the string is not one of the four valid values. +Result ClosedFromString(std::string_view s) { + if (s == "left") return RangeClosed::Left; + if (s == "right") return RangeClosed::Right; + if (s == "both") return RangeClosed::Both; + if (s == "neither") return RangeClosed::Neither; + return Status::Invalid( + "Invalid value for RangeType \"closed\" parameter: \"", s, + "\". Expected one of: \"left\", \"right\", \"both\", \"neither\"."); +} + +/// Build the storage Struct type for a given value subtype. +std::shared_ptr MakeStorageType(const std::shared_ptr& value_type) { + // Both "lower" and "upper" are nullable (null = infinite bound). + return struct_({field("lower", value_type, /*nullable=*/true), + field("upper", value_type, /*nullable=*/true)}); +} + +} // namespace + +// --------------------------------------------------------------------------- +// RangeType + +std::shared_ptr RangeType::value_type() const { + // storage_type() is a struct with two fields; both share the same type. 
+ return internal::checked_cast(*storage_type()).field(0)->type(); +} + +std::string RangeType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << extension_name() + << "[value_type=" << value_type()->ToString(show_metadata) + << ", closed=" << ClosedToString(closed_) << "]>"; + return ss.str(); +} + +bool RangeType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& other_range = internal::checked_cast(other); + return storage_type()->Equals(*other_range.storage_type()) && + closed_ == other_range.closed_; +} + +std::string RangeType::Serialize() const { + rapidjson::Document document; + document.SetObject(); + rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); + + auto closed_str = ClosedToString(closed_); + rapidjson::Value closed_value(closed_str.data(), + static_cast(closed_str.size()), + allocator); + document.AddMember(rapidjson::Value("closed", allocator), closed_value, allocator); + + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> RangeType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + // Validate storage type structure. 
+ if (storage_type->id() != Type::STRUCT) { + return Status::Invalid("RangeType storage type must be a Struct, got ", + storage_type->ToString()); + } + const auto& struct_type = internal::checked_cast(*storage_type); + if (struct_type.num_fields() != 2) { + return Status::Invalid( + "RangeType storage Struct must have exactly 2 fields, got ", + struct_type.num_fields()); + } + const auto& lower_field = struct_type.field(0); + const auto& upper_field = struct_type.field(1); + if (lower_field->name() != "lower") { + return Status::Invalid( + "RangeType storage Struct field 0 must be named \"lower\", got \"", + lower_field->name(), "\""); + } + if (upper_field->name() != "upper") { + return Status::Invalid( + "RangeType storage Struct field 1 must be named \"upper\", got \"", + upper_field->name(), "\""); + } + if (!lower_field->nullable()) { + return Status::Invalid("RangeType storage Struct field \"lower\" must be nullable"); + } + if (!upper_field->nullable()) { + return Status::Invalid("RangeType storage Struct field \"upper\" must be nullable"); + } + if (!lower_field->type()->Equals(*upper_field->type())) { + return Status::Invalid( + "RangeType storage Struct fields \"lower\" and \"upper\" must have the same " + "type, got \"", + lower_field->type()->ToString(), "\" and \"", upper_field->type()->ToString(), + "\""); + } + + // Parse "closed" parameter from JSON metadata. + // Empty metadata defaults to {"closed": "right"}. 
+ RangeClosed closed = RangeClosed::Right; + if (!serialized_data.empty()) { + rapidjson::Document document; + const auto& parsed = + document.Parse(serialized_data.data(), serialized_data.length()); + if (parsed.HasParseError()) { + return Status::Invalid("Invalid serialized JSON data for RangeType: ", + rapidjson::GetParseError_En(parsed.GetParseError()), ": ", + serialized_data); + } + if (!document.IsObject()) { + return Status::Invalid( + "Invalid serialized JSON data for RangeType: not an object"); + } + if (document.HasMember("closed")) { + const auto& closed_val = document["closed"]; + if (!closed_val.IsString()) { + return Status::Invalid( + "Invalid serialized JSON data for RangeType: \"closed\" is not a string"); + } + ARROW_ASSIGN_OR_RAISE( + closed, ClosedFromString(std::string_view(closed_val.GetString(), + closed_val.GetStringLength()))); + } + } + + return std::make_shared(std::move(storage_type), closed); +} + +std::shared_ptr RangeType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.range", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> RangeType::Make( + std::shared_ptr value_type, RangeClosed closed) { + auto storage = MakeStorageType(value_type); + return std::make_shared(std::move(storage), closed); +} + +// --------------------------------------------------------------------------- +// Free factory function + +std::shared_ptr range(std::shared_ptr value_type, + RangeClosed closed) { + auto result = RangeType::Make(std::move(value_type), closed); + ARROW_CHECK_OK(result.status()); + return std::move(result).ValueOrDie(); +} + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range.h b/cpp/src/arrow/extension/range.h new file mode 100644 index 000000000000..b87f847c4b0c --- /dev/null +++ b/cpp/src/arrow/extension/range.h @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or 
more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/extension_type.h" +#include "arrow/type.h" + +namespace arrow::extension { + +/// \brief Which bound(s) of an arrow.range interval are inclusive. +/// +/// Null (infinite) bounds are always exclusive regardless of this value. +enum class RangeClosed { + /// Lower bound is inclusive, upper bound is exclusive: [lower, upper) + Left, + /// Lower bound is exclusive, upper bound is inclusive: (lower, upper] + Right, + /// Both bounds are inclusive: [lower, upper] + Both, + /// Both bounds are exclusive: (lower, upper) + Neither, +}; + +/// \brief RangeType represents a bounded set (mathematical interval) over an +/// orderable Arrow type T. +/// +/// Storage is a Struct with exactly two nullable fields: +/// - "lower": T NULLABLE (null = unbounded below, i.e. -infinity) +/// - "upper": T NULLABLE (null = unbounded above, i.e. +infinity) +/// +/// The outer struct's validity bit marks a null/absent range. +/// +/// The "closed" parameter controls which finite bounds are inclusive. +/// Null (infinite) bounds are always treated as exclusive. +class ARROW_EXPORT RangeType : public ExtensionType { + public: + /// \brief Construct a RangeType. 
+ /// + /// \param[in] storage_type A two-field Struct type with nullable fields + /// "lower" and "upper" of the same orderable Arrow type T. + /// \param[in] closed Which bound(s) are inclusive. + explicit RangeType(std::shared_ptr storage_type, RangeClosed closed) + : ExtensionType(std::move(storage_type)), closed_(closed) {} + + std::string extension_name() const override { return "arrow.range"; } + std::string ToString(bool show_metadata = false) const override; + bool ExtensionEquals(const ExtensionType& other) const override; + std::string Serialize() const override; + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// \brief Create a RangeArray from ArrayData. + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Factory function. + /// + /// Constructs the required two-field struct storage type internally. + /// \param[in] value_type The orderable Arrow subtype T for lower and upper. + /// \param[in] closed Which bound(s) are inclusive. + static Result> Make(std::shared_ptr value_type, + RangeClosed closed = RangeClosed::Right); + + /// \brief Return the bound-inclusivity parameter. + RangeClosed closed() const { return closed_; } + + /// \brief Return the Arrow subtype T (the type of "lower" and "upper" fields). + std::shared_ptr value_type() const; + + private: + RangeClosed closed_; +}; + +/// \brief Array class for arrow.range extension arrays. +class ARROW_EXPORT RangeArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Create a RangeType with the given value subtype and closed parameter. +/// +/// This is a convenience wrapper around RangeType::Make that aborts on error. +/// For recoverable error handling prefer RangeType::Make. 
+ARROW_EXPORT std::shared_ptr range(std::shared_ptr value_type, + RangeClosed closed = RangeClosed::Right); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc new file mode 100644 index 000000000000..eb9c7abe0074 --- /dev/null +++ b/cpp/src/arrow/extension/range_test.cc @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "arrow/extension/range.h" +#include "arrow/extension_type.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_pointer_cast; + +// --------------------------------------------------------------------------- +// Helpers + +static std::shared_ptr RangeInt32Right() { + return checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Right)); +} + +static std::shared_ptr RangeInt32Both() { + return checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Both)); +} + +static std::shared_ptr RangeInt64Left() { + return checked_pointer_cast( + extension::range(int64(), extension::RangeClosed::Left)); +} + +// --------------------------------------------------------------------------- +// Basics + +TEST(RangeType, Basics) { + auto type = RangeInt32Right(); + ASSERT_EQ("arrow.range", type->extension_name()); + ASSERT_EQ(*int32(), *type->value_type()); + ASSERT_EQ(extension::RangeClosed::Right, type->closed()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_THAT(type->Serialize(), ::testing::Not(::testing::IsEmpty())); + ASSERT_EQ(R"({"closed":"right"})", type->Serialize()); + ASSERT_EQ("extension", + type->ToString(false)); +} + +TEST(RangeType, AllClosedValues) { + using C = extension::RangeClosed; + auto left = checked_pointer_cast( + extension::range(int32(), C::Left)); + auto right = checked_pointer_cast( + extension::range(int32(), C::Right)); + auto both = checked_pointer_cast( + extension::range(int32(), C::Both)); + auto neither = checked_pointer_cast( + extension::range(int32(), C::Neither)); + + ASSERT_EQ(R"({"closed":"left"})", left->Serialize()); + ASSERT_EQ(R"({"closed":"right"})", right->Serialize()); + ASSERT_EQ(R"({"closed":"both"})", 
both->Serialize()); + ASSERT_EQ(R"({"closed":"neither"})", neither->Serialize()); +} + +// --------------------------------------------------------------------------- +// Equals + +TEST(RangeType, Equals) { + auto type_i32_right = RangeInt32Right(); + auto type_i32_both = RangeInt32Both(); + auto type_i64_left = RangeInt64Left(); + auto type_i32_right2 = RangeInt32Right(); + + // Same object. + ASSERT_EQ(*type_i32_right, *type_i32_right); + + // Different instances but same parameters. + ASSERT_EQ(*type_i32_right, *type_i32_right2); + + // Different closed value. + ASSERT_NE(*type_i32_right, *type_i32_both); + + // Different value_type. + ASSERT_NE(*type_i32_right, *type_i64_left); + + // Not equal to a non-range type. + ASSERT_NE(*type_i32_right, *arrow::null()); + ASSERT_NE(*type_i32_right, *arrow::int32()); +} + +// --------------------------------------------------------------------------- +// CreateFromArray + +TEST(RangeType, CreateFromArray) { + auto type = RangeInt32Right(); + // Build a StructArray that matches the storage type. 
+ auto storage_type = type->storage_type(); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, + {field("lower", int32(), true), + field("upper", int32(), true)})); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(3, array->length()); + ASSERT_EQ(0, array->null_count()); +} + +// --------------------------------------------------------------------------- +// Deserialize - valid cases + +void CheckDeserialize(const std::string& serialized, + const std::shared_ptr& expected) { + auto type = checked_pointer_cast(expected); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*expected, *deserialized); +} + +TEST(RangeType, Deserialize) { + // Normal JSON + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "right"})", + extension::range(int32(), extension::RangeClosed::Right))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "left"})", + extension::range(int32(), extension::RangeClosed::Left))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "both"})", + extension::range(int32(), extension::RangeClosed::Both))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "neither"})", + extension::range(int32(), extension::RangeClosed::Neither))); + + // Extra fields are tolerated (forward-compatibility). + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "right", "extra": 42})", + extension::range(int32(), extension::RangeClosed::Right))); + + // Empty metadata defaults to "right". + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize("", extension::range(int32(), extension::RangeClosed::Right))); + + // Empty JSON object (no "closed" key) also defaults to "right". 
+ ASSERT_NO_FATAL_FAILURE( + CheckDeserialize("{}", extension::range(int32(), extension::RangeClosed::Right))); +} + +// --------------------------------------------------------------------------- +// Deserialize - invalid cases + +TEST(RangeType, DeserializeInvalidMetadata) { + auto type = RangeInt32Right(); + + // Empty string is valid (defaults to "right"); truly malformed JSON fails. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Missing a name for object member"), + type->Deserialize(type->storage_type(), "{")); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("not an object"), + type->Deserialize(type->storage_type(), "[]")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("\"closed\" is not a string"), + type->Deserialize(type->storage_type(), R"({"closed": 42})")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid value for RangeType"), + type->Deserialize(type->storage_type(), R"({"closed": "unknown"})")); +} + +TEST(RangeType, DeserializeInvalidStorage) { + auto type = RangeInt32Right(); + auto wrong_storage_not_struct = int32(); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("must be a Struct"), + type->Deserialize(wrong_storage_not_struct, R"({"closed":"right"})")); + + // Wrong number of fields. + auto one_field = struct_({field("lower", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("exactly 2 fields"), + type->Deserialize(one_field, R"({"closed":"right"})")); + + // Wrong field name for field 0. + auto bad_lower_name = + struct_({field("start", int32(), true), field("upper", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("named \"lower\""), + type->Deserialize(bad_lower_name, R"({"closed":"right"})")); + + // Wrong field name for field 1. 
+ auto bad_upper_name = + struct_({field("lower", int32(), true), field("end", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("named \"upper\""), + type->Deserialize(bad_upper_name, R"({"closed":"right"})")); + + // Fields have different types. + auto mismatched_types = + struct_({field("lower", int32(), true), field("upper", int64(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("same type"), + type->Deserialize(mismatched_types, R"({"closed":"right"})")); + + // Non-nullable lower field. + auto lower_not_nullable = + struct_({field("lower", int32(), /*nullable=*/false), field("upper", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("\"lower\" must be nullable"), + type->Deserialize(lower_not_nullable, R"({"closed":"right"})")); + + // Non-nullable upper field. + auto upper_not_nullable = + struct_({field("lower", int32(), true), field("upper", int32(), /*nullable=*/false)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("\"upper\" must be nullable"), + type->Deserialize(upper_not_nullable, R"({"closed":"right"})")); +} + +// --------------------------------------------------------------------------- +// Metadata (Serialize/Deserialize) round-trip + +TEST(RangeType, MetadataRoundTrip) { + using C = extension::RangeClosed; + for (const auto& type : + {extension::range(int32(), C::Left), extension::range(int32(), C::Right), + extension::range(int32(), C::Both), extension::range(int32(), C::Neither), + extension::range(int64(), C::Right), extension::range(date32(), C::Both)}) { + auto rt = checked_pointer_cast(type); + std::string serialized = rt->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + rt->Deserialize(rt->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized) << "Round-trip failed for: " << type->ToString(); + } +} + +// --------------------------------------------------------------------------- +// IPC (BatchRoundTrip) -- 
registration round-trip + +TEST(RangeType, BatchRoundTrip) { + auto type = RangeInt32Right(); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, + {field("lower", int32(), true), + field("upper", int32(), true)})); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("rng", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index ce88c9517411..45cf5ed1e16f 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -31,6 +31,7 @@ #ifdef ARROW_JSON # include "arrow/extension/fixed_shape_tensor.h" # include "arrow/extension/opaque.h" +# include "arrow/extension/range.h" # include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/extension/json.h" @@ -156,6 +157,7 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); + ext_types.push_back(extension::range(int32(), extension::RangeClosed::Right)); ext_types.push_back(extension::variable_shape_tensor(int64(), 0)); #endif diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 
4b8faebecfd7..d03a41d56966 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -507,6 +507,7 @@ if needs_json 'sources': [ 'extension/fixed_shape_tensor.cc', 'extension/opaque.cc', + 'extension/range.cc', 'extension/tensor_internal.cc', 'extension/variable_shape_tensor.cc', 'json/options.cc', diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index c6cd8f3ea13a..c1840d7bdf8c 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -573,6 +573,102 @@ This extension type is intended to be compatible with ANSI SQL's ``TIMESTAMP WIT It is also *permissible* for the ``offset_minutes`` field to be dictionary-encoded or run-end-encoded. +.. _range_extension: + +Range +===== + +Range represents a bounded set (mathematical interval) defined by a lower and +an upper bound over an orderable Arrow type T. It is the Arrow equivalent of +PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. + +.. note:: + + **Disambiguation from Arrow's calendar** ``Interval`` **type.** + Arrow already has an ``Interval`` type (``INTERVAL_MONTHS``, + ``INTERVAL_DAY_TIME``, ``INTERVAL_MONTH_DAY_NANO``) that represents a + *duration* -- a signed difference between two points in time. The + ``arrow.range`` extension type is an entirely different concept: it + represents a *bounded set* with explicit lower and upper endpoints, + analogous to a closed or open interval in mathematics. The naming + follows database convention: SQL uses ``INTERVAL`` for durations and + ``RANGE`` (or ``PERIOD``) for bounded sets. + +* Extension name: ``arrow.range``. + +* The storage type of the extension is a ``Struct`` with exactly **two fields, + in order**: + + * ``lower``: the lower bound, type **T**, **nullable**. + A null value means the range is unbounded below (negative infinity). + * ``upper``: the upper bound, type **T**, **nullable**. 
+ A null value means the range is unbounded above (positive infinity). + + **T** (the *subtype* or *value type*) may be any orderable Arrow type: + integer, floating-point, decimal, date, time, or timestamp types. Both + fields share the same type T. The subtype is read directly from the + storage struct and is **not** duplicated in the extension metadata. + + The outer struct's validity bit marks a null/absent range (a missing range, + distinct from an empty range). + +.. note:: + + Both ``lower`` and ``upper`` struct fields **must** be nullable. A null + bound represents an infinite endpoint and is **always treated as + exclusive**, regardless of the value of the ``closed`` parameter. You + cannot include positive or negative infinity in a closed bound. + +* Extension type parameters: + + * **closed** = which finite bound(s) are inclusive. Allowed values + (following pandas interval vocabulary): + + * ``"left"`` -- lower bound inclusive, upper bound exclusive: ``[lower, upper)`` + * ``"right"`` -- lower bound exclusive, upper bound inclusive: ``(lower, upper]`` + * ``"both"`` -- both bounds inclusive: ``[lower, upper]`` + * ``"neither"`` -- both bounds exclusive: ``(lower, upper)`` + +* Description of the serialization: + + The extension metadata **must** be either an empty string or a valid JSON + object. The JSON object may contain one key: + + * ``"closed"`` (string, optional): one of ``"left"``, ``"right"``, + ``"both"``, or ``"neither"``. When absent (including when the metadata is + an empty string), it defaults to ``"right"``. + + Additional keys in the JSON object should be ignored to allow + forward-compatible extensions. 
+ + Examples: + + - ``{"closed": "right"}`` -- half-open interval, right-closed (default) + - ``{"closed": "left"}`` -- half-open interval, left-closed + - ``{"closed": "both"}`` -- closed interval + - ``{"closed": "neither"}``-- open interval + - ``""`` -- equivalent to ``{"closed": "right"}`` + +* Semantics: + + * A range value ``[lower, upper]`` with ``closed="both"`` contains every + value x such that ``lower <= x <= upper``. + * A range value ``[lower, upper]`` with ``closed="neither"`` is *empty* + when ``lower == upper`` (a degenerate open interval), and contains values + x such that ``lower < x < upper`` otherwise. + * Implementations should document behavior when ``lower > upper``; the + recommended interpretation is that such a range is *empty*. + * A null outer struct value represents a missing (absent) range, not an + empty range. + * A null ``lower`` field means the range has no lower bound (extends to + negative infinity); the lower bound is always exclusive in this case. + * A null ``upper`` field means the range has no upper bound (extends to + positive infinity); the upper bound is always exclusive in this case. + * A range where both ``lower`` and ``upper`` are null represents the + universal range ``(-inf, +inf)``. + +.. _range types: https://www.postgresql.org/docs/current/rangetypes.html + Community Extension Types ========================= From 93502baf8c2e618cba1fb72047d19980e2b6c214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sat, 23 May 2026 21:18:14 +0000 Subject: [PATCH 02/11] fix(format): require explicit closed in arrow.range metadata The closedness is no longer defaulted on the wire: empty metadata or a JSON object without a "closed" key is now rejected by Deserialize, so a serialized arrow.range is always unambiguous. The C++ convenience default argument for constructing a RangeType in code is left-closed ([lower, upper)), matching the PostgreSQL/Rust/Python range convention. 
Spec and tests updated. --- cpp/src/arrow/extension/range.cc | 55 ++++++++++++---------- cpp/src/arrow/extension/range.h | 4 +- cpp/src/arrow/extension/range_test.cc | 26 ++++++---- cpp/src/arrow/extension_type.cc | 2 +- docs/source/format/CanonicalExtensions.rst | 16 ++++--- 5 files changed, 59 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc index 2d5d35b2f438..97733fe04010 100644 --- a/cpp/src/arrow/extension/range.cc +++ b/cpp/src/arrow/extension/range.cc @@ -149,33 +149,36 @@ Result> RangeType::Deserialize( "\""); } - // Parse "closed" parameter from JSON metadata. - // Empty metadata defaults to {"closed": "right"}. - RangeClosed closed = RangeClosed::Right; - if (!serialized_data.empty()) { - rapidjson::Document document; - const auto& parsed = - document.Parse(serialized_data.data(), serialized_data.length()); - if (parsed.HasParseError()) { - return Status::Invalid("Invalid serialized JSON data for RangeType: ", - rapidjson::GetParseError_En(parsed.GetParseError()), ": ", - serialized_data); - } - if (!document.IsObject()) { - return Status::Invalid( - "Invalid serialized JSON data for RangeType: not an object"); - } - if (document.HasMember("closed")) { - const auto& closed_val = document["closed"]; - if (!closed_val.IsString()) { - return Status::Invalid( - "Invalid serialized JSON data for RangeType: \"closed\" is not a string"); - } - ARROW_ASSIGN_OR_RAISE( - closed, ClosedFromString(std::string_view(closed_val.GetString(), - closed_val.GetStringLength()))); - } + // Parse the required "closed" parameter from JSON metadata. The closedness + // is not defaulted on the wire: empty metadata or a missing key is invalid. 
+ if (serialized_data.empty()) { + return Status::Invalid( + "RangeType metadata must be a JSON object with a required \"closed\" key, " + "got an empty string"); + } + rapidjson::Document document; + const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length()); + if (parsed.HasParseError()) { + return Status::Invalid("Invalid serialized JSON data for RangeType: ", + rapidjson::GetParseError_En(parsed.GetParseError()), ": ", + serialized_data); + } + if (!document.IsObject()) { + return Status::Invalid("Invalid serialized JSON data for RangeType: not an object"); + } + if (!document.HasMember("closed")) { + return Status::Invalid( + "RangeType metadata is missing the required \"closed\" key: ", serialized_data); + } + const auto& closed_val = document["closed"]; + if (!closed_val.IsString()) { + return Status::Invalid( + "Invalid serialized JSON data for RangeType: \"closed\" is not a string"); } + ARROW_ASSIGN_OR_RAISE( + RangeClosed closed, + ClosedFromString( + std::string_view(closed_val.GetString(), closed_val.GetStringLength()))); return std::make_shared(std::move(storage_type), closed); } diff --git a/cpp/src/arrow/extension/range.h b/cpp/src/arrow/extension/range.h index b87f847c4b0c..3af60c1937ab 100644 --- a/cpp/src/arrow/extension/range.h +++ b/cpp/src/arrow/extension/range.h @@ -76,7 +76,7 @@ class ARROW_EXPORT RangeType : public ExtensionType { /// \param[in] value_type The orderable Arrow subtype T for lower and upper. /// \param[in] closed Which bound(s) are inclusive. static Result> Make(std::shared_ptr value_type, - RangeClosed closed = RangeClosed::Right); + RangeClosed closed = RangeClosed::Left); /// \brief Return the bound-inclusivity parameter. RangeClosed closed() const { return closed_; } @@ -99,6 +99,6 @@ class ARROW_EXPORT RangeArray : public ExtensionArray { /// This is a convenience wrapper around RangeType::Make that aborts on error. /// For recoverable error handling prefer RangeType::Make. 
ARROW_EXPORT std::shared_ptr range(std::shared_ptr value_type, - RangeClosed closed = RangeClosed::Right); + RangeClosed closed = RangeClosed::Left); } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index eb9c7abe0074..2988439986c4 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -156,14 +156,14 @@ TEST(RangeType, Deserialize) { ASSERT_NO_FATAL_FAILURE( CheckDeserialize(R"({"closed": "right", "extra": 42})", extension::range(int32(), extension::RangeClosed::Right))); +} - // Empty metadata defaults to "right". - ASSERT_NO_FATAL_FAILURE( - CheckDeserialize("", extension::range(int32(), extension::RangeClosed::Right))); - - // Empty JSON object (no "closed" key) also defaults to "right". - ASSERT_NO_FATAL_FAILURE( - CheckDeserialize("{}", extension::range(int32(), extension::RangeClosed::Right))); +TEST(RangeType, DefaultClosedIsLeft) { + // The C++ convenience default is left-closed; the wire format still always + // carries an explicit "closed". + auto type = checked_pointer_cast(extension::range(int32())); + ASSERT_EQ(extension::RangeClosed::Left, type->closed()); + ASSERT_EQ(R"({"closed":"left"})", type->Serialize()); } // --------------------------------------------------------------------------- @@ -172,7 +172,17 @@ TEST(RangeType, Deserialize) { TEST(RangeType, DeserializeInvalidMetadata) { auto type = RangeInt32Right(); - // Empty string is valid (defaults to "right"); truly malformed JSON fails. + // "closed" is required on the wire: empty metadata is invalid. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("empty string"), + type->Deserialize(type->storage_type(), "")); + + // A JSON object without the "closed" key is invalid. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("missing the required \"closed\" key"), + type->Deserialize(type->storage_type(), "{}")); + + // Truly malformed JSON fails. 
EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("Missing a name for object member"), type->Deserialize(type->storage_type(), "{")); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 45cf5ed1e16f..1dd840621f20 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -157,7 +157,7 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); - ext_types.push_back(extension::range(int32(), extension::RangeClosed::Right)); + ext_types.push_back(extension::range(int32())); ext_types.push_back(extension::variable_shape_tensor(int64(), 0)); #endif diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index c1840d7bdf8c..55d4d195a832 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -631,23 +631,25 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. * Description of the serialization: - The extension metadata **must** be either an empty string or a valid JSON - object. The JSON object may contain one key: + The extension metadata **must** be a valid JSON object containing the + **required** key: - * ``"closed"`` (string, optional): one of ``"left"``, ``"right"``, - ``"both"``, or ``"neither"``. When absent (including when the metadata is - an empty string), it defaults to ``"right"``. + * ``"closed"`` (string, **required**): one of ``"left"``, ``"right"``, + ``"both"``, or ``"neither"``. + + The closedness is **not** defaulted on the wire: an empty metadata string, + or a JSON object without a ``"closed"`` key, is invalid. This keeps the + serialized form unambiguous for consumers. Additional keys in the JSON object should be ignored to allow forward-compatible extensions. 
Examples: - - ``{"closed": "right"}`` -- half-open interval, right-closed (default) + - ``{"closed": "right"}`` -- half-open interval, right-closed - ``{"closed": "left"}`` -- half-open interval, left-closed - ``{"closed": "both"}`` -- closed interval - ``{"closed": "neither"}``-- open interval - - ``""`` -- equivalent to ``{"closed": "right"}`` * Semantics: From 68600a9f40282fff9c4cccc115a2ee785fbcb29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sat, 23 May 2026 21:40:35 +0000 Subject: [PATCH 03/11] test(format): fix arrow.range test compile and link errors Verified by building the arrow-canonical-extensions-test target (50/50 pass, 10/10 RangeType). Two fixes to the previously-uncompiled test: - include arrow/array/array_nested.h for the full StructArray definition (it is only forward-declared in type_fwd.h). - wrap the CheckDeserialize helper in an anonymous namespace to avoid a link-time collision with the identically named helper in opaque_test.cc. 
--- cpp/src/arrow/extension/range_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index 2988439986c4..b9526024328a 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -18,6 +18,7 @@ #include #include +#include "arrow/array/array_nested.h" #include "arrow/extension/range.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" @@ -129,6 +130,8 @@ TEST(RangeType, CreateFromArray) { // --------------------------------------------------------------------------- // Deserialize - valid cases +namespace { + void CheckDeserialize(const std::string& serialized, const std::shared_ptr& expected) { auto type = checked_pointer_cast(expected); @@ -137,6 +140,8 @@ void CheckDeserialize(const std::string& serialized, ASSERT_EQ(*expected, *deserialized); } +} // namespace + TEST(RangeType, Deserialize) { // Normal JSON ASSERT_NO_FATAL_FAILURE( From 75059ccc944fde90443aa9a6e0c97a5cf3bf0cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 00:01:29 +0000 Subject: [PATCH 04/11] style(format): align range extension type with neighbor C++ conventions --- cpp/src/arrow/extension/range.cc | 2 +- cpp/src/arrow/extension/range.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc index 97733fe04010..fe7089953932 100644 --- a/cpp/src/arrow/extension/range.cc +++ b/cpp/src/arrow/extension/range.cc @@ -78,7 +78,7 @@ std::shared_ptr RangeType::value_type() const { std::string RangeType::ToString(bool show_metadata) const { std::stringstream ss; - ss << "extension<" << extension_name() + ss << "extension<" << this->extension_name() << "[value_type=" << value_type()->ToString(show_metadata) << ", closed=" << ClosedToString(closed_) << "]>"; return ss.str(); diff --git a/cpp/src/arrow/extension/range.h 
b/cpp/src/arrow/extension/range.h index 3af60c1937ab..ab11734507b7 100644 --- a/cpp/src/arrow/extension/range.h +++ b/cpp/src/arrow/extension/range.h @@ -17,8 +17,6 @@ #pragma once -#include - #include "arrow/extension_type.h" #include "arrow/type.h" From d315a468db6079480054d01146c888f08db3f8be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 00:01:29 +0000 Subject: [PATCH 05/11] docs(format): refine arrow.range spec and add status and C++ API entries --- docs/source/cpp/api/extension.rst | 14 +++++++ docs/source/format/CanonicalExtensions.rst | 47 ++++++++-------------- docs/source/status.rst | 2 + 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/docs/source/cpp/api/extension.rst b/docs/source/cpp/api/extension.rst index 5b9620907f2b..6b1e2e9a8df0 100644 --- a/docs/source/cpp/api/extension.rst +++ b/docs/source/cpp/api/extension.rst @@ -42,6 +42,10 @@ Extension Type classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeType + :project: arrow_cpp + :members: + Extension Array classes ======================= @@ -61,3 +65,13 @@ Extension Array classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeArray + :project: arrow_cpp + :members: + +Extension functions +=================== + +.. doxygenfunction:: arrow::extension::range + :project: arrow_cpp + diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 55d4d195a832..361083c38c4f 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -609,15 +609,14 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. fields share the same type T. The subtype is read directly from the storage struct and is **not** duplicated in the extension metadata. - The outer struct's validity bit marks a null/absent range (a missing range, - distinct from an empty range). - -.. 
note:: - - Both ``lower`` and ``upper`` struct fields **must** be nullable. A null - bound represents an infinite endpoint and is **always treated as - exclusive**, regardless of the value of the ``closed`` parameter. You - cannot include positive or negative infinity in a closed bound. + Both ``lower`` and ``upper`` fields **must** be nullable. A null bound + represents an infinite endpoint and is **always treated as exclusive**, + regardless of the value of the ``closed`` parameter; positive and negative + infinity can never be included in a closed bound. A null ``lower`` means + the range extends to negative infinity, a null ``upper`` means it extends to + positive infinity, and a range whose ``lower`` and ``upper`` are both null + is the universal range ``(-inf, +inf)``. The outer struct's validity bit + marks a null/absent range (a missing range, distinct from an empty range). * Extension type parameters: @@ -629,6 +628,12 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. * ``"both"`` -- both bounds inclusive: ``[lower, upper]`` * ``"neither"`` -- both bounds exclusive: ``(lower, upper)`` + A range thus contains every value x permitted by its finite bounds and + ``closed`` setting: with ``closed="both"`` every x such that + ``lower <= x <= upper``, with ``closed="neither"`` every x such that + ``lower < x < upper``. A range is *empty* when ``lower > upper``, or when + ``lower == upper`` and at least one bound is exclusive. + * Description of the serialization: The extension metadata **must** be a valid JSON object containing the @@ -639,10 +644,8 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. The closedness is **not** defaulted on the wire: an empty metadata string, or a JSON object without a ``"closed"`` key, is invalid. This keeps the - serialized form unambiguous for consumers. - - Additional keys in the JSON object should be ignored to allow - forward-compatible extensions. + serialized form unambiguous for consumers. 
Additional keys in the JSON + object should be ignored to allow forward-compatible extensions. Examples: @@ -651,24 +654,6 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. - ``{"closed": "both"}`` -- closed interval - ``{"closed": "neither"}``-- open interval -* Semantics: - - * A range value ``[lower, upper]`` with ``closed="both"`` contains every - value x such that ``lower <= x <= upper``. - * A range value ``[lower, upper]`` with ``closed="neither"`` is *empty* - when ``lower == upper`` (a degenerate open interval), and contains values - x such that ``lower < x < upper`` otherwise. - * Implementations should document behavior when ``lower > upper``; the - recommended interpretation is that such a range is *empty*. - * A null outer struct value represents a missing (absent) range, not an - empty range. - * A null ``lower`` field means the range has no lower bound (extends to - negative infinity); the lower bound is always exclusive in this case. - * A null ``upper`` field means the range has no upper bound (extends to - positive infinity); the upper bound is always exclusive in this case. - * A range where both ``lower`` and ``upper`` are null represents the - universal range ``(-inf, +inf)``. - .. 
_range types: https://www.postgresql.org/docs/current/rangetypes.html Community Extension Types diff --git a/docs/source/status.rst b/docs/source/status.rst index 6379741878ca..f3af1a50e0d2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -131,6 +131,8 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Parquet Variant | | | ✓ | | | ✓ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Range | ✓ | | | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: From 257a5a6e992bdb334c2d73416e875a3d7f628bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 00:01:29 +0000 Subject: [PATCH 06/11] feat(python): add pyarrow bindings for arrow.range extension type --- docs/source/python/api/arrays.rst | 2 + docs/source/python/api/datatypes.rst | 2 + python/pyarrow/__init__.py | 9 +- python/pyarrow/array.pxi | 23 ++++ python/pyarrow/includes/libarrow.pxd | 21 ++++ python/pyarrow/lib.pxd | 4 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 6 + python/pyarrow/tests/test_extension_type.py | 66 +++++++++++ python/pyarrow/types.pxi | 123 ++++++++++++++++++++ 10 files changed, 254 insertions(+), 4 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 290ce09befb1..5a4dcecdc567 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -101,6 +101,7 @@ may expose data type-specific methods or properties. JsonArray UuidArray Bool8Array + RangeArray .. _api.scalar: @@ -169,3 +170,4 @@ classes may expose data type-specific methods or properties. 
JsonScalar UuidScalar Bool8Scalar + RangeScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index ea9e547d32c7..71ee00557f33 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -73,6 +73,7 @@ These should be used to create Arrow data types and schemas. sparse_union opaque bool8 + range_ uuid json_ field @@ -146,6 +147,7 @@ implemented by PyArrow. JsonType UuidType Bool8Type + RangeType .. _api.types.checking: .. currentmodule:: pyarrow.types diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index adfc50d57395..1c4e82181aee 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -165,7 +165,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, json_, opaque, uuid, + bool8, fixed_shape_tensor, json_, opaque, range_, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -177,7 +177,7 @@ def print_entry(label, value): Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, - JsonType, OpaqueType, UuidType, + JsonType, OpaqueType, RangeType, UuidType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -214,7 +214,7 @@ def print_entry(label, value): Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, - JsonArray, OpaqueArray, UuidArray, + JsonArray, OpaqueArray, RangeArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -232,7 +232,8 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, - FixedShapeTensorScalar, JsonScalar, OpaqueScalar, 
UuidScalar) + FixedShapeTensorScalar, JsonScalar, OpaqueScalar, + RangeScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ecdbb342d3e2..a25b774d8df0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4956,6 +4956,29 @@ cdef class Bool8Array(ExtensionArray): return Bool8Array.from_storage(storage_arr) +cdef class RangeArray(ExtensionArray): + """ + Concrete class for range extension arrays. + + Examples + -------- + Define the extension type for a range array + + >>> import pyarrow as pa + >>> range_type = pa.range_(pa.int32(), "both") + + Create an extension array + + >>> storage = pa.array( + ... [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}], + ... range_type.storage_type, + ... ) + >>> arr = pa.ExtensionArray.from_storage(range_type, storage) + >>> isinstance(arr, pa.RangeArray) + True + """ + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 79522c12474b..795129e6b434 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -3085,6 +3085,27 @@ cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): pass + +cdef extern from "arrow/extension/range.h" namespace "arrow::extension" nogil: + cdef enum class CRangeClosed" arrow::extension::RangeClosed": + Left + Right + Both + Neither + + cdef cppclass CRangeType" arrow::extension::RangeType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType] value_type, + CRangeClosed closed) + + CRangeClosed closed() + shared_ptr[CDataType] value_type() + + cdef cppclass CRangeArray" arrow::extension::RangeArray"(CExtensionArray): + pass + + cdef extern from 
"arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 683faa7855c5..c4195df0dfe8 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -203,6 +203,10 @@ cdef class Bool8Type(BaseExtensionType): cdef: const CBool8Type* bool8_ext_type +cdef class RangeType(BaseExtensionType): + cdef: + const CRangeType* range_ext_type + cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index d1fa1192debc..7bb56435b7e2 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type( out = Bool8Type.__new__(Bool8Type) elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif extension_name == b"arrow.range": + out = RangeType.__new__(RangeType) elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index fb7de926edc1..4b2599724283 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1611,6 +1611,12 @@ cdef class Bool8Scalar(ExtensionScalar): py_val = super().as_py() return None if py_val is None else py_val != 0 + +cdef class RangeScalar(ExtensionScalar): + """ + Concrete class for range extension scalar. 
+ """ + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 465b556876b4..91947f26198e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1915,6 +1915,72 @@ def test_opaque_type(pickle_module, storage_type, storage): assert inner == storage +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("value_type,bounds", [ + (pa.int32(), [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}]), + (pa.int64(), [{"lower": None, "upper": None}, {"lower": 2, "upper": 8}]), + (pa.float64(), [{"lower": 0.0, "upper": 1.5}, None]), +]) +def test_range_type(pickle_module, closed, value_type, bounds): + range_type = pa.range_(value_type, closed) + assert range_type.extension_name == "arrow.range" + assert range_type.value_type == value_type + assert range_type.closed == closed + assert range_type.storage_type == pa.struct([ + pa.field("lower", value_type, nullable=True), + pa.field("upper", value_type, nullable=True), + ]) + assert "arrow.range" in str(range_type) + + # the closed parameter defaults to "left" + assert pa.range_(value_type).closed == "left" + + assert range_type == range_type + assert range_type == pa.range_(value_type, closed) + assert range_type != value_type + # different closed parameter -> not equal + other_closed = "right" if closed != "right" else "left" + assert range_type != pa.range_(value_type, other_closed) + # different value type -> not equal + assert range_type != pa.range_(pa.decimal128(12, 3), closed) + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(range_type)) + assert result == range_type + assert result.closed == closed + assert result.value_type == value_type + + # IPC roundtrip + range_arr_class = range_type.__arrow_ext_class__() + storage = pa.array(bounds, 
range_type.storage_type) + arr = pa.ExtensionArray.from_storage(range_type, storage) + assert isinstance(arr, range_arr_class) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.range" + assert batch.column(0).type.closed == closed + assert isinstance(batch.column(0), range_arr_class) + assert batch.column(0) == arr + + # cast storage -> extension type + result = storage.cast(range_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(range_type.storage_type) + assert inner == storage + + +def test_range_type_invalid_closed(): + with pytest.raises(ValueError, match="Invalid value for range"): + pa.range_(pa.int32(), "invalid") + with pytest.raises(ValueError, match="Invalid value for range"): + pa.range_(pa.int32(), "") + + def test_bool8_type(pickle_module): bool8_type = pa.bool8() storage_type = pa.int8() diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ec1a5a2ba9a3..d78a8d6cdf5d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2091,6 +2091,62 @@ cdef class Bool8Type(BaseExtensionType): return Bool8Scalar +cdef class RangeType(BaseExtensionType): + """ + Concrete class for range extension type. + + Range represents a bounded set (a mathematical interval) over an orderable + Arrow value type. The underlying storage is a Struct with two nullable + fields "lower" and "upper" of the value type, where a null bound denotes an + unbounded (infinite) side. The "closed" parameter controls which finite + bounds are inclusive. 
+ + Examples + -------- + Create an instance of range extension type: + + >>> import pyarrow as pa + >>> pa.range_(pa.int32(), "both") + RangeType(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.range_ext_type = type.get() + + @property + def value_type(self): + """ + The Arrow value type of the "lower" and "upper" bounds. + """ + return pyarrow_wrap_data_type(self.range_ext_type.value_type()) + + @property + def closed(self): + """ + Which bound(s) are inclusive, as one of "left", "right", "both" or + "neither". + """ + cdef CRangeClosed c_closed = self.range_ext_type.closed() + if c_closed == CRangeClosed.Left: + return "left" + elif c_closed == CRangeClosed.Right: + return "right" + elif c_closed == CRangeClosed.Both: + return "both" + else: + return "neither" + + def __arrow_ext_class__(self): + return RangeArray + + def __reduce__(self): + return range_, (self.value_type, self.closed) + + def __arrow_ext_scalar_class__(self): + return RangeScalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5706,6 +5762,73 @@ def bool8(): return out +def range_(DataType value_type not None, str closed="left"): + """ + Create instance of range extension type. + + Parameters + ---------- + value_type : DataType + The orderable Arrow type of the "lower" and "upper" interval bounds. + closed : str, default "left" + Which bound(s) are inclusive. One of "left", "right", "both" or + "neither". + + Examples + -------- + Create an instance of a range extension type: + + >>> import pyarrow as pa + >>> type = pa.range_(pa.int32(), "both") + >>> type + RangeType(extension) + + Inspect the data type: + + >>> type.value_type + DataType(int32) + >>> type.closed + 'both' + >>> type.storage_type + StructType(struct) + + Create a range array: + + >>> storage = pa.array( + ... [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}], + ... type.storage_type, + ... 
) + >>> arr = pa.ExtensionArray.from_storage(type, storage) + >>> arr.type + RangeType(extension) + + Returns + ------- + type : RangeType + """ + + cdef CRangeClosed c_closed + if closed == "left": + c_closed = CRangeClosed.Left + elif closed == "right": + c_closed = CRangeClosed.Right + elif closed == "both": + c_closed = CRangeClosed.Both + elif closed == "neither": + c_closed = CRangeClosed.Neither + else: + raise ValueError( + f"Invalid value for range \"closed\" parameter: {closed!r}. " + "Expected one of: 'left', 'right', 'both', 'neither'.") + + cdef: + shared_ptr[CDataType] c_type = GetResultValue( + CRangeType.Make(value_type.sp_type, c_closed)) + RangeType out = RangeType.__new__(RangeType) + out.init(c_type) + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type. From d4134329f8541b7415309f81eff3858e9eb1e8ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 12:30:23 +0000 Subject: [PATCH 07/11] feat(format): allow non-nullable bounds in arrow.range --- cpp/src/arrow/extension/range.cc | 24 +++++------ cpp/src/arrow/extension/range.h | 21 +++++++--- cpp/src/arrow/extension/range_test.cc | 47 ++++++++++++++++------ docs/source/format/CanonicalExtensions.rst | 29 +++++++------ 4 files changed, 77 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc index fe7089953932..d39d90acfe96 100644 --- a/cpp/src/arrow/extension/range.cc +++ b/cpp/src/arrow/extension/range.cc @@ -60,10 +60,12 @@ Result ClosedFromString(std::string_view s) { } /// Build the storage Struct type for a given value subtype. -std::shared_ptr MakeStorageType(const std::shared_ptr& value_type) { - // Both "lower" and "upper" are nullable (null = infinite bound). 
- return struct_({field("lower", value_type, /*nullable=*/true), - field("upper", value_type, /*nullable=*/true)}); +std::shared_ptr MakeStorageType(const std::shared_ptr& value_type, + bool allow_unbounded) { + // Nullable bounds can represent an unbounded (infinite) endpoint; non-nullable + // bounds are always finite. + return struct_({field("lower", value_type, allow_unbounded), + field("upper", value_type, allow_unbounded)}); } } // namespace @@ -135,12 +137,6 @@ Result> RangeType::Deserialize( "RangeType storage Struct field 1 must be named \"upper\", got \"", upper_field->name(), "\""); } - if (!lower_field->nullable()) { - return Status::Invalid("RangeType storage Struct field \"lower\" must be nullable"); - } - if (!upper_field->nullable()) { - return Status::Invalid("RangeType storage Struct field \"upper\" must be nullable"); - } if (!lower_field->type()->Equals(*upper_field->type())) { return Status::Invalid( "RangeType storage Struct fields \"lower\" and \"upper\" must have the same " @@ -191,8 +187,8 @@ std::shared_ptr RangeType::MakeArray(std::shared_ptr data) con } Result> RangeType::Make( - std::shared_ptr value_type, RangeClosed closed) { - auto storage = MakeStorageType(value_type); + std::shared_ptr value_type, RangeClosed closed, bool allow_unbounded) { + auto storage = MakeStorageType(value_type, allow_unbounded); return std::make_shared(std::move(storage), closed); } @@ -200,8 +196,8 @@ Result> RangeType::Make( // Free factory function std::shared_ptr range(std::shared_ptr value_type, - RangeClosed closed) { - auto result = RangeType::Make(std::move(value_type), closed); + RangeClosed closed, bool allow_unbounded) { + auto result = RangeType::Make(std::move(value_type), closed, allow_unbounded); ARROW_CHECK_OK(result.status()); return std::move(result).ValueOrDie(); } diff --git a/cpp/src/arrow/extension/range.h b/cpp/src/arrow/extension/range.h index ab11734507b7..b9f5f55e6106 100644 --- a/cpp/src/arrow/extension/range.h +++ 
b/cpp/src/arrow/extension/range.h @@ -39,9 +39,12 @@ enum class RangeClosed { /// \brief RangeType represents a bounded set (mathematical interval) over an /// orderable Arrow type T. /// -/// Storage is a Struct with exactly two nullable fields: -/// - "lower": T NULLABLE (null = unbounded below, i.e. -infinity) -/// - "upper": T NULLABLE (null = unbounded above, i.e. +infinity) +/// Storage is a Struct with exactly two fields "lower" and "upper" of the same +/// orderable type T. Each field may independently be nullable or not: a nullable +/// bound can hold null to represent an unbounded (infinite) endpoint on that +/// side, while a non-nullable bound is always finite. +/// - "lower": T (when nullable, null = unbounded below, i.e. -infinity) +/// - "upper": T (when nullable, null = unbounded above, i.e. +infinity) /// /// The outer struct's validity bit marks a null/absent range. /// @@ -70,11 +73,16 @@ class ARROW_EXPORT RangeType : public ExtensionType { /// \brief Factory function. /// - /// Constructs the required two-field struct storage type internally. + /// Constructs the two-field struct storage type internally. /// \param[in] value_type The orderable Arrow subtype T for lower and upper. /// \param[in] closed Which bound(s) are inclusive. + /// \param[in] allow_unbounded Whether each side may be unbounded (infinite). + /// When true, the "lower" and "upper" fields are nullable and a null bound + /// denotes an infinite endpoint; when false, both bounds are non-nullable + /// and the range is always finite. Defaults to true. static Result> Make(std::shared_ptr value_type, - RangeClosed closed = RangeClosed::Left); + RangeClosed closed = RangeClosed::Left, + bool allow_unbounded = true); /// \brief Return the bound-inclusivity parameter. RangeClosed closed() const { return closed_; } @@ -97,6 +105,7 @@ class ARROW_EXPORT RangeArray : public ExtensionArray { /// This is a convenience wrapper around RangeType::Make that aborts on error.
/// For recoverable error handling prefer RangeType::Make. ARROW_EXPORT std::shared_ptr range(std::shared_ptr value_type, - RangeClosed closed = RangeClosed::Left); + RangeClosed closed = RangeClosed::Left, + bool allow_unbounded = true); } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index b9526024328a..ef4463fb0c11 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -238,20 +238,43 @@ TEST(RangeType, DeserializeInvalidStorage) { EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, testing::HasSubstr("same type"), type->Deserialize(mismatched_types, R"({"closed":"right"})")); +} - // Non-nullable lower field. - auto lower_not_nullable = - struct_({field("lower", int32(), /*nullable=*/false), field("upper", int32(), true)}); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("\"lower\" must be nullable"), - type->Deserialize(lower_not_nullable, R"({"closed":"right"})")); +// --------------------------------------------------------------------------- +// Non-nullable / asymmetric bounds +// +// Bound nullability is only needed to represent an unbounded (infinite) +// endpoint; non-nullable bounds describe a finite-only range and are accepted. - // Non-nullable upper field. - auto upper_not_nullable = - struct_({field("lower", int32(), true), field("upper", int32(), /*nullable=*/false)}); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("\"upper\" must be nullable"), - type->Deserialize(upper_not_nullable, R"({"closed":"right"})")); +TEST(RangeType, NonNullableBounds) { + auto type = RangeInt32Right(); + + // Both bounds non-nullable: accepted (a finite-only range). 
+ auto both_non_nullable = struct_( + {field("lower", int32(), /*nullable=*/false), + field("upper", int32(), /*nullable=*/false)}); + ASSERT_OK_AND_ASSIGN(auto from_non_nullable, + type->Deserialize(both_non_nullable, R"({"closed":"right"})")); + ASSERT_EQ( + *int32(), + *checked_pointer_cast(from_non_nullable)->value_type()); + + // Asymmetric: lower nullable (may be -inf), upper non-nullable (always finite). + auto asymmetric = struct_( + {field("lower", int32(), /*nullable=*/true), + field("upper", int32(), /*nullable=*/false)}); + ASSERT_OK_AND_ASSIGN(auto from_asymmetric, + type->Deserialize(asymmetric, R"({"closed":"left"})")); + ASSERT_EQ(extension::RangeClosed::Left, + checked_pointer_cast(from_asymmetric)->closed()); + + // The factory can build non-nullable bounds via allow_unbounded=false. + auto finite = checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Both, /*allow_unbounded=*/false)); + const auto& finite_storage = + internal::checked_cast(*finite->storage_type()); + ASSERT_FALSE(finite_storage.field(0)->nullable()); + ASSERT_FALSE(finite_storage.field(1)->nullable()); } // --------------------------------------------------------------------------- diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 361083c38c4f..1cdd9a9be56f 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -599,24 +599,29 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. * The storage type of the extension is a ``Struct`` with exactly **two fields, in order**: - * ``lower``: the lower bound, type **T**, **nullable**. - A null value means the range is unbounded below (negative infinity). - * ``upper``: the upper bound, type **T**, **nullable**. - A null value means the range is unbounded above (positive infinity). + * ``lower``: the lower bound, type **T**, *optionally nullable*. 
+ When the field is nullable, a null value means the range is unbounded below + (negative infinity). + * ``upper``: the upper bound, type **T**, *optionally nullable*. + When the field is nullable, a null value means the range is unbounded above + (positive infinity). **T** (the *subtype* or *value type*) may be any orderable Arrow type: integer, floating-point, decimal, date, time, or timestamp types. Both fields share the same type T. The subtype is read directly from the storage struct and is **not** duplicated in the extension metadata. - Both ``lower`` and ``upper`` fields **must** be nullable. A null bound - represents an infinite endpoint and is **always treated as exclusive**, - regardless of the value of the ``closed`` parameter; positive and negative - infinity can never be included in a closed bound. A null ``lower`` means - the range extends to negative infinity, a null ``upper`` means it extends to - positive infinity, and a range whose ``lower`` and ``upper`` are both null - is the universal range ``(-inf, +inf)``. The outer struct's validity bit - marks a null/absent range (a missing range, distinct from an empty range). + Each of ``lower`` and ``upper`` **may** be nullable, independently of the + other. Nullability is **only** needed to represent an unbounded side: a + nullable bound may hold null to mean an infinite endpoint, while a + non-nullable bound is always finite. A null bound is **always treated as + exclusive**, regardless of the value of the ``closed`` parameter; positive and + negative infinity can never be included in a closed bound. A null ``lower`` + means the range extends to negative infinity, a null ``upper`` means it + extends to positive infinity, and a range whose ``lower`` and ``upper`` are + both null (and both nullable) is the universal range ``(-inf, +inf)``. The + outer struct's validity bit marks a null/absent range (a missing range, + distinct from an empty range). 
* Extension type parameters: From 8f1d3aa248dd3b98a2420e6971f18ee7158b4c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 12:30:23 +0000 Subject: [PATCH 08/11] feat(python): add allow_unbounded option to pyarrow range_ --- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/tests/test_extension_type.py | 22 +++++++++++++++++++++ python/pyarrow/types.pxi | 17 ++++++++++------ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 795129e6b434..26acf0985ab5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -3097,7 +3097,8 @@ cdef extern from "arrow/extension/range.h" namespace "arrow::extension" nogil: @staticmethod CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType] value_type, - CRangeClosed closed) + CRangeClosed closed, + c_bool allow_unbounded) CRangeClosed closed() shared_ptr[CDataType] value_type() diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 91947f26198e..3610e16b58ce 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1981,6 +1981,28 @@ def test_range_type_invalid_closed(): pa.range_(pa.int32(), "") +def test_range_type_allow_unbounded(): + # Default: bounds are nullable (can represent an unbounded / infinite side). + nullable = pa.range_(pa.int32(), "both") + assert nullable.storage_type.field("lower").nullable + assert nullable.storage_type.field("upper").nullable + + # allow_unbounded=False: a finite-only range with non-nullable bounds. + finite = pa.range_(pa.int32(), "both", allow_unbounded=False) + assert not finite.storage_type.field("lower").nullable + assert not finite.storage_type.field("upper").nullable + assert finite.value_type == pa.int32() + assert finite.closed == "both" + + # Distinct types: storage nullability differs. 
+ assert finite != nullable + + # A non-nullable-bounds range round-trips through its storage. + storage = pa.array([{"lower": 1, "upper": 5}], finite.storage_type) + arr = pa.ExtensionArray.from_storage(finite, storage) + assert arr.type == finite + + def test_bool8_type(pickle_module): bool8_type = pa.bool8() storage_type = pa.int8() diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d78a8d6cdf5d..0777c181c502 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2096,10 +2096,10 @@ cdef class RangeType(BaseExtensionType): Concrete class for range extension type. Range represents a bounded set (a mathematical interval) over an orderable - Arrow value type. The underlying storage is a Struct with two nullable - fields "lower" and "upper" of the value type, where a null bound denotes an - unbounded (infinite) side. The "closed" parameter controls which finite - bounds are inclusive. + Arrow value type. The underlying storage is a Struct with two fields + "lower" and "upper" of the value type, each optionally nullable; when a + bound field is nullable, a null value denotes an unbounded (infinite) side. + The "closed" parameter controls which finite bounds are inclusive. Examples -------- @@ -5762,7 +5762,7 @@ def bool8(): return out -def range_(DataType value_type not None, str closed="left"): +def range_(DataType value_type not None, str closed="left", allow_unbounded=True): """ Create instance of range extension type. @@ -5773,6 +5773,11 @@ def range_(DataType value_type not None, str closed="left"): closed : str, default "left" Which bound(s) are inclusive. One of "left", "right", "both" or "neither". + allow_unbounded : bool, default True + Whether each side may be unbounded (infinite). When True the "lower" and + "upper" storage fields are nullable (a null bound is an infinite + endpoint); when False both bounds are non-nullable and the range is + always finite. 
Examples -------- @@ -5823,7 +5828,7 @@ def range_(DataType value_type not None, str closed="left"): cdef: shared_ptr[CDataType] c_type = GetResultValue( - CRangeType.Make(value_type.sp_type, c_closed)) + CRangeType.Make(value_type.sp_type, c_closed, allow_unbounded)) RangeType out = RangeType.__new__(RangeType) out.init(c_type) return out From 3da2985eca14088afe6915e8f90442965bd5904f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Sun, 24 May 2026 22:55:41 +0000 Subject: [PATCH 09/11] style(format): apply clang-format and cmake-format to range extension --- cpp/src/arrow/extension/CMakeLists.txt | 5 +- cpp/src/arrow/extension/range.cc | 30 +++++------ cpp/src/arrow/extension/range_test.cc | 72 ++++++++++++-------------- 3 files changed, 50 insertions(+), 57 deletions(-) diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 966c927bf2ce..8f7bae112e9f 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -18,7 +18,10 @@ set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) - list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc + list(APPEND + CANONICAL_EXTENSION_TESTS + tensor_extension_array_test.cc + opaque_test.cc range_test.cc) endif() diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc index d39d90acfe96..466951c961fc 100644 --- a/cpp/src/arrow/extension/range.cc +++ b/cpp/src/arrow/extension/range.cc @@ -101,9 +101,8 @@ std::string RangeType::Serialize() const { rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); auto closed_str = ClosedToString(closed_); - rapidjson::Value closed_value(closed_str.data(), - static_cast(closed_str.size()), - allocator); + rapidjson::Value closed_value( + closed_str.data(), static_cast(closed_str.size()), allocator); document.AddMember(rapidjson::Value("closed", allocator), closed_value, 
allocator); rapidjson::StringBuffer buffer; @@ -121,9 +120,8 @@ Result> RangeType::Deserialize( } const auto& struct_type = internal::checked_cast(*storage_type); if (struct_type.num_fields() != 2) { - return Status::Invalid( - "RangeType storage Struct must have exactly 2 fields, got ", - struct_type.num_fields()); + return Status::Invalid("RangeType storage Struct must have exactly 2 fields, got ", + struct_type.num_fields()); } const auto& lower_field = struct_type.field(0); const auto& upper_field = struct_type.field(1); @@ -163,18 +161,17 @@ Result> RangeType::Deserialize( return Status::Invalid("Invalid serialized JSON data for RangeType: not an object"); } if (!document.HasMember("closed")) { - return Status::Invalid( - "RangeType metadata is missing the required \"closed\" key: ", serialized_data); + return Status::Invalid("RangeType metadata is missing the required \"closed\" key: ", + serialized_data); } const auto& closed_val = document["closed"]; if (!closed_val.IsString()) { return Status::Invalid( "Invalid serialized JSON data for RangeType: \"closed\" is not a string"); } - ARROW_ASSIGN_OR_RAISE( - RangeClosed closed, - ClosedFromString( - std::string_view(closed_val.GetString(), closed_val.GetStringLength()))); + ARROW_ASSIGN_OR_RAISE(RangeClosed closed, + ClosedFromString(std::string_view(closed_val.GetString(), + closed_val.GetStringLength()))); return std::make_shared(std::move(storage_type), closed); } @@ -186,8 +183,9 @@ std::shared_ptr RangeType::MakeArray(std::shared_ptr data) con return std::make_shared(data); } -Result> RangeType::Make( - std::shared_ptr value_type, RangeClosed closed, bool allow_unbounded) { +Result> RangeType::Make(std::shared_ptr value_type, + RangeClosed closed, + bool allow_unbounded) { auto storage = MakeStorageType(value_type, allow_unbounded); return std::make_shared(std::move(storage), closed); } @@ -195,8 +193,8 @@ Result> RangeType::Make( // 
--------------------------------------------------------------------------- // Free factory function -std::shared_ptr range(std::shared_ptr value_type, - RangeClosed closed, bool allow_unbounded) { +std::shared_ptr range(std::shared_ptr value_type, RangeClosed closed, + bool allow_unbounded) { auto result = RangeType::Make(std::move(value_type), closed, allow_unbounded); ARROW_CHECK_OK(result.status()); return std::move(result).ValueOrDie(); diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index ef4463fb0c11..4b18327e42e1 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -69,14 +69,14 @@ TEST(RangeType, Basics) { TEST(RangeType, AllClosedValues) { using C = extension::RangeClosed; - auto left = checked_pointer_cast( - extension::range(int32(), C::Left)); - auto right = checked_pointer_cast( - extension::range(int32(), C::Right)); - auto both = checked_pointer_cast( - extension::range(int32(), C::Both)); - auto neither = checked_pointer_cast( - extension::range(int32(), C::Neither)); + auto left = + checked_pointer_cast(extension::range(int32(), C::Left)); + auto right = + checked_pointer_cast(extension::range(int32(), C::Right)); + auto both = + checked_pointer_cast(extension::range(int32(), C::Both)); + auto neither = + checked_pointer_cast(extension::range(int32(), C::Neither)); ASSERT_EQ(R"({"closed":"left"})", left->Serialize()); ASSERT_EQ(R"({"closed":"right"})", right->Serialize()); @@ -119,9 +119,9 @@ TEST(RangeType, CreateFromArray) { auto storage_type = type->storage_type(); auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); - ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, - {field("lower", int32(), true), - field("upper", int32(), true)})); + ASSERT_OK_AND_ASSIGN( + auto storage, StructArray::Make({lower, upper}, {field("lower", int32(), true), + field("upper", int32(), true)})); auto 
array = ExtensionType::WrapArray(type, storage); ASSERT_EQ(3, array->length()); ASSERT_EQ(0, array->null_count()); @@ -147,12 +147,10 @@ TEST(RangeType, Deserialize) { ASSERT_NO_FATAL_FAILURE( CheckDeserialize(R"({"closed": "right"})", extension::range(int32(), extension::RangeClosed::Right))); - ASSERT_NO_FATAL_FAILURE( - CheckDeserialize(R"({"closed": "left"})", - extension::range(int32(), extension::RangeClosed::Left))); - ASSERT_NO_FATAL_FAILURE( - CheckDeserialize(R"({"closed": "both"})", - extension::range(int32(), extension::RangeClosed::Both))); + ASSERT_NO_FATAL_FAILURE(CheckDeserialize( + R"({"closed": "left"})", extension::range(int32(), extension::RangeClosed::Left))); + ASSERT_NO_FATAL_FAILURE(CheckDeserialize( + R"({"closed": "both"})", extension::range(int32(), extension::RangeClosed::Both))); ASSERT_NO_FATAL_FAILURE( CheckDeserialize(R"({"closed": "neither"})", extension::range(int32(), extension::RangeClosed::Neither))); @@ -178,9 +176,8 @@ TEST(RangeType, DeserializeInvalidMetadata) { auto type = RangeInt32Right(); // "closed" is required on the wire: empty metadata is invalid. - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("empty string"), - type->Deserialize(type->storage_type(), "")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("empty string"), + type->Deserialize(type->storage_type(), "")); // A JSON object without the "closed" key is invalid. EXPECT_RAISES_WITH_MESSAGE_THAT( @@ -188,9 +185,9 @@ TEST(RangeType, DeserializeInvalidMetadata) { type->Deserialize(type->storage_type(), "{}")); // Truly malformed JSON fails. 
- EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Missing a name for object member"), - type->Deserialize(type->storage_type(), "{")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Missing a name for object member"), + type->Deserialize(type->storage_type(), "{")); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("not an object"), type->Deserialize(type->storage_type(), "[]")); @@ -214,9 +211,8 @@ TEST(RangeType, DeserializeInvalidStorage) { // Wrong number of fields. auto one_field = struct_({field("lower", int32(), true)}); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("exactly 2 fields"), - type->Deserialize(one_field, R"({"closed":"right"})")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("exactly 2 fields"), + type->Deserialize(one_field, R"({"closed":"right"})")); // Wrong field name for field 0. auto bad_lower_name = @@ -250,19 +246,16 @@ TEST(RangeType, NonNullableBounds) { auto type = RangeInt32Right(); // Both bounds non-nullable: accepted (a finite-only range). - auto both_non_nullable = struct_( - {field("lower", int32(), /*nullable=*/false), - field("upper", int32(), /*nullable=*/false)}); + auto both_non_nullable = struct_({field("lower", int32(), /*nullable=*/false), + field("upper", int32(), /*nullable=*/false)}); ASSERT_OK_AND_ASSIGN(auto from_non_nullable, type->Deserialize(both_non_nullable, R"({"closed":"right"})")); - ASSERT_EQ( - *int32(), - *checked_pointer_cast(from_non_nullable)->value_type()); + ASSERT_EQ(*int32(), + *checked_pointer_cast(from_non_nullable)->value_type()); // Asymmetric: lower nullable (may be -inf), upper non-nullable (always finite). 
- auto asymmetric = struct_( - {field("lower", int32(), /*nullable=*/true), - field("upper", int32(), /*nullable=*/false)}); + auto asymmetric = struct_({field("lower", int32(), /*nullable=*/true), + field("upper", int32(), /*nullable=*/false)}); ASSERT_OK_AND_ASSIGN(auto from_asymmetric, type->Deserialize(asymmetric, R"({"closed":"left"})")); ASSERT_EQ(extension::RangeClosed::Left, @@ -301,12 +294,11 @@ TEST(RangeType, BatchRoundTrip) { auto type = RangeInt32Right(); auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); - ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, - {field("lower", int32(), true), - field("upper", int32(), true)})); + ASSERT_OK_AND_ASSIGN( + auto storage, StructArray::Make({lower, upper}, {field("lower", int32(), true), + field("upper", int32(), true)})); auto array = ExtensionType::WrapArray(type, storage); - auto batch = - RecordBatch::Make(schema({field("rng", type)}), array->length(), {array}); + auto batch = RecordBatch::Make(schema({field("rng", type)}), array->length(), {array}); std::shared_ptr written; { From 847d38c548af66d7e6e4426db7836b4d0b67c06c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Thu, 4 Jun 2026 19:14:25 +0000 Subject: [PATCH 10/11] feat(format): add arrow.range_inc canonical extension type Add a sibling canonical extension type to arrow.range that stores bound inclusivity per value via non-nullable boolean lower_inc/upper_inc fields, storage Struct. arrow.range carries a single type-level closed parameter, sufficient for discrete ranges that canonicalize to one closedness (int4range, int8range, daterange). Continuous ranges (numrange, tsrange, tstzrange) cannot be canonicalized, so closedness must travel with each value. arrow.range_inc mirrors PostgreSQL's internal range representation for that case; both types coexist. 
The type has no metadata parameters: inclusivity lives in storage, so Serialize emits {} and Deserialize accepts empty/{}/extra keys. A null (infinite) bound is always exclusive regardless of its flag. Covers C++ (type, array, registration, tests), pyarrow bindings and tests, and the format spec, status table, and C++/Python API docs. --- cpp/src/arrow/extension/range.cc | 142 ++++++++++++ cpp/src/arrow/extension/range.h | 73 +++++++ cpp/src/arrow/extension/range_test.cc | 231 ++++++++++++++++++++ cpp/src/arrow/extension_type.cc | 1 + docs/source/cpp/api/extension.rst | 11 + docs/source/format/CanonicalExtensions.rst | 73 +++++++ docs/source/python/api/datatypes.rst | 2 + docs/source/status.rst | 2 + python/pyarrow/__init__.py | 9 +- python/pyarrow/array.pxi | 23 ++ python/pyarrow/includes/libarrow.pxd | 11 + python/pyarrow/lib.pxd | 4 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 5 + python/pyarrow/tests/test_extension_type.py | 92 ++++++++ python/pyarrow/types.pxi | 104 +++++++++ 16 files changed, 781 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc index 466951c961fc..372e5df1b69b 100644 --- a/cpp/src/arrow/extension/range.cc +++ b/cpp/src/arrow/extension/range.cc @@ -200,4 +200,146 @@ std::shared_ptr range(std::shared_ptr value_type, RangeClose return std::move(result).ValueOrDie(); } +// --------------------------------------------------------------------------- +// RangeIncType + +namespace { + +/// Build the storage Struct type for a per-value-inclusivity range. In addition +/// to the "lower"/"upper" bounds (nullable iff unbounded endpoints are allowed), +/// it carries two non-nullable boolean fields recording each bound's inclusivity. 
+std::shared_ptr MakeIncStorageType(const std::shared_ptr& value_type, + bool allow_unbounded) { + return struct_({field("lower", value_type, allow_unbounded), + field("upper", value_type, allow_unbounded), + field("lower_inc", boolean(), /*nullable=*/false), + field("upper_inc", boolean(), /*nullable=*/false)}); +} + +} // namespace + +std::shared_ptr RangeIncType::value_type() const { + // storage_type() is a struct whose "lower"/"upper" fields share the same type. + return internal::checked_cast(*storage_type()).field(0)->type(); +} + +std::string RangeIncType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() + << "[value_type=" << value_type()->ToString(show_metadata) << "]>"; + return ss.str(); +} + +bool RangeIncType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + // All parameters (value type, bound nullability, the boolean flag fields) are + // part of the storage type, so a storage comparison is sufficient. + return storage_type()->Equals(*other.storage_type()); +} + +std::string RangeIncType::Serialize() const { + // Inclusivity is stored per value, so there is no type-level parameter to + // serialize. Emit an empty JSON object for explicitness and forward-compat. + return "{}"; +} + +Result> RangeIncType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + // Validate storage type structure. 
+ if (storage_type->id() != Type::STRUCT) { + return Status::Invalid("RangeIncType storage type must be a Struct, got ", + storage_type->ToString()); + } + const auto& struct_type = internal::checked_cast(*storage_type); + if (struct_type.num_fields() != 4) { + return Status::Invalid("RangeIncType storage Struct must have exactly 4 fields, got ", + struct_type.num_fields()); + } + const auto& lower_field = struct_type.field(0); + const auto& upper_field = struct_type.field(1); + const auto& lower_inc_field = struct_type.field(2); + const auto& upper_inc_field = struct_type.field(3); + if (lower_field->name() != "lower") { + return Status::Invalid( + "RangeIncType storage Struct field 0 must be named \"lower\", got \"", + lower_field->name(), "\""); + } + if (upper_field->name() != "upper") { + return Status::Invalid( + "RangeIncType storage Struct field 1 must be named \"upper\", got \"", + upper_field->name(), "\""); + } + if (lower_inc_field->name() != "lower_inc") { + return Status::Invalid( + "RangeIncType storage Struct field 2 must be named \"lower_inc\", got \"", + lower_inc_field->name(), "\""); + } + if (upper_inc_field->name() != "upper_inc") { + return Status::Invalid( + "RangeIncType storage Struct field 3 must be named \"upper_inc\", got \"", + upper_inc_field->name(), "\""); + } + if (!lower_field->type()->Equals(*upper_field->type())) { + return Status::Invalid( + "RangeIncType storage Struct fields \"lower\" and \"upper\" must have the " + "same type, got \"", + lower_field->type()->ToString(), "\" and \"", upper_field->type()->ToString(), + "\""); + } + if (lower_inc_field->type()->id() != Type::BOOL || + upper_inc_field->type()->id() != Type::BOOL) { + return Status::Invalid( + "RangeIncType storage Struct fields \"lower_inc\" and \"upper_inc\" must be " + "boolean, got \"", + lower_inc_field->type()->ToString(), "\" and \"", + upper_inc_field->type()->ToString(), "\""); + } + if (lower_inc_field->nullable() || upper_inc_field->nullable()) { + 
return Status::Invalid( + "RangeIncType storage Struct fields \"lower_inc\" and \"upper_inc\" must be " + "non-nullable"); + } + + // Unlike RangeType, the metadata carries no parameters: inclusivity lives in + // the storage fields. Accept an empty string or any JSON object (ignoring + // unknown keys for forward compatibility). + if (!serialized_data.empty()) { + rapidjson::Document document; + const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length()); + if (parsed.HasParseError()) { + return Status::Invalid("Invalid serialized JSON data for RangeIncType: ", + rapidjson::GetParseError_En(parsed.GetParseError()), ": ", + serialized_data); + } + if (!document.IsObject()) { + return Status::Invalid( + "Invalid serialized JSON data for RangeIncType: not an object"); + } + } + + return std::make_shared(std::move(storage_type)); +} + +std::shared_ptr RangeIncType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.range_inc", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> RangeIncType::Make(std::shared_ptr value_type, + bool allow_unbounded) { + auto storage = MakeIncStorageType(value_type, allow_unbounded); + return std::make_shared(std::move(storage)); +} + +std::shared_ptr range_inc(std::shared_ptr value_type, + bool allow_unbounded) { + auto result = RangeIncType::Make(std::move(value_type), allow_unbounded); + ARROW_CHECK_OK(result.status()); + return std::move(result).ValueOrDie(); +} + } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range.h b/cpp/src/arrow/extension/range.h index b9f5f55e6106..87322e6ec0a7 100644 --- a/cpp/src/arrow/extension/range.h +++ b/cpp/src/arrow/extension/range.h @@ -108,4 +108,77 @@ ARROW_EXPORT std::shared_ptr range(std::shared_ptr value_typ RangeClosed closed = RangeClosed::Left, bool allow_unbounded = true); +/// \brief RangeIncType represents a bounded set (mathematical 
interval) over an +/// orderable Arrow type T whose bound inclusivity is stored **per value**. +/// +/// Unlike RangeType, which carries a single type-level "closed" parameter, this +/// type records the inclusivity of each row's bounds in two boolean storage +/// fields. This is required for continuous ranges (e.g. PostgreSQL's +/// ``numrange``, ``tsrange``, ``tstzrange``) which cannot be canonicalized to a +/// uniform closedness. It mirrors PostgreSQL's internal range representation. +/// +/// Storage is a Struct with exactly four fields, in order: +/// - "lower": T (null, when nullable = unbounded below, i.e. -infinity) +/// - "upper": T (null, when nullable = unbounded above, i.e. +infinity) +/// - "lower_inc": boolean, non-nullable: is the lower bound inclusive? +/// - "upper_inc": boolean, non-nullable: is the upper bound inclusive? +/// +/// "lower" and "upper" share the same orderable type T and may independently be +/// nullable to represent an unbounded (infinite) endpoint. A null (infinite) +/// bound is always treated as exclusive, regardless of its "*_inc" flag. +/// +/// The outer struct's validity bit marks a null/absent range. +/// +/// There is no type-level "closed" parameter, so the extension metadata carries +/// no parameters (serialized as the empty JSON object ``{}``). +class ARROW_EXPORT RangeIncType : public ExtensionType { + public: + /// \brief Construct a RangeIncType. + /// + /// \param[in] storage_type A four-field Struct type with fields "lower", + /// "upper" (same orderable type T) and non-nullable boolean "lower_inc", + /// "upper_inc". 
+ explicit RangeIncType(std::shared_ptr storage_type) + : ExtensionType(std::move(storage_type)) {} + + std::string extension_name() const override { return "arrow.range_inc"; } + std::string ToString(bool show_metadata = false) const override; + bool ExtensionEquals(const ExtensionType& other) const override; + std::string Serialize() const override; + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// \brief Create a RangeIncArray from ArrayData. + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Factory function. + /// + /// Constructs the four-field struct storage type internally. + /// \param[in] value_type The orderable Arrow subtype T for lower and upper. + /// \param[in] allow_unbounded Whether each side may be unbounded (infinite). + /// When true, the "lower" and "upper" fields are nullable and a null bound + /// denotes an infinite endpoint; when false, both bounds are non-nullable + /// and the range is always finite. The "lower_inc" and "upper_inc" fields + /// are always non-nullable. Defaults to true. + static Result> Make(std::shared_ptr value_type, + bool allow_unbounded = true); + + /// \brief Return the Arrow subtype T (the type of "lower" and "upper" fields). + std::shared_ptr value_type() const; +}; + +/// \brief Array class for arrow.range_inc extension arrays. +class ARROW_EXPORT RangeIncArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Create a RangeIncType with the given value subtype. +/// +/// This is a convenience wrapper around RangeIncType::Make that aborts on error. +/// For recoverable error handling prefer RangeIncType::Make. 
+ARROW_EXPORT std::shared_ptr range_inc(std::shared_ptr value_type, + bool allow_unbounded = true); + } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index 4b18327e42e1..be529d372be3 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -317,4 +317,235 @@ TEST(RangeType, BatchRoundTrip) { ASSERT_BATCHES_EQUAL(*batch, *written); } +// =========================================================================== +// RangeIncType -- per-value bound inclusivity +// =========================================================================== + +namespace { + +std::shared_ptr IncStorage(const std::shared_ptr& value_type, + bool nullable_bounds = true) { + return struct_({field("lower", value_type, nullable_bounds), + field("upper", value_type, nullable_bounds), + field("lower_inc", boolean(), /*nullable=*/false), + field("upper_inc", boolean(), /*nullable=*/false)}); +} + +} // namespace + +// --------------------------------------------------------------------------- +// Basics + +TEST(RangeIncType, Basics) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + ASSERT_EQ("arrow.range_inc", type->extension_name()); + ASSERT_EQ(*int32(), *type->value_type()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + // No type-level parameters: metadata is the empty JSON object. + ASSERT_EQ("{}", type->Serialize()); + ASSERT_EQ("extension", type->ToString(false)); + // Storage carries the two non-nullable boolean inclusivity fields. 
+ const auto& storage = internal::checked_cast(*type->storage_type()); + ASSERT_EQ(4, storage.num_fields()); + ASSERT_EQ("lower_inc", storage.field(2)->name()); + ASSERT_EQ("upper_inc", storage.field(3)->name()); + ASSERT_EQ(*boolean(), *storage.field(2)->type()); + ASSERT_FALSE(storage.field(2)->nullable()); + ASSERT_FALSE(storage.field(3)->nullable()); +} + +// --------------------------------------------------------------------------- +// Equals + +TEST(RangeIncType, Equals) { + auto i32 = checked_pointer_cast(extension::range_inc(int32())); + auto i32b = + checked_pointer_cast(extension::range_inc(int32())); + auto i64 = checked_pointer_cast(extension::range_inc(int64())); + auto i32_finite = checked_pointer_cast( + extension::range_inc(int32(), /*allow_unbounded=*/false)); + + // Same object / same parameters. + ASSERT_EQ(*i32, *i32); + ASSERT_EQ(*i32, *i32b); + + // Different value type. + ASSERT_NE(*i32, *i64); + + // Different bound nullability is part of storage, hence a different type. + ASSERT_NE(*i32, *i32_finite); + + // Not equal to non-range types, including a plain arrow.range. 
+ ASSERT_NE(*i32, *arrow::int32()); + ASSERT_NE(*i32, *extension::range(int32())); +} + +// --------------------------------------------------------------------------- +// CreateFromArray + +TEST(RangeIncType, CreateFromArray) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + auto lower_inc = ArrayFromJSON(boolean(), "[true, false, true]"); + auto upper_inc = ArrayFromJSON(boolean(), "[false, false, true]"); + ASSERT_OK_AND_ASSIGN(auto storage, + StructArray::Make({lower, upper, lower_inc, upper_inc}, + type->storage_type()->fields())); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(3, array->length()); + ASSERT_EQ(0, array->null_count()); +} + +// --------------------------------------------------------------------------- +// Deserialize - valid cases (metadata carries no parameters) + +TEST(RangeIncType, DeserializeMetadata) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + + // Empty string, empty object, and extra keys are all accepted. 
+ for (const auto& serialized : + {std::string(""), std::string("{}"), std::string(R"({"extra": 42})")}) { + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized) << "Failed for metadata: " << serialized; + } +} + +// --------------------------------------------------------------------------- +// Deserialize - invalid cases + +TEST(RangeIncType, DeserializeInvalidMetadata) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Missing a name for object member"), + type->Deserialize(type->storage_type(), "{")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("not an object"), + type->Deserialize(type->storage_type(), "[]")); +} + +TEST(RangeIncType, DeserializeInvalidStorage) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + + // Not a struct. + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("must be a Struct"), + type->Deserialize(int32(), "{}")); + + // Wrong number of fields (a plain 2-field range struct). + auto two_fields = + struct_({field("lower", int32(), true), field("upper", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("exactly 4 fields"), + type->Deserialize(two_fields, "{}")); + + // Wrong inc field names. + auto bad_inc_name = + struct_({field("lower", int32(), true), field("upper", int32(), true), + field("lo_inc", boolean(), false), field("upper_inc", boolean(), false)}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("named \"lower_inc\""), + type->Deserialize(bad_inc_name, "{}")); + + // Inc fields not boolean. 
+ auto non_bool_inc = + struct_({field("lower", int32(), true), field("upper", int32(), true), + field("lower_inc", int8(), false), field("upper_inc", int8(), false)}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("must be boolean"), + type->Deserialize(non_bool_inc, "{}")); + + // Inc fields nullable: rejected (would produce ambiguous data). + auto nullable_inc = + struct_({field("lower", int32(), true), field("upper", int32(), true), + field("lower_inc", boolean(), true), field("upper_inc", boolean(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("must be non-nullable"), + type->Deserialize(nullable_inc, "{}")); + + // Bounds have different types. + auto mismatched = struct_({field("lower", int32(), true), field("upper", int64(), true), + field("lower_inc", boolean(), false), + field("upper_inc", boolean(), false)}); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("same type"), + type->Deserialize(mismatched, "{}")); +} + +// --------------------------------------------------------------------------- +// Non-nullable bounds + +TEST(RangeIncType, NonNullableBounds) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + + // Both bounds non-nullable: accepted (a finite-only range). + ASSERT_OK_AND_ASSIGN( + auto from_non_nullable, + type->Deserialize(IncStorage(int32(), /*nullable_bounds=*/false), "{}")); + ASSERT_EQ( + *int32(), + *checked_pointer_cast(from_non_nullable)->value_type()); + + // The factory can build non-nullable bounds via allow_unbounded=false. + auto finite = checked_pointer_cast( + extension::range_inc(int32(), /*allow_unbounded=*/false)); + const auto& finite_storage = + internal::checked_cast(*finite->storage_type()); + ASSERT_FALSE(finite_storage.field(0)->nullable()); + ASSERT_FALSE(finite_storage.field(1)->nullable()); + // The inc fields are non-nullable regardless of allow_unbounded. 
+ ASSERT_FALSE(finite_storage.field(2)->nullable()); + ASSERT_FALSE(finite_storage.field(3)->nullable()); +} + +// --------------------------------------------------------------------------- +// Metadata round-trip + +TEST(RangeIncType, MetadataRoundTrip) { + for (const auto& type : + {extension::range_inc(int32()), extension::range_inc(int64()), + extension::range_inc(date32()), extension::range_inc(int32(), false)}) { + auto rt = checked_pointer_cast(type); + std::string serialized = rt->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + rt->Deserialize(rt->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized) << "Round-trip failed for: " << type->ToString(); + } +} + +// --------------------------------------------------------------------------- +// IPC (BatchRoundTrip) -- registration round-trip + +TEST(RangeIncType, BatchRoundTrip) { + auto type = + checked_pointer_cast(extension::range_inc(int32())); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + auto lower_inc = ArrayFromJSON(boolean(), "[true, false, true]"); + auto upper_inc = ArrayFromJSON(boolean(), "[false, false, true]"); + ASSERT_OK_AND_ASSIGN(auto storage, + StructArray::Make({lower, upper, lower_inc, upper_inc}, + type->storage_type()->fields())); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = RecordBatch::Make(schema({field("rng", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), 
*written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 1dd840621f20..bb673bbc416f 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -158,6 +158,7 @@ static void CreateGlobalRegistry() { ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); ext_types.push_back(extension::range(int32())); + ext_types.push_back(extension::range_inc(int32())); ext_types.push_back(extension::variable_shape_tensor(int64(), 0)); #endif diff --git a/docs/source/cpp/api/extension.rst b/docs/source/cpp/api/extension.rst index 6b1e2e9a8df0..f2c700e74d78 100644 --- a/docs/source/cpp/api/extension.rst +++ b/docs/source/cpp/api/extension.rst @@ -46,6 +46,10 @@ Extension Type classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeIncType + :project: arrow_cpp + :members: + Extension Array classes ======================= @@ -69,9 +73,16 @@ Extension Array classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeIncArray + :project: arrow_cpp + :members: + Extension functions =================== .. doxygenfunction:: arrow::extension::range :project: arrow_cpp +.. doxygenfunction:: arrow::extension::range_inc + :project: arrow_cpp + diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 1cdd9a9be56f..b94a770a99af 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -661,6 +661,79 @@ PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. .. _range types: https://www.postgresql.org/docs/current/rangetypes.html +.. 
_range_inc_extension: + +Range Inc +========= + +Range Inc represents a bounded set (mathematical interval) over an orderable +Arrow type T whose bound inclusivity is recorded **per value** rather than as a +single type-level parameter. It is the companion of the :ref:`Range +` extension type for ranges that cannot be canonicalized to a +uniform closedness. + +.. note:: + + **When to use** ``arrow.range`` **vs.** ``arrow.range_inc``. + Discrete ranges (e.g. PostgreSQL's ``int4range``, ``int8range``, + ``daterange``) canonicalize to a single closedness (left-closed), so they + are best represented by :ref:`arrow.range `, which stores + the closedness once in the type metadata. Continuous ranges (e.g. + PostgreSQL's ``numrange``, ``tsrange``, ``tstzrange``) **cannot** be + canonicalized: two values may share the same endpoints yet differ in + whether those endpoints are included. ``arrow.range_inc`` stores the + inclusivity of each bound alongside the bound itself, mirroring PostgreSQL's + internal range representation, and is the appropriate choice for that case. + +* Extension name: ``arrow.range_inc``. + +* The storage type of the extension is a ``Struct`` with exactly **four fields, + in order**: + + * ``lower``: the lower bound, type **T**, *optionally nullable*. + When the field is nullable, a null value means the range is unbounded below + (negative infinity). + * ``upper``: the upper bound, type **T**, *optionally nullable*. + When the field is nullable, a null value means the range is unbounded above + (positive infinity). + * ``lower_inc``: a **non-nullable** ``boolean`` -- ``true`` when the lower + bound is inclusive for that value, ``false`` when it is exclusive. + * ``upper_inc``: a **non-nullable** ``boolean`` -- ``true`` when the upper + bound is inclusive for that value, ``false`` when it is exclusive. + + **T** (the *subtype* or *value type*) may be any orderable Arrow type: + integer, floating-point, decimal, date, time, or timestamp types. 
The + ``lower`` and ``upper`` fields share the same type T, read directly from the + storage struct; the subtype is **not** duplicated in the extension metadata. + + Each of ``lower`` and ``upper`` **may** be nullable, independently of the + other, exactly as in :ref:`arrow.range `: nullability is + only needed to represent an unbounded side. A null bound is **always treated + as exclusive**, regardless of its ``lower_inc`` / ``upper_inc`` flag; positive + and negative infinity can never be included. The ``lower_inc`` and + ``upper_inc`` fields are **always non-nullable**. The outer struct's validity + bit marks a null/absent range (a missing range, distinct from an empty range). + +* Extension type parameters: + + This type has **no** type-level parameters. Unlike :ref:`arrow.range + `, inclusivity is not fixed by the type; it is carried per + value in the ``lower_inc`` and ``upper_inc`` fields. + + For a given value, the range contains every x that satisfies each of its + finite bounds: ``lower <= x`` when ``lower_inc`` is ``true`` (``lower < x`` + otherwise) and ``x <= upper`` when ``upper_inc`` is ``true`` (``x < upper`` + otherwise); a null bound imposes no constraint. A value is *empty* when + ``lower > upper``, or when ``lower == upper`` and at least one of + ``lower_inc`` / ``upper_inc`` is ``false``. + +* Description of the serialization: + + Because inclusivity is stored per value, the type carries no parameters and + the extension metadata is an **empty JSON object** ``{}``. For + forward-compatibility, an empty metadata string is also accepted on read, and + any additional keys in the JSON object should be ignored. + Community Extension Types ========================= diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 71ee00557f33..2f6125443dd8 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -74,6 +74,7 @@ These should be used to create Arrow data types and schemas. 
opaque bool8 range_ + range_inc uuid json_ field @@ -148,6 +149,7 @@ implemented by PyArrow. UuidType Bool8Type RangeType + RangeIncType .. _api.types.checking: .. currentmodule:: pyarrow.types diff --git a/docs/source/status.rst b/docs/source/status.rst index f3af1a50e0d2..02684cce3496 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -133,6 +133,8 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Range | ✓ | | | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Range Inc | ✓ | | | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1c4e82181aee..0b6ad2400e52 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -165,7 +165,8 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, json_, opaque, range_, uuid, + bool8, fixed_shape_tensor, json_, opaque, range_, range_inc, + uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -177,7 +178,7 @@ def print_entry(label, value): Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, - JsonType, OpaqueType, RangeType, UuidType, + JsonType, OpaqueType, RangeType, RangeIncType, UuidType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -214,7 +215,7 @@ def print_entry(label, value): Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, - JsonArray, OpaqueArray, RangeArray, UuidArray, + JsonArray, OpaqueArray, RangeArray, RangeIncArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, 
Int16Scalar, Int32Scalar, Int64Scalar, @@ -233,7 +234,7 @@ def print_entry(label, value): MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, FixedShapeTensorScalar, JsonScalar, OpaqueScalar, - RangeScalar, UuidScalar) + RangeScalar, RangeIncScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a25b774d8df0..c76f7dcbc8c6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4979,6 +4979,29 @@ cdef class RangeArray(ExtensionArray): """ +cdef class RangeIncArray(ExtensionArray): + """ + Concrete class for range_inc extension arrays. + + Examples + -------- + Define the extension type for a range_inc array + + >>> import pyarrow as pa + >>> range_inc_type = pa.range_inc(pa.float64()) + + Create an extension array + + >>> storage = pa.array( + ... [{"lower": 1.0, "upper": 5.0, "lower_inc": True, "upper_inc": False}], + ... range_inc_type.storage_type, + ... 
) + >>> arr = pa.ExtensionArray.from_storage(range_inc_type, storage) + >>> isinstance(arr, pa.RangeIncArray) + True + """ + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 26acf0985ab5..bd9ccec281de 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -3106,6 +3106,17 @@ cdef extern from "arrow/extension/range.h" namespace "arrow::extension" nogil: cdef cppclass CRangeArray" arrow::extension::RangeArray"(CExtensionArray): pass + cdef cppclass CRangeIncType" arrow::extension::RangeIncType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType] value_type, + c_bool allow_unbounded) + + shared_ptr[CDataType] value_type() + + cdef cppclass CRangeIncArray" arrow::extension::RangeIncArray"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c4195df0dfe8..a990c5a1b209 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -207,6 +207,10 @@ cdef class RangeType(BaseExtensionType): cdef: const CRangeType* range_ext_type +cdef class RangeIncType(BaseExtensionType): + cdef: + const CRangeIncType* range_inc_ext_type + cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 7bb56435b7e2..6655fe6b279c 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -133,6 +133,8 @@ cdef api object pyarrow_wrap_data_type( out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif extension_name == b"arrow.range": out = RangeType.__new__(RangeType) + elif extension_name == b"arrow.range_inc": + out = RangeIncType.__new__(RangeIncType) elif extension_name == b"arrow.opaque": 
out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 4b2599724283..ca7e01ae0b21 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1617,6 +1617,11 @@ cdef class RangeScalar(ExtensionScalar): Concrete class for range extension scalar. """ +cdef class RangeIncScalar(ExtensionScalar): + """ + Concrete class for range_inc extension scalar. + """ + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 3610e16b58ce..8c0888fb5f22 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -2003,6 +2003,98 @@ def test_range_type_allow_unbounded(): assert arr.type == finite +@pytest.mark.parametrize("value_type,rows", [ + (pa.int32(), [ + {"lower": 1, "upper": 5, "lower_inc": True, "upper_inc": False}, + {"lower": None, "upper": 10, "lower_inc": False, "upper_inc": True}, + ]), + (pa.float64(), [ + {"lower": 0.0, "upper": 1.5, "lower_inc": True, "upper_inc": True}, + None, + ]), +]) +def test_range_inc_type(pickle_module, value_type, rows): + range_type = pa.range_inc(value_type) + assert range_type.extension_name == "arrow.range_inc" + assert range_type.value_type == value_type + # Storage carries the two bounds plus per-value, non-nullable inclusivity flags. + assert range_type.storage_type == pa.struct([ + pa.field("lower", value_type, nullable=True), + pa.field("upper", value_type, nullable=True), + pa.field("lower_inc", pa.bool_(), nullable=False), + pa.field("upper_inc", pa.bool_(), nullable=False), + ]) + assert "arrow.range_inc" in str(range_type) + # No type-level closed parameter. 
+ assert not hasattr(range_type, "closed") + + assert range_type == range_type + assert range_type == pa.range_inc(value_type) + assert range_type != value_type + # different value type -> not equal + assert range_type != pa.range_inc(pa.decimal128(12, 3)) + # distinct from a plain arrow.range over the same value type + assert range_type != pa.range_(value_type) + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(range_type)) + assert result == range_type + assert result.value_type == value_type + + # IPC roundtrip + range_arr_class = range_type.__arrow_ext_class__() + storage = pa.array(rows, range_type.storage_type) + arr = pa.ExtensionArray.from_storage(range_type, storage) + assert isinstance(arr, range_arr_class) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.range_inc" + assert batch.column(0).type.value_type == value_type + assert isinstance(batch.column(0), range_arr_class) + assert batch.column(0) == arr + + # cast storage -> extension type + result = storage.cast(range_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(range_type.storage_type) + assert inner == storage + + +def test_range_inc_type_allow_unbounded(): + # Default: bounds are nullable (can represent an unbounded / infinite side). + nullable = pa.range_inc(pa.int32()) + assert nullable.storage_type.field("lower").nullable + assert nullable.storage_type.field("upper").nullable + # The inclusivity flags are always non-nullable. + assert not nullable.storage_type.field("lower_inc").nullable + assert not nullable.storage_type.field("upper_inc").nullable + + # allow_unbounded=False: a finite-only range with non-nullable bounds. 
+ finite = pa.range_inc(pa.int32(), allow_unbounded=False) + assert not finite.storage_type.field("lower").nullable + assert not finite.storage_type.field("upper").nullable + # The flags stay non-nullable regardless of allow_unbounded. + assert not finite.storage_type.field("lower_inc").nullable + assert not finite.storage_type.field("upper_inc").nullable + assert finite.value_type == pa.int32() + + # Distinct types: storage nullability differs. + assert finite != nullable + + # A non-nullable-bounds range_inc round-trips through its storage. + storage = pa.array( + [{"lower": 1, "upper": 5, "lower_inc": True, "upper_inc": False}], + finite.storage_type, + ) + arr = pa.ExtensionArray.from_storage(finite, storage) + assert arr.type == finite + + def test_bool8_type(pickle_module): bool8_type = pa.bool8() storage_type = pa.int8() diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0777c181c502..3626241a88f5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2147,6 +2147,50 @@ cdef class RangeType(BaseExtensionType): return RangeScalar +cdef class RangeIncType(BaseExtensionType): + """ + Concrete class for range_inc extension type. + + Like :class:`RangeType`, this represents a bounded set (a mathematical + interval) over an orderable Arrow value type, but the inclusivity of each + bound is stored *per value* rather than as a single type-level parameter. + The underlying storage is a Struct with four fields: "lower" and "upper" + (the value type, each optionally nullable to denote an unbounded endpoint) + and non-nullable boolean "lower_inc" and "upper_inc" recording whether each + bound is inclusive. This is required for continuous ranges (e.g. + PostgreSQL's ``numrange``, ``tsrange``, ``tstzrange``) that cannot be + canonicalized to a uniform closedness. 
+ + Examples + -------- + Create an instance of range_inc extension type: + + >>> import pyarrow as pa + >>> pa.range_inc(pa.float64()) + RangeIncType(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.range_inc_ext_type = type.get() + + @property + def value_type(self): + """ + The Arrow value type of the "lower" and "upper" bounds. + """ + return pyarrow_wrap_data_type(self.range_inc_ext_type.value_type()) + + def __arrow_ext_class__(self): + return RangeIncArray + + def __reduce__(self): + return range_inc, (self.value_type, self.storage_type.field("lower").nullable) + + def __arrow_ext_scalar_class__(self): + return RangeIncScalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5834,6 +5878,66 @@ def range_(DataType value_type not None, str closed="left", allow_unbounded=True return out +def range_inc(DataType value_type not None, allow_unbounded=True): + """ + Create instance of range_inc extension type. + + Unlike :func:`range_`, the inclusivity of each bound is stored per value + (in boolean "lower_inc"/"upper_inc" storage fields) rather than as a single + type-level "closed" parameter, so there is no "closed" argument. This is + required for continuous ranges (e.g. PostgreSQL's ``numrange``, ``tsrange``, + ``tstzrange``) that cannot be canonicalized to a uniform closedness. + + Parameters + ---------- + value_type : DataType + The orderable Arrow type of the "lower" and "upper" interval bounds. + allow_unbounded : bool, default True + Whether each side may be unbounded (infinite). When True the "lower" and + "upper" storage fields are nullable (a null bound is an infinite + endpoint); when False both bounds are non-nullable and the range is + always finite. The "lower_inc" and "upper_inc" fields are always + non-nullable. 
+ + Examples + -------- + Create an instance of a range_inc extension type: + + >>> import pyarrow as pa + >>> type = pa.range_inc(pa.float64()) + >>> type + RangeIncType(extension) + + Inspect the data type: + + >>> type.value_type + DataType(double) + >>> type.storage_type + StructType(struct) + + Create a range_inc array: + + >>> storage = pa.array( + ... [{"lower": 1.0, "upper": 5.0, "lower_inc": True, "upper_inc": False}], + ... type.storage_type, + ... ) + >>> arr = pa.ExtensionArray.from_storage(type, storage) + >>> arr.type + RangeIncType(extension) + + Returns + ------- + type : RangeIncType + """ + + cdef: + shared_ptr[CDataType] c_type = GetResultValue( + CRangeIncType.Make(value_type.sp_type, allow_unbounded)) + RangeIncType out = RangeIncType.__new__(RangeIncType) + out.init(c_type) + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type. From f3ec0b6bec86839ca0727c5332bb715fdda0d157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Thu, 4 Jun 2026 22:44:15 +0000 Subject: [PATCH 11/11] fix(format): rename range_test helper to avoid unity-build clash Under CMAKE_UNITY_BUILD (Windows CI), range_test.cc and opaque_test.cc are merged into one translation unit. Both declared a CheckDeserialize helper (range's in an anonymous namespace, opaque's in namespace arrow), making the unqualified call ambiguous and failing the MSVC build with C2668. Rename the range helper to CheckRangeDeserialize to remove the collision. 
--- cpp/src/arrow/extension/range_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc index be529d372be3..44cbf9958b3b 100644 --- a/cpp/src/arrow/extension/range_test.cc +++ b/cpp/src/arrow/extension/range_test.cc @@ -132,8 +132,8 @@ TEST(RangeType, CreateFromArray) { namespace { -void CheckDeserialize(const std::string& serialized, - const std::shared_ptr& expected) { +void CheckRangeDeserialize(const std::string& serialized, + const std::shared_ptr& expected) { auto type = checked_pointer_cast(expected); ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), serialized)); @@ -145,20 +145,20 @@ void CheckDeserialize(const std::string& serialized, TEST(RangeType, Deserialize) { // Normal JSON ASSERT_NO_FATAL_FAILURE( - CheckDeserialize(R"({"closed": "right"})", - extension::range(int32(), extension::RangeClosed::Right))); - ASSERT_NO_FATAL_FAILURE(CheckDeserialize( + CheckRangeDeserialize(R"({"closed": "right"})", + extension::range(int32(), extension::RangeClosed::Right))); + ASSERT_NO_FATAL_FAILURE(CheckRangeDeserialize( R"({"closed": "left"})", extension::range(int32(), extension::RangeClosed::Left))); - ASSERT_NO_FATAL_FAILURE(CheckDeserialize( + ASSERT_NO_FATAL_FAILURE(CheckRangeDeserialize( R"({"closed": "both"})", extension::range(int32(), extension::RangeClosed::Both))); ASSERT_NO_FATAL_FAILURE( - CheckDeserialize(R"({"closed": "neither"})", - extension::range(int32(), extension::RangeClosed::Neither))); + CheckRangeDeserialize(R"({"closed": "neither"})", + extension::range(int32(), extension::RangeClosed::Neither))); // Extra fields are tolerated (forward-compatibility). 
ASSERT_NO_FATAL_FAILURE( - CheckDeserialize(R"({"closed": "right", "extra": 42})", - extension::range(int32(), extension::RangeClosed::Right))); + CheckRangeDeserialize(R"({"closed": "right", "extra": 42})", + extension::range(int32(), extension::RangeClosed::Right))); } TEST(RangeType, DefaultClosedIsLeft) {