diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 6e0b1ce5b96..974e2ae1258 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -370,7 +370,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); - bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; + bool may_have_null = !statistics.HasNullCount() || statistics.NullCount().value() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { // If there are no non-null values, column `field_ref` in the fragment diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 696bda19359..0534dca52ef 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -941,9 +941,8 @@ TEST(TestParquetStatistics, NoNullCount) { ::parquet::EncodedStatistics encoded_stats; encoded_stats.set_min(int32_to_parquet_stats(1)); encoded_stats.set_max(int32_to_parquet_stats(100)); - encoded_stats.has_null_count = false; encoded_stats.all_null_value = false; - encoded_stats.null_count = 0; + encoded_stats.null_count = std::nullopt; auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10); auto stat_expression = @@ -956,7 +955,6 @@ TEST(TestParquetStatistics, NoNullCount) { // Special case: when num_value is 0, it would return // "is_null". ::parquet::EncodedStatistics encoded_stats; - encoded_stats.has_null_count = true; encoded_stats.null_count = 1; encoded_stats.all_null_value = true; auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); @@ -965,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) { ASSERT_TRUE(stat_expression.has_value()); EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); - encoded_stats.has_null_count = false; + encoded_stats.null_count = std::nullopt; encoded_stats.all_null_value = false; stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e2384972cf5..a4f0e7d344e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4735,8 +4735,8 @@ TEST_P(TestArrowWriteDictionary, Statistics) { auto expect_has_min_max = expected_has_min_max_by_page[case_index][row_group_index][page_index]; - EXPECT_EQ(stats.has_min, expect_has_min_max); - EXPECT_EQ(stats.has_max, expect_has_min_max); + EXPECT_EQ(stats.HasMin(), expect_has_min_max); + EXPECT_EQ(stats.HasMax(), expect_has_min_max); if (expect_has_min_max) { EXPECT_EQ(stats.min(), expected_min_by_page[case_index][row_group_index][page_index]); diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 27a76fd72be..6f7990af0d0 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -89,7 +89,7 @@ TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) { auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); std::shared_ptr metadata = parquet_reader->metadata(); std::shared_ptr stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics(); - EXPECT_EQ(stats->null_count(), GetParam().expected_null_count); + EXPECT_EQ(stats->NullCount(), GetParam().expected_null_count); EXPECT_EQ(stats->num_values(), GetParam().expected_value_count); ASSERT_TRUE(stats->HasMinMax()); EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min); diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 12f36fe39cf..152f35f8e29 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -169,8 +169,8 @@ template Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - *min = ::arrow::MakeScalar(static_cast(statistics.min())); - *max = ::arrow::MakeScalar(static_cast(statistics.max())); + *min = ::arrow::MakeScalar(static_cast(statistics.Min().value())); + *max = ::arrow::MakeScalar(static_cast(statistics.Max().value())); return Status::OK(); } @@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics, std::shared_ptr type, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min())); - ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max())); + ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.Min().value())); + ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.Max().value())); return Status::OK(); } @@ -227,8 +227,8 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), - logical_type, min, max); + return ExtractDecimalMinMaxFromInteger( + statistics.Min().value(), statistics.Max().value(), logical_type, min, max); default: break; } @@ -252,8 +252,8 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), - logical_type, min, max); + return ExtractDecimalMinMaxFromInteger( + statistics.Min().value(), statistics.Max().value(), logical_type, min, max); default: break; } @@ -384,13 +384,13 @@ void AttachStatistics(::arrow::ArrayData* data, } if (statistics) { if (statistics->HasDistinctCount()) { - array_statistics->distinct_count = statistics->distinct_count(); + array_statistics->distinct_count = statistics->DistinctCount().value(); } if (statistics->HasMinMax()) { const auto* typed_statistics = checked_cast*>(statistics.get()); - const ArrowCType min = typed_statistics->min(); - const ArrowCType max = typed_statistics->max(); + const ArrowCType min = typed_statistics->Min().value(); + const ArrowCType max = typed_statistics->Max().value(); if constexpr (std::is_same_v) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 797d435e73e..4301c256155 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size, // page_stats.null_count is not set when page_statistics_ is nullptr. It is only used // here for safety check. - DCHECK(!page_stats.has_null_count || page_stats.null_count == null_count); + DCHECK(!page_stats.HasNullCount() || page_stats.null_count == null_count); // Write the page to OutputStream eagerly if there is no dictionary or // if dictionary encoding has fallen back to PLAIN diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 157e73ffec4..3f11b9a1dbd 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { auto metadata_accessor = ColumnChunkMetaData::Make( metadata_->contents(), this->descr_, default_reader_properties(), &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); - return {encoded_stats.has_min, encoded_stats.has_max}; + return {encoded_stats.HasMin(), encoded_stats.HasMax()}; } std::vector metadata_encodings() { diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 7fa5e2f167e..f584fffe2bb 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats = template static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) { if (expected.statistics.__isset.max) { - ASSERT_EQ(expected.statistics.max, actual.max()); + ASSERT_EQ(expected.statistics.max, actual.Max()); } if (expected.statistics.__isset.min) { - ASSERT_EQ(expected.statistics.min, actual.min()); + ASSERT_EQ(expected.statistics.min, actual.Min()); } if (expected.statistics.__isset.null_count) { ASSERT_EQ(expected.statistics.null_count, actual.null_count); @@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) { CheckDataPageHeader(this->data_page_headers_[i], current_page.get())); auto data_page = static_cast(current_page.get()); const EncodedStatistics encoded_statistics = data_page->statistics(); - ASSERT_EQ(read_stats[i].max(), encoded_statistics.max()); - ASSERT_EQ(read_stats[i].min(), encoded_statistics.min()); + ASSERT_EQ(read_stats[i].Max(), encoded_statistics.Max()); + ASSERT_EQ(read_stats[i].Min(), encoded_statistics.Min()); ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count); ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count); ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 505ace275b1..45ccb02b8fe 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -101,24 +101,40 @@ static std::shared_ptr MakeTypedColumnStats( metadata.statistics.__isset.is_max_value_exact ? std::optional(metadata.statistics.is_max_value_exact) : std::nullopt; + std::optional null_count = + metadata.statistics.__isset.null_count + ? std::optional(metadata.statistics.null_count) + : std::nullopt; + std::optional distinct_count = + metadata.statistics.__isset.distinct_count + ? std::optional(metadata.statistics.distinct_count) + : std::nullopt; + std::optional min_val = + metadata.statistics.__isset.min + ? std::optional(metadata.statistics.min) + : std::nullopt; + std::optional max_val = + metadata.statistics.__isset.max + ? std::optional(metadata.statistics.max) + : std::nullopt; // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return MakeStatistics( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - metadata.statistics.__isset.null_count, - metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); + std::optional min_value = + metadata.statistics.__isset.min_value + ? std::optional(metadata.statistics.min_value) + : std::nullopt; + std::optional max_value = + metadata.statistics.__isset.max_value + ? std::optional(metadata.statistics.max_value) + : std::nullopt; + return MakeStatistics(descr, min_value, max_value, + metadata.num_values - null_count.value_or(0), null_count, + distinct_count, min_exact, max_exact, pool); } // Default behavior - return MakeStatistics( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max && metadata.statistics.__isset.min, - metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count, - min_exact, max_exact, pool); + return MakeStatistics(descr, min_val, max_val, + metadata.num_values - null_count.value_or(0), null_count, + distinct_count, min_exact, max_exact, pool); } namespace { @@ -1610,7 +1626,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) - bool max_equals_min = statistics.has_min && statistics.has_max + bool max_equals_min = statistics.HasMin() && statistics.HasMax() ? statistics.min() == statistics.max() : false; if (SortOrder::SIGNED != sort_order && !max_equals_min) { diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179c..842594cb2c7 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -154,18 +154,18 @@ TEST(Metadata, TestBuildAccess) { auto rg1_column2 = rg1_accessor->ColumnChunk(1); ASSERT_EQ(true, rg1_column1->is_stats_set()); ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min()); - ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); - ASSERT_EQ(0, rg1_column1->statistics()->null_count()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(stats_float.Min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.Max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.Min(), rg1_column2->encoded_statistics()->Min()); + ASSERT_EQ(stats_float.Max(), rg1_column2->encoded_statistics()->Max()); + ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min()); + ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max()); + ASSERT_EQ(0, rg1_column1->statistics()->NullCount()); + ASSERT_EQ(0, rg1_column2->statistics()->NullCount()); + ASSERT_EQ(nrows, rg1_column1->statistics()->DistinctCount()); + ASSERT_EQ(nrows, rg1_column2->statistics()->DistinctCount()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); ASSERT_EQ(nrows / 2, rg1_column1->num_values()); @@ -205,18 +205,18 @@ TEST(Metadata, TestBuildAccess) { auto rg2_column2 = rg2_accessor->ColumnChunk(1); ASSERT_EQ(true, rg2_column1->is_stats_set()); ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min()); - ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); - ASSERT_EQ(0, rg2_column1->statistics()->null_count()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(stats_float.Min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.Max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.Min(), rg2_column2->encoded_statistics()->Min()); + ASSERT_EQ(stats_float.Max(), rg2_column2->encoded_statistics()->Max()); + ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min()); + ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max()); + ASSERT_EQ(0, rg2_column1->statistics()->NullCount()); + ASSERT_EQ(0, rg2_column2->statistics()->NullCount()); + ASSERT_EQ(nrows, rg2_column1->statistics()->DistinctCount()); + ASSERT_EQ(nrows, rg2_column2->statistics()->DistinctCount()); ASSERT_EQ(nrows / 2, rg2_column1->num_values()); ASSERT_EQ(nrows / 2, rg2_column2->num_values()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index c06fc77dc53..b850a923a4c 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.null_pages.emplace_back(true); column_index_.min_values.emplace_back(""); column_index_.max_values.emplace_back(""); - } else if (stats.has_min && stats.has_max) { + } else if (stats.HasMin() && stats.HasMax()) { const size_t page_ordinal = column_index_.null_pages.size(); non_null_page_indices_.emplace_back(page_ordinal); - column_index_.min_values.emplace_back(stats.min()); - column_index_.max_values.emplace_back(stats.max()); + column_index_.min_values.emplace_back(stats.Min().value()); + column_index_.max_values.emplace_back(stats.Max().value()); column_index_.null_pages.emplace_back(false); } else { /// This is a non-null page but it lacks of meaningful min/max values. @@ -524,8 +524,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { return; } - if (column_index_.__isset.null_counts && stats.has_null_count) { - column_index_.null_counts.emplace_back(stats.null_count); + if (column_index_.__isset.null_counts && stats.HasNullCount()) { + column_index_.null_counts.emplace_back(stats.null_count.value()); } else { column_index_.__isset.null_counts = false; column_index_.null_counts.clear(); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 3a7308c1c6b..b21265fbf0d 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -546,8 +546,8 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); - ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]); - ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]); + ASSERT_EQ(page_stats[i].Min().value_or(""), column_index->encoded_min_values()[i]); + ASSERT_EQ(page_stats[i].Max().value_or(""), column_index->encoded_max_values()[i]); if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); } @@ -813,11 +813,11 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_NE(nullptr, column_index); ASSERT_EQ(size_t{1}, column_index->null_pages().size()); ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); - ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]); - ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); - ASSERT_EQ(stats.has_null_count, column_index->has_null_counts()); - if (stats.has_null_count) { - ASSERT_EQ(stats.null_count, column_index->null_counts()[0]); + ASSERT_EQ(stats.Min(), column_index->encoded_min_values()[0]); + ASSERT_EQ(stats.Max(), column_index->encoded_max_values()[0]); + ASSERT_EQ(stats.HasNullCount(), column_index->has_null_counts()); + if (stats.HasNullCount()) { + ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]); } } diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index dfce57a00fc..feae3bf58eb 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -165,7 +165,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte } stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { - std::string min = stats->min(), max = stats->max(); + std::string min = stats->Min().value_or(""), max = stats->Max().value_or(""); std::string max_exact = stats->is_max_value_exact.has_value() ? (stats->is_max_value_exact.value() ? "true" : "false") @@ -174,8 +174,8 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte stats->is_min_value_exact.has_value() ? (stats->is_min_value_exact.value() ? "true" : "false") : "unknown"; - stream << ", Null Values: " << stats->null_count - << ", Distinct Values: " << stats->distinct_count << std::endl + stream << ", Null Values: " << stats->NullCount().value_or(0) + << ", Distinct Values: " << stats->DistinctCount().value_or(0) << std::endl << " Max (exact: " << max_exact << "): " << FormatStatValue(descr->physical_type(), max, descr->logical_type()) << ", Min (exact: " << min_exact << "): " @@ -335,11 +335,11 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected if (column_chunk->is_stats_set()) { stream << R"("True", "Stats": {)"; if (stats->HasNullCount()) { - stream << R"("NumNulls": ")" << stats->null_count() << "\""; + stream << R"("NumNulls": ")" << stats->NullCount().value() << "\""; } if (stats->HasDistinctCount()) { stream << ", " - << R"("DistinctValues": ")" << stats->distinct_count() << "\""; + << R"("DistinctValues": ")" << stats->DistinctCount().value() << "\""; } if (stats->HasMinMax()) { std::string min = stats->EncodeMin(), max = stats->EncodeMax(); diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 2e5f6fe37c4..c035bd92ec9 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -594,60 +595,72 @@ class TypedStatisticsImpl : public TypedStatistics { TypedStatisticsImpl::IncrementNullCount(null_count); SetDistinctCount(distinct_count); - Copy(min, &min_, min_buffer_.get()); - Copy(max, &max_, max_buffer_.get()); - has_min_max_ = true; + min_.emplace(); + max_.emplace(); + Copy(min, &min_.value(), min_buffer_.get()); + Copy(max, &max_.value(), max_buffer_.get()); statistics_.is_min_value_exact = true; statistics_.is_max_value_exact = true; } // Create stats from a thrift Statistics object. - TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, bool has_min_max, - bool has_null_count, bool has_distinct_count, MemoryPool* pool) + TypedStatisticsImpl(const ColumnDescriptor* descr, + std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, + std::optional distinct_count, MemoryPool* pool) : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, has_null_count, - has_distinct_count, + distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool) {} - TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, bool has_min_max, - bool has_null_count, bool has_distinct_count, + TypedStatisticsImpl(const ColumnDescriptor* descr, + std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, + std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::IncrementNumValues(num_values); - if (has_null_count) { - TypedStatisticsImpl::IncrementNullCount(null_count); + if (null_count.has_value()) { + TypedStatisticsImpl::IncrementNullCount(null_count.value()); } else { - has_null_count_ = false; + statistics_.null_count = std::nullopt; } - if (has_distinct_count) { - SetDistinctCount(distinct_count); + if (distinct_count.has_value()) { + SetDistinctCount(distinct_count.value()); } else { - has_distinct_count_ = false; + statistics_.distinct_count = std::nullopt; } - if (has_min_max) { - PlainDecode(encoded_min, &min_); - PlainDecode(encoded_max, &max_); + if (encoded_min.has_value() && encoded_max.has_value()) { + T min, max; + PlainDecode(encoded_min.value(), &min); + PlainDecode(encoded_max.value(), &max); + min_.emplace(); + max_.emplace(); + // Copy the decoded result to avoid dangling pointer problem + Copy(min, &min_.value(), min_buffer_.get()); + Copy(max, &max_.value(), max_buffer_.get()); statistics_.is_min_value_exact = is_min_value_exact; statistics_.is_max_value_exact = is_max_value_exact; + } else { + min_ = std::nullopt; + max_ = std::nullopt; } - - has_min_max_ = has_min_max; } - bool HasDistinctCount() const override { return has_distinct_count_; }; - bool HasMinMax() const override { return has_min_max_; } - bool HasNullCount() const override { return has_null_count_; }; + bool HasDistinctCount() const override { + return statistics_.distinct_count.has_value(); + }; + bool HasMin() const override { return min_.has_value(); } + bool HasMax() const override { return max_.has_value(); } + bool HasMinMax() const override { return HasMin() && HasMax(); } + bool HasNullCount() const override { return statistics_.null_count.has_value(); }; void IncrementNullCount(int64_t n) override { - statistics_.null_count += n; - has_null_count_ = true; + statistics_.null_count = NullCount().value_or(0) + n; } void IncrementNumValues(int64_t n) override { num_values_ += n; } @@ -674,14 +687,10 @@ class TypedStatisticsImpl : public TypedStatistics { } const auto& other = checked_cast(raw_other); + if (this->HasMinMax() != other.HasMinMax()) return false; + if (this->HasMinMax() && !MinMaxEqual(other)) return false; - if (has_min_max_ != other.has_min_max_) return false; - if (has_min_max_) { - if (!MinMaxEqual(other)) return false; - } - - return null_count() == other.null_count() && - distinct_count() == other.distinct_count() && + return NullCount() == other.NullCount() && DistinctCount() == other.DistinctCount() && num_values() == other.num_values() && is_min_value_exact() == other.is_min_value_exact() && is_max_value_exact() == other.is_max_value_exact(); @@ -700,27 +709,26 @@ class TypedStatisticsImpl : public TypedStatistics { void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); - // null_count is always valid when merging page statistics into - // column chunk statistics. - if (other.HasNullCount()) { - this->statistics_.null_count += other.null_count(); + // null_count is valid only if both sides have it. + if (this->HasNullCount() && other.HasNullCount()) { + this->statistics_.null_count = NullCount().value() + other.NullCount().value(); } else { - this->has_null_count_ = false; + this->statistics_.null_count = std::nullopt; } - if (has_distinct_count_ && other.HasDistinctCount() && - (distinct_count() == 0 || other.distinct_count() == 0)) { + if (this->HasDistinctCount() && other.HasDistinctCount() && + (DistinctCount().value() == 0 || other.DistinctCount().value() == 0)) { // We can merge distinct counts if either side is zero. statistics_.distinct_count = - std::max(statistics_.distinct_count, other.distinct_count()); + std::max(statistics_.DistinctCount().value(), other.DistinctCount().value()); } else { - // Otherwise clear has_distinct_count_ as distinct count cannot be merged. - this->has_distinct_count_ = false; + // Otherwise clear distinct_count as distinct count cannot be merged. + this->statistics_.distinct_count = std::nullopt; } // Do not clear min/max here if the other side does not provide // min/max which may happen when other is an empty stats or all // its values are null and/or NaN. if (other.HasMinMax()) { - SetMinMax(other.min(), other.max()); + SetMinMax(other.Min().value(), other.Max().value()); } } @@ -743,9 +751,13 @@ class TypedStatisticsImpl : public TypedStatistics { SetMinMaxPair(comparator_->GetMinMax(values)); } - const T& min() const override { return min_; } + const T& min() const override { return min_.value(); } + + std::optional Min() const override { return min_; } - const T& max() const override { return max_; } + const T& max() const override { return max_.value(); } + + std::optional Max() const override { return max_; } Type::type physical_type() const override { return descr_->physical_type(); } @@ -753,13 +765,13 @@ class TypedStatisticsImpl : public TypedStatistics { std::string EncodeMin() const override { std::string s; - if (HasMinMax()) this->PlainEncode(min_, &s); + if (HasMinMax()) this->PlainEncode(min_.value(), &s); return s; } std::string EncodeMax() const override { std::string s; - if (HasMinMax()) this->PlainEncode(max_, &s); + if (HasMinMax()) this->PlainEncode(max_.value(), &s); return s; } @@ -772,19 +784,26 @@ class TypedStatisticsImpl : public TypedStatistics { s.is_max_value_exact = this->is_max_value_exact(); } if (HasNullCount()) { - s.set_null_count(this->null_count()); + s.set_null_count(this->NullCount().value()); // num_values_ is reliable and it means number of non-null values. s.all_null_value = num_values_ == 0; } if (HasDistinctCount()) { - s.set_distinct_count(this->distinct_count()); + s.set_distinct_count(this->DistinctCount().value()); } return s; } - int64_t null_count() const override { return statistics_.null_count; } - int64_t distinct_count() const override { return statistics_.distinct_count; } + int64_t null_count() const override { return statistics_.null_count.value_or(0); } + int64_t distinct_count() const override { + return statistics_.distinct_count.value_or(0); + } int64_t num_values() const override { return num_values_; } + + std::optional NullCount() const override { return statistics_.null_count; } + std::optional DistinctCount() const override { + return statistics_.distinct_count; + } std::optional is_min_value_exact() const override { return statistics_.is_min_value_exact; } @@ -794,15 +813,12 @@ class TypedStatisticsImpl : public TypedStatistics { private: const ColumnDescriptor* descr_; - bool has_min_max_ = false; - bool has_null_count_ = false; - bool has_distinct_count_ = false; - T min_; - T max_; + std::optional min_; + std::optional max_; ::arrow::MemoryPool* pool_; // Number of non-null values. - // Please note that num_values_ is reliable when has_null_count_ is set. - // When has_null_count_ is not set, e.g. a page statistics created from + // Please note that num_values_ is reliable when null_count is set. + // When null_count is not set, e.g. a page statistics created from // a statistics thrift message which doesn't have the optional null_count, // `num_values_` may include null values. int64_t num_values_ = 0; @@ -812,14 +828,13 @@ class TypedStatisticsImpl : public TypedStatistics { LogicalType::Type::type logical_type_ = LogicalType::Type::NONE; void PlainEncode(const T& src, std::string* dst) const; - void PlainDecode(const std::string& src, T* dst) const; + void PlainDecode(std::string_view src, T* dst) const; void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; } void SetDistinctCount(int64_t n) { // distinct count can only be "set", and cannot be incremented. statistics_.distinct_count = n; - has_distinct_count_ = true; } void ResetCounts() { @@ -829,14 +844,15 @@ class TypedStatisticsImpl : public TypedStatistics { } void ResetHasFlags() { - // has_min_max_ will only be set when it meets any valid value. - this->has_min_max_ = false; - // has_distinct_count_ will only be set once SetDistinctCount() + // min_ and max_ will only be set when it meets any valid value. + this->min_ = std::nullopt; + this->max_ = std::nullopt; + // distinct_count will only be set once SetDistinctCount() // is called because distinct count calculation is not cheap and // disabled by default. - this->has_distinct_count_ = false; + this->statistics_.distinct_count = std::nullopt; // Null count calculation is cheap and enabled by default. - this->has_null_count_ = true; + this->statistics_.null_count = this->statistics_.null_count.value_or(0); } void SetMinMaxPair(std::pair min_max) { @@ -848,13 +864,16 @@ class TypedStatisticsImpl : public TypedStatistics { auto min = maybe_min_max.value().first; auto max = maybe_min_max.value().second; - if (!has_min_max_) { - has_min_max_ = true; - Copy(min, &min_, min_buffer_.get()); - Copy(max, &max_, max_buffer_.get()); + if (!HasMinMax()) { + min_.emplace(); + max_.emplace(); + Copy(min, &min_.value(), min_buffer_.get()); + Copy(max, &max_.value(), max_buffer_.get()); } else { - Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get()); - Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get()); + Copy(comparator_->Compare(min_.value(), min) ? min_.value() : min, &min_.value(), + min_buffer_.get()); + Copy(comparator_->Compare(max_.value(), max) ? max : max_.value(), &max_.value(), + max_buffer_.get()); } statistics_.is_min_value_exact = true; statistics_.is_max_value_exact = true; @@ -865,8 +884,11 @@ template <> inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); - return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 && - std::memcmp(max_.ptr, other.max_.ptr, len) == 0; + if (this->HasMin() != other.HasMin()) return false; + if (this->HasMin() && std::memcmp(Min()->ptr, other.Min()->ptr, len) != 0) return false; + if (this->HasMax() != other.HasMax()) return false; + if (this->HasMax() && std::memcmp(Max()->ptr, other.Max()->ptr, len) != 0) return false; + return true; } template @@ -933,9 +955,9 @@ void TypedStatisticsImpl::PlainEncode(const T& src, std::string* dst) con } template -void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { +void TypedStatisticsImpl::PlainDecode(std::string_view src, T* dst) const { auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_); - decoder->SetData(1, reinterpret_cast(src.c_str()), + decoder->SetData(1, reinterpret_cast(src.data()), static_cast(src.size())); int decoded_values = decoder->Decode(dst, 1); if (decoded_values != 1) { @@ -950,10 +972,9 @@ void TypedStatisticsImpl::PlainEncode(const T& src, } template <> -void TypedStatisticsImpl::PlainDecode(const std::string& src, - T* dst) const { +void TypedStatisticsImpl::PlainDecode(std::string_view src, T* dst) const { dst->len = static_cast(src.size()); - dst->ptr = reinterpret_cast(src.c_str()); + dst->ptr = reinterpret_cast(src.data()); } std::shared_ptr DoMakeComparator(Type::type physical_type, @@ -1076,10 +1097,8 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, int64_t num_values, ::arrow::MemoryPool* pool) { DCHECK(encoded_stats != nullptr); - return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values, + return Make(descr, encoded_stats->Min(), encoded_stats->Max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, - encoded_stats->has_min && encoded_stats->has_max, - encoded_stats->has_null_count, encoded_stats->has_distinct_count, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } @@ -1096,18 +1115,54 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, /*is_max_value_exact=*/std::nullopt, pool); } +std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, + std::optional encoded_min, + std::optional encoded_max, + int64_t num_values, + std::optional null_count, + std::optional distinct_count, + ::arrow::MemoryPool* pool) { + return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, + /*is_min_value_exact=*/std::nullopt, + /*is_max_value_exact=*/std::nullopt, pool); +} + std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, bool has_distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { + std::optional min = std::nullopt; + std::optional max = std::nullopt; + if (has_min_max) { + min = encoded_min; + max = encoded_max; + } + std::optional null_cnt = std::nullopt; + if (has_null_count) { + null_cnt = null_count; + } + std::optional distinct_cnt = std::nullopt; + if (has_distinct_count) { + distinct_cnt = distinct_count; + } + return Statistics::Make(descr, min, max, num_values, null_cnt, distinct_cnt, + is_min_value_exact, is_max_value_exact, pool); +} + +std::shared_ptr Statistics::Make( + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + std::optional is_min_value_exact, std::optional is_max_value_exact, + ::arrow::MemoryPool* pool) { #define MAKE_STATS(CAP_TYPE, KLASS) \ case Type::CAP_TYPE: \ return std::make_shared>( \ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \ - has_min_max, has_null_count, has_distinct_count, is_min_value_exact, \ - is_max_value_exact, pool) + is_min_value_exact, is_max_value_exact, pool) switch (descr->physical_type()) { MAKE_STATS(BOOLEAN, BooleanType); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index c80fb8e3b52..5b7fb2bad43 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "parquet/platform.h" @@ -119,25 +120,34 @@ std::shared_ptr> MakeComparator(const ColumnDescriptor* d /// \brief Structure represented encoded statistics to be written to /// and read from Parquet serialized metadata. class PARQUET_EXPORT EncodedStatistics { - std::string max_, min_; + std::optional max_, min_; bool is_signed_ = false; public: EncodedStatistics() = default; - const std::string& max() const { return max_; } - const std::string& min() const { return min_; } + + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead.") + const std::string& max() const { return max_.value(); } + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead.") + const std::string& min() const { return min_.value(); } + + std::optional Max() const { return max_; } + std::optional Min() const { return min_; } std::optional is_max_value_exact; std::optional is_min_value_exact; - int64_t null_count = 0; - int64_t distinct_count = 0; + std::optional null_count; + std::optional distinct_count; - bool has_min = false; - bool has_max = false; - bool has_null_count = false; - bool has_distinct_count = false; + bool HasMax() const { return max_.has_value(); } + bool HasMin() const { return min_.has_value(); } + bool HasNullCount() const { return null_count.has_value(); } + bool HasDistinctCount() const { return distinct_count.has_value(); } + + std::optional DistinctCount() const { return distinct_count; } + std::optional NullCount() const { return null_count; } // When all values in the statistics are null, it is set to true. // Otherwise, at least one value is not null, or we are not sure at all. @@ -151,28 +161,24 @@ class PARQUET_EXPORT EncodedStatistics { // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. void ApplyStatSizeLimits(size_t length) { - if (max_.length() > length) { - has_max = false; - max_.clear(); + if (HasMax() && max_->length() > length) { + max_ = std::nullopt; is_max_value_exact = std::nullopt; } - if (min_.length() > length) { - has_min = false; - min_.clear(); + if (HasMin() && min_->length() > length) { + min_ = std::nullopt; is_min_value_exact = std::nullopt; } } // Clear Min Max. void ClearMinMax() { - has_max = false; - max_.clear(); - has_min = false; - min_.clear(); + max_ = std::nullopt; + min_ = std::nullopt; } bool is_set() const { - return has_min || has_max || has_null_count || has_distinct_count; + return HasMin() || HasMax() || HasNullCount() || HasDistinctCount(); } bool is_signed() const { return is_signed_; } @@ -181,25 +187,21 @@ class PARQUET_EXPORT EncodedStatistics { EncodedStatistics& set_max(std::string value) { max_ = std::move(value); - has_max = true; return *this; } EncodedStatistics& set_min(std::string value) { min_ = std::move(value); - has_min = true; return *this; } EncodedStatistics& set_null_count(int64_t value) { null_count = value; - has_null_count = true; return *this; } EncodedStatistics& set_distinct_count(int64_t value) { distinct_count = value; - has_distinct_count = true; return *this; } }; @@ -217,6 +219,21 @@ class PARQUET_EXPORT Statistics { const ColumnDescriptor* descr, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema + /// definition and preexisting state + /// \param[in] descr the column schema + /// \param[in] encoded_min the encoded minimum value + /// \param[in] encoded_max the encoded maximum value + /// \param[in] num_values total number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + /// \param[in] pool a memory pool to use for any memory allocations, optional + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema /// definition and preexisting state /// \param[in] descr the column schema @@ -229,6 +246,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] has_null_count whether the null_count statistics are set /// \param[in] has_distinct_count whether the distinct_count statistics are set /// \param[in] pool a memory pool to use for any memory allocations, optional + /// \deprecated Deprecated in 24.0.0. Use std::optional version instead. + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, @@ -236,6 +255,24 @@ class PARQUET_EXPORT Statistics { bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema + /// definition and preexisting state + /// \param[in] descr the column schema + /// \param[in] encoded_min the encoded minimum value + /// \param[in] encoded_max the encoded maximum value + /// \param[in] num_values total number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + /// \param[in] is_min_value_exact whether the min value is exact + /// \param[in] is_max_value_exact whether the max value is exact + /// \param[in] pool a memory pool to use for any memory allocations, optional + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + std::optional is_min_value_exact, std::optional is_max_value_exact, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema /// definition and preexisting state /// \param[in] descr the column schema @@ -250,6 +287,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] is_min_value_exact whether the min value is exact /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional + /// \deprecated Deprecated in 24.0.0. Use std::optional version instead. + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, @@ -270,17 +309,31 @@ class PARQUET_EXPORT Statistics { virtual bool HasNullCount() const = 0; /// \brief The number of null values, may not be set + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use NullCount() instead.") virtual int64_t null_count() const = 0; + /// \brief The number of null values, may not be set + virtual std::optional NullCount() const = 0; + /// \brief Return true if the count of distinct values is set virtual bool HasDistinctCount() const = 0; /// \brief The number of distinct values, may not be set + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use DistinctCount() instead.") virtual int64_t distinct_count() const = 0; + /// \brief The number of distinct values, may not be set + virtual std::optional DistinctCount() const = 0; + /// \brief The number of non-null values in the column virtual int64_t num_values() const = 0; + /// \brief Return true if the minimum value statistic is set. + virtual bool HasMin() const = 0; + + /// \brief Return true if the maximum value statistic is set. + virtual bool HasMax() const = 0; + /// \brief Return true if both min and max statistics are set. Obtain /// with TypedStatistics::min and max virtual bool HasMinMax() const = 0; @@ -327,11 +380,19 @@ class TypedStatistics : public Statistics { using T = typename DType::c_type; /// \brief The current minimum value + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead.") virtual const T& min() const = 0; + /// \brief The current minimum value + virtual std::optional Min() const = 0; + /// \brief The current maximum value + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead.") virtual const T& max() const = 0; + /// \brief The current maximum value + virtual std::optional Max() const = 0; + /// \brief Update state with state of another Statistics object virtual void Merge(const TypedStatistics& other) = 0; @@ -412,6 +473,19 @@ std::shared_ptr> MakeStatistics(const typename DType::c_t /// \brief Typed version of Statistics::Make template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>(Statistics::Make( + descr, encoded_min, encoded_max, num_values, null_count, distinct_count, + /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); +} + +/// \brief Typed version of Statistics::Make +template +PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, @@ -425,6 +499,20 @@ std::shared_ptr> MakeStatistics( /// \brief Typed version of Statistics::Make template +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + std::optional is_min_value_exact, std::optional is_max_value_exact, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, is_min_value_exact, is_max_value_exact, pool)); +} + +/// \brief Typed version of Statistics::Make +template +PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..86fc2c8e975 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -323,7 +323,6 @@ class TestStatistics : public PrimitiveTypedTest { auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true, /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); auto statistics3 = MakeStatistics(this->schema_.Column(0)); @@ -335,26 +334,25 @@ class TestStatistics : public PrimitiveTypedTest { std::string encoded_max_spaced = statistics3->EncodeMax(); // Use old API without is_{min/max}_value_exact - auto statistics4 = MakeStatistics( - this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), - /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true); + auto statistics4 = MakeStatistics(this->schema_.Column(0), encoded_min, + encoded_max, this->values_.size(), + /*null_count=*/0, /*distinct_count=*/0); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); - ASSERT_EQ(statistics1->min(), statistics2->min()); - ASSERT_EQ(statistics1->max(), statistics2->max()); + ASSERT_EQ(statistics1->Min(), statistics2->Min()); + ASSERT_EQ(statistics1->Max(), statistics2->Max()); ASSERT_EQ(statistics1->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics1->is_max_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics2->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics2->is_max_value_exact(), std::make_optional(true)); ASSERT_EQ(encoded_min_spaced, statistics2->EncodeMin()); ASSERT_EQ(encoded_max_spaced, statistics2->EncodeMax()); - ASSERT_EQ(statistics3->min(), statistics2->min()); - ASSERT_EQ(statistics3->max(), statistics2->max()); + ASSERT_EQ(statistics3->Min(), statistics2->Min()); + ASSERT_EQ(statistics3->Max(), statistics2->Max()); ASSERT_EQ(statistics3->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics3->is_max_value_exact(), std::make_optional(true)); - ASSERT_EQ(statistics4->min(), statistics2->min()); - ASSERT_EQ(statistics4->max(), statistics2->max()); + ASSERT_EQ(statistics4->Min(), statistics2->Min()); + ASSERT_EQ(statistics4->Max(), statistics2->Max()); ASSERT_EQ(statistics4->is_min_value_exact(), std::nullopt); ASSERT_EQ(statistics4->is_max_value_exact(), std::nullopt); } @@ -370,9 +368,9 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_TRUE(statistics->HasNullCount()); ASSERT_FALSE(statistics->HasMinMax()); ASSERT_FALSE(statistics->HasDistinctCount()); - ASSERT_EQ(0, statistics->null_count()); + ASSERT_EQ(0, statistics->NullCount()); ASSERT_EQ(0, statistics->num_values()); - ASSERT_EQ(0, statistics->distinct_count()); + ASSERT_EQ(std::nullopt, statistics->DistinctCount()); ASSERT_EQ("", statistics->EncodeMin()); ASSERT_EQ("", statistics->EncodeMax()); } @@ -395,10 +393,10 @@ class TestStatistics : public PrimitiveTypedTest { total->Merge(*statistics1); total->Merge(*statistics2); - ASSERT_EQ(num_null[0] + num_null[1], total->null_count()); + ASSERT_EQ(num_null[0] + num_null[1], total->NullCount()); ASSERT_EQ(this->values_.size() * 2 - num_null[0] - num_null[1], total->num_values()); - ASSERT_EQ(total->min(), std::min(statistics1->min(), statistics2->min())); - ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max())); + ASSERT_EQ(total->Min(), std::min(statistics1->Min(), statistics2->Min())); + ASSERT_EQ(total->Max(), std::max(statistics1->Max(), statistics2->Max())); } void TestEquals() { @@ -464,7 +462,7 @@ class TestStatistics : public PrimitiveTypedTest { if (!column_chunk->is_stats_set()) return; std::shared_ptr stats = column_chunk->statistics(); // check values after serialization + deserialization - EXPECT_EQ(null_count, stats->null_count()); + EXPECT_EQ(null_count, stats->NullCount()); EXPECT_EQ(num_values - null_count, stats->num_values()); EXPECT_TRUE(expected_stats->HasMinMax()); EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin()); @@ -472,10 +470,10 @@ class TestStatistics : public PrimitiveTypedTest { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); EXPECT_EQ(null_count, enc_stats->null_count); - EXPECT_TRUE(enc_stats->has_min); - EXPECT_TRUE(enc_stats->has_max); - EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min()); - EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max()); + EXPECT_TRUE(enc_stats->HasMin()); + EXPECT_TRUE(enc_stats->HasMax()); + EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->Min()); + EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->Max()); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true)); } @@ -563,23 +561,21 @@ void TestStatistics::TestMinMaxEncode() { // encoded is same as unencoded ASSERT_EQ(encoded_min, - std::string(reinterpret_cast(statistics1->min().ptr), - statistics1->min().len)); + std::string(reinterpret_cast(statistics1->Min()->ptr), + statistics1->Min()->len)); ASSERT_EQ(encoded_max, - std::string(reinterpret_cast(statistics1->max().ptr), - statistics1->max().len)); + std::string(reinterpret_cast(statistics1->Max()->ptr), + statistics1->Max()->len)); auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), - /*null_count=*/0, - /*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true, - /*has_distinct_count=*/true, /*is_min_value_exact=*/true, - /*is_max_value_exact=*/true); + /*null_count=*/0, /*distinct_count=*/0, + /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); - ASSERT_EQ(statistics1->min(), statistics2->min()); - ASSERT_EQ(statistics1->max(), statistics2->max()); + ASSERT_EQ(statistics1->Min(), statistics2->Min()); + ASSERT_EQ(statistics1->Max(), statistics2->Max()); } using Types = ::testing::Types { const std::vector>& subsequent) { EncodedStatistics encoded_statistics; if (initial) { - encoded_statistics.has_distinct_count = true; encoded_statistics.distinct_count = *initial; } std::shared_ptr> statistics = @@ -649,7 +644,6 @@ class TestStatisticsHasFlag : public TestStatistics { for (const auto& distinct_count : subsequent) { EncodedStatistics next_encoded_statistics; if (distinct_count) { - next_encoded_statistics.has_distinct_count = true; next_encoded_statistics.distinct_count = *distinct_count; } std::shared_ptr> next_statistics = @@ -659,10 +653,10 @@ class TestStatisticsHasFlag : public TestStatistics { statistics->Merge(*next_statistics); } EncodedStatistics final_statistics = statistics->Encode(); - EXPECT_EQ(statistics->HasDistinctCount(), final_statistics.has_distinct_count); + EXPECT_EQ(statistics->HasDistinctCount(), final_statistics.HasDistinctCount()); if (statistics->HasDistinctCount()) { - EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count); - return statistics->distinct_count(); + EXPECT_EQ(statistics->DistinctCount(), final_statistics.distinct_count); + return statistics->DistinctCount(); } return std::nullopt; } @@ -713,8 +707,8 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/this->values_.size()); auto encoded_stats1 = statistics1->Encode(); EXPECT_FALSE(statistics1->HasMinMax()); - EXPECT_FALSE(encoded_stats1.has_min); - EXPECT_FALSE(encoded_stats1.has_max); + EXPECT_FALSE(encoded_stats1.HasMin()); + EXPECT_FALSE(encoded_stats1.HasMax()); EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt); EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt); } @@ -725,16 +719,16 @@ class TestStatisticsHasFlag : public TestStatistics { statistics2->Update(this->values_ptr_, this->values_.size(), 0); auto encoded_stats2 = statistics2->Encode(); EXPECT_TRUE(statistics2->HasMinMax()); - EXPECT_TRUE(encoded_stats2.has_min); - EXPECT_TRUE(encoded_stats2.has_max); + EXPECT_TRUE(encoded_stats2.HasMin()); + EXPECT_TRUE(encoded_stats2.HasMax()); EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true)); } VerifyMergedStatistics(*statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasMinMax()); - EXPECT_TRUE(merged_statistics->Encode().has_min); - EXPECT_TRUE(merged_statistics->Encode().has_max); + EXPECT_TRUE(merged_statistics->Encode().HasMin()); + EXPECT_TRUE(merged_statistics->Encode().HasMax()); EXPECT_EQ(merged_statistics->Encode().is_min_value_exact, std::make_optional(true)); EXPECT_EQ(merged_statistics->Encode().is_max_value_exact, @@ -757,16 +751,16 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/0); auto encoded_stats1 = statistics1->Encode(); EXPECT_TRUE(statistics1->HasNullCount()); - EXPECT_EQ(0, statistics1->null_count()); - EXPECT_TRUE(statistics1->Encode().has_null_count); + EXPECT_EQ(0, statistics1->NullCount()); + EXPECT_TRUE(statistics1->Encode().HasNullCount()); } // Merge with null-count should also have null count VerifyMergedStatistics(*statistics1, *statistics1, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasNullCount()); - EXPECT_EQ(0, merged_statistics->null_count()); + EXPECT_EQ(0, merged_statistics->NullCount()); auto encoded = merged_statistics->Encode(); - EXPECT_TRUE(encoded.has_null_count); + EXPECT_TRUE(encoded.HasNullCount()); EXPECT_EQ(0, encoded.null_count); }); @@ -774,11 +768,11 @@ class TestStatisticsHasFlag : public TestStatistics { std::shared_ptr> statistics2; { EncodedStatistics encoded_statistics2; - encoded_statistics2.has_null_count = false; + encoded_statistics2.null_count = std::nullopt; statistics2 = std::dynamic_pointer_cast>( Statistics::Make(this->schema_.Column(0), &encoded_statistics2, /*num_values=*/1000)); - EXPECT_FALSE(statistics2->Encode().has_null_count); + EXPECT_FALSE(statistics2->Encode().HasNullCount()); EXPECT_FALSE(statistics2->HasNullCount()); } @@ -786,7 +780,7 @@ class TestStatisticsHasFlag : public TestStatistics { VerifyMergedStatistics(*statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_FALSE(merged_statistics->HasNullCount()); - EXPECT_FALSE(merged_statistics->Encode().has_null_count); + EXPECT_FALSE(merged_statistics->Encode().HasNullCount()); }); } @@ -794,17 +788,17 @@ class TestStatisticsHasFlag : public TestStatistics { // If statistics doesn't have null count, all_null_value should be false. void TestMissingNullCount() { EncodedStatistics encoded_statistics; - encoded_statistics.has_null_count = false; + encoded_statistics.null_count = std::nullopt; auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics, /*num_values=*/1000); auto typed_stats = std::dynamic_pointer_cast>(statistics); EXPECT_FALSE(typed_stats->HasNullCount()); auto encoded = typed_stats->Encode(); EXPECT_FALSE(encoded.all_null_value); - EXPECT_FALSE(encoded.has_null_count); - EXPECT_FALSE(encoded.has_distinct_count); - EXPECT_FALSE(encoded.has_min); - EXPECT_FALSE(encoded.has_max); + EXPECT_FALSE(encoded.HasNullCount()); + EXPECT_FALSE(encoded.HasDistinctCount()); + EXPECT_FALSE(encoded.HasMin()); + EXPECT_FALSE(encoded.HasMax()); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); EXPECT_FALSE(encoded.is_max_value_exact.has_value()); } @@ -1001,8 +995,8 @@ class TestStatisticsSortOrder : public ::testing::Test { ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].Min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].Max(), cc_metadata->statistics()->EncodeMax()); EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true)); EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true)); } @@ -1242,9 +1236,9 @@ void TestByteArrayStatisticsFromArrow() { auto stats = MakeStatistics(&descr); ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); - ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); - ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); - ASSERT_EQ(2, stats->null_count()); + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->Min()); + ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->Max()); + ASSERT_EQ(2, stats->NullCount()); } TEST(TestByteArrayStatisticsFromArrow, StringType) { @@ -1419,12 +1413,12 @@ class TestFloatStatistics : public ::testing::Test { stats->Update(values.data(), values.size(), /*null_count=*/0); ASSERT_TRUE(stats->HasMinMax()); - this->CheckEq(stats->min(), positive_zero_); - ASSERT_TRUE(this->signbit(stats->min())); + this->CheckEq(stats->Min().value(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->Min().value())); ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); - this->CheckEq(stats->max(), positive_zero_); - ASSERT_FALSE(this->signbit(stats->max())); + this->CheckEq(stats->Max().value(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->Max().value())); ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_)); } @@ -1653,10 +1647,10 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_TRUE(column_chunk->is_stats_set()); std::shared_ptr enc_stats = column_chunk->encoded_statistics(); - ASSERT_TRUE(enc_stats->has_null_count); - ASSERT_FALSE(enc_stats->has_distinct_count); - ASSERT_FALSE(enc_stats->has_min); - ASSERT_FALSE(enc_stats->has_max); + ASSERT_TRUE(enc_stats->HasNullCount()); + ASSERT_FALSE(enc_stats->HasDistinctCount()); + ASSERT_FALSE(enc_stats->HasMin()); + ASSERT_FALSE(enc_stats->HasMax()); ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); @@ -1684,7 +1678,7 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) { std::shared_ptr stats = column_chunk->statistics(); ASSERT_TRUE(stats != NULL); - ASSERT_EQ(0, stats->null_count()); + ASSERT_EQ(0, stats->NullCount()); ASSERT_EQ(12, stats->num_values()); ASSERT_EQ(0x00, stats->EncodeMin()[0]); ASSERT_EQ(0x0b, stats->EncodeMax()[0]); @@ -1723,32 +1717,32 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { column_chunk->encoded_statistics(); ASSERT_TRUE(encoded_statistics != NULL); ASSERT_EQ(0, encoded_statistics->null_count); - EXPECT_EQ("Al", encoded_statistics->min()); + EXPECT_EQ("Al", encoded_statistics->Min()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value()); switch (num_column) { case 2: // Max couldn't truncate the utf-8 string longer than 2 bytes - EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max()); + EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 3: // Max couldn't truncate 0xFFFF binary string - EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max()); + EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 4: case 5: // Min and Max are not truncated, fit on 2 bytes - EXPECT_EQ("Ke", encoded_statistics->max()); + EXPECT_EQ("Ke", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.value()); break; default: // Max truncated to 2 bytes on columns 0 and 1 - EXPECT_EQ("Kf", encoded_statistics->max()); + EXPECT_EQ("Kf", encoded_statistics->Max()); ASSERT_FALSE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); } @@ -1758,12 +1752,12 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { TEST(TestEncodedStatistics, CopySafe) { EncodedStatistics encoded_statistics; encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.has_max); + ASSERT_TRUE(encoded_statistics.HasMax()); encoded_statistics.is_max_value_exact = true; ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value()); encoded_statistics.set_min("abc"); - ASSERT_TRUE(encoded_statistics.has_min); + ASSERT_TRUE(encoded_statistics.HasMin()); encoded_statistics.is_min_value_exact = true; ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value()); @@ -1773,8 +1767,8 @@ TEST(TestEncodedStatistics, CopySafe) { copy_statistics.is_max_value_exact = false; copy_statistics.is_min_value_exact = false; - EXPECT_EQ("abc", encoded_statistics.min()); - EXPECT_EQ("abc", encoded_statistics.max()); + EXPECT_EQ("abc", encoded_statistics.Min()); + EXPECT_EQ("abc", encoded_statistics.Max()); EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true)); } @@ -1782,16 +1776,16 @@ TEST(TestEncodedStatistics, CopySafe) { TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EncodedStatistics encoded_statistics; encoded_statistics.set_min("a"); - ASSERT_TRUE(encoded_statistics.has_min); + ASSERT_TRUE(encoded_statistics.HasMin()); encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.has_max); + ASSERT_TRUE(encoded_statistics.HasMax()); encoded_statistics.ApplyStatSizeLimits(2); - ASSERT_TRUE(encoded_statistics.has_min); - ASSERT_EQ("a", encoded_statistics.min()); - ASSERT_FALSE(encoded_statistics.has_max); + ASSERT_TRUE(encoded_statistics.HasMin()); + ASSERT_EQ("a", encoded_statistics.Min()); + ASSERT_FALSE(encoded_statistics.HasMax()); NodePtr node = PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 1ffe99eb3c9..4d5b184dc16 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -469,33 +469,33 @@ static inline format::GeospatialStatistics ToThrift( static inline format::Statistics ToThrift(const EncodedStatistics& stats) { format::Statistics statistics; - if (stats.has_min) { - statistics.__set_min_value(stats.min()); + if (stats.HasMin()) { + statistics.__set_min_value(stats.Min().value()); if (stats.is_min_value_exact.has_value()) { statistics.__set_is_min_value_exact(stats.is_min_value_exact.value()); } // If the order is SIGNED, then the old min value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_min(stats.min()); + statistics.__set_min(stats.Min().value()); } } - if (stats.has_max) { - statistics.__set_max_value(stats.max()); + if (stats.HasMax()) { + statistics.__set_max_value(stats.Max().value()); if (stats.is_max_value_exact.has_value()) { statistics.__set_is_max_value_exact(stats.is_max_value_exact.value()); } // If the order is SIGNED, then the old max value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_max(stats.max()); + statistics.__set_max(stats.Max().value()); } } - if (stats.has_null_count) { - statistics.__set_null_count(stats.null_count); + if (stats.HasNullCount()) { + statistics.__set_null_count(stats.null_count.value()); } - if (stats.has_distinct_count) { - statistics.__set_distinct_count(stats.distinct_count); + if (stats.HasDistinctCount()) { + statistics.__set_distinct_count(stats.distinct_count.value()); } return statistics;