Skip to content
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/file_parquet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);

bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0;
bool may_have_null = !statistics.HasNullCount() || statistics.NullCount().value() > 0;
// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0) {
// If there are no non-null values, column `field_ref` in the fragment
Expand Down
6 changes: 2 additions & 4 deletions cpp/src/arrow/dataset/file_parquet_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -941,9 +941,8 @@ TEST(TestParquetStatistics, NoNullCount) {
::parquet::EncodedStatistics encoded_stats;
encoded_stats.set_min(int32_to_parquet_stats(1));
encoded_stats.set_max(int32_to_parquet_stats(100));
encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
encoded_stats.null_count = 0;
encoded_stats.null_count = std::nullopt;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);

auto stat_expression =
Expand All @@ -956,7 +955,6 @@ TEST(TestParquetStatistics, NoNullCount) {
// Special case: when num_values is 0, the resulting expression is
// "is_null".
::parquet::EncodedStatistics encoded_stats;
encoded_stats.has_null_count = true;
encoded_stats.null_count = 1;
encoded_stats.all_null_value = true;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
Expand All @@ -965,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) {
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");

encoded_stats.has_null_count = false;
encoded_stats.null_count = std::nullopt;
encoded_stats.all_null_value = false;
stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4735,8 +4735,8 @@ TEST_P(TestArrowWriteDictionary, Statistics) {

auto expect_has_min_max =
expected_has_min_max_by_page[case_index][row_group_index][page_index];
EXPECT_EQ(stats.has_min, expect_has_min_max);
EXPECT_EQ(stats.has_max, expect_has_min_max);
EXPECT_EQ(stats.HasMin(), expect_has_min_max);
EXPECT_EQ(stats.HasMax(), expect_has_min_max);
if (expect_has_min_max) {
EXPECT_EQ(stats.min(),
expected_min_by_page[case_index][row_group_index][page_index]);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
EXPECT_EQ(stats->NullCount(), GetParam().expected_null_count);
EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
Expand Down
22 changes: 11 additions & 11 deletions cpp/src/parquet/arrow/reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ template <typename CType, typename StatisticsType>
Status MakeMinMaxScalar(const StatisticsType& statistics,
std::shared_ptr<::arrow::Scalar>* min,
std::shared_ptr<::arrow::Scalar>* max) {
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.Min().value()));
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.Max().value()));
return Status::OK();
}

Expand All @@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
std::shared_ptr<DataType> type,
std::shared_ptr<::arrow::Scalar>* min,
std::shared_ptr<::arrow::Scalar>* max) {
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.Min().value()));
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.Max().value()));
return Status::OK();
}

Expand Down Expand Up @@ -227,8 +227,8 @@ static Status FromInt32Statistics(const Int32Statistics& statistics,
case LogicalType::Type::NONE:
return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
case LogicalType::Type::DECIMAL:
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
logical_type, min, max);
return ExtractDecimalMinMaxFromInteger(
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
default:
break;
}
Expand All @@ -252,8 +252,8 @@ static Status FromInt64Statistics(const Int64Statistics& statistics,
case LogicalType::Type::NONE:
return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
case LogicalType::Type::DECIMAL:
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
logical_type, min, max);
return ExtractDecimalMinMaxFromInteger(
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
default:
break;
}
Expand Down Expand Up @@ -384,13 +384,13 @@ void AttachStatistics(::arrow::ArrayData* data,
}
if (statistics) {
if (statistics->HasDistinctCount()) {
array_statistics->distinct_count = statistics->distinct_count();
array_statistics->distinct_count = statistics->DistinctCount().value();
}
if (statistics->HasMinMax()) {
const auto* typed_statistics =
checked_cast<const ::parquet::TypedStatistics<ParquetType>*>(statistics.get());
const ArrowCType min = typed_statistics->min();
const ArrowCType max = typed_statistics->max();
const ArrowCType min = typed_statistics->Min().value();
const ArrowCType max = typed_statistics->Max().value();
if constexpr (std::is_same_v<ArrowCType, bool>) {
array_statistics->min = static_cast<bool>(min);
array_statistics->max = static_cast<bool>(max);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,

// page_stats.null_count is not set when page_statistics_ is nullptr. It is only used
// here for a safety check.
DCHECK(!page_stats.has_null_count || page_stats.null_count == null_count);
DCHECK(!page_stats.HasNullCount() || page_stats.null_count == null_count);

// Write the page to OutputStream eagerly if there is no dictionary or
// if dictionary encoding has fallen back to PLAIN
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
auto metadata_accessor = ColumnChunkMetaData::Make(
metadata_->contents(), this->descr_, default_reader_properties(), &app_version);
auto encoded_stats = metadata_accessor->statistics()->Encode();
return {encoded_stats.has_min, encoded_stats.has_max};
return {encoded_stats.HasMin(), encoded_stats.HasMax()};
}

std::vector<Encoding::type> metadata_encodings() {
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/parquet/file_deserialize_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats =
template <typename H>
static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) {
if (expected.statistics.__isset.max) {
ASSERT_EQ(expected.statistics.max, actual.max());
ASSERT_EQ(expected.statistics.max, actual.Max());
}
if (expected.statistics.__isset.min) {
ASSERT_EQ(expected.statistics.min, actual.min());
ASSERT_EQ(expected.statistics.min, actual.Min());
}
if (expected.statistics.__isset.null_count) {
ASSERT_EQ(expected.statistics.null_count, actual.null_count);
Expand Down Expand Up @@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) {
CheckDataPageHeader(this->data_page_headers_[i], current_page.get()));
auto data_page = static_cast<const DataPage*>(current_page.get());
const EncodedStatistics encoded_statistics = data_page->statistics();
ASSERT_EQ(read_stats[i].max(), encoded_statistics.max());
ASSERT_EQ(read_stats[i].min(), encoded_statistics.min());
ASSERT_EQ(read_stats[i].Max(), encoded_statistics.Max());
ASSERT_EQ(read_stats[i].Min(), encoded_statistics.Min());
ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count);
ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count);
ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values);
Expand Down
46 changes: 31 additions & 15 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,40 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
metadata.statistics.__isset.is_max_value_exact
? std::optional<bool>(metadata.statistics.is_max_value_exact)
: std::nullopt;
std::optional<int64_t> null_count =
metadata.statistics.__isset.null_count
? std::optional<int64_t>(metadata.statistics.null_count)
: std::nullopt;
std::optional<int64_t> distinct_count =
metadata.statistics.__isset.distinct_count
? std::optional<int64_t>(metadata.statistics.distinct_count)
: std::nullopt;
std::optional<std::string_view> min_val =
metadata.statistics.__isset.min
? std::optional<std::string_view>(metadata.statistics.min)
: std::nullopt;
std::optional<std::string_view> max_val =
metadata.statistics.__isset.max
? std::optional<std::string_view>(metadata.statistics.max)
: std::nullopt;
// If ColumnOrder is defined, return max_value and min_value
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
return MakeStatistics<DType>(
descr, metadata.statistics.min_value, metadata.statistics.max_value,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool);
std::optional<std::string_view> min_value =
metadata.statistics.__isset.min_value
? std::optional<std::string_view>(metadata.statistics.min_value)
: std::nullopt;
std::optional<std::string_view> max_value =
metadata.statistics.__isset.max_value
? std::optional<std::string_view>(metadata.statistics.max_value)
: std::nullopt;
return MakeStatistics<DType>(descr, min_value, max_value,
metadata.num_values - null_count.value_or(0), null_count,
distinct_count, min_exact, max_exact, pool);
}
// Default behavior
return MakeStatistics<DType>(
descr, metadata.statistics.min, metadata.statistics.max,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
min_exact, max_exact, pool);
return MakeStatistics<DType>(descr, min_val, max_val,
metadata.num_values - null_count.value_or(0), null_count,
distinct_count, min_exact, max_exact, pool);
}

namespace {
Expand Down Expand Up @@ -1610,7 +1626,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
(application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
// Only SIGNED are valid unless max and min are the same
// (in which case the sort order does not matter)
bool max_equals_min = statistics.has_min && statistics.has_max
bool max_equals_min = statistics.HasMin() && statistics.HasMax()
? statistics.min() == statistics.max()
: false;
if (SortOrder::SIGNED != sort_order && !max_equals_min) {
Expand Down
48 changes: 24 additions & 24 deletions cpp/src/parquet/metadata_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,18 @@ TEST(Metadata, TestBuildAccess) {
auto rg1_column2 = rg1_accessor->ColumnChunk(1);
ASSERT_EQ(true, rg1_column1->is_stats_set());
ASSERT_EQ(true, rg1_column2->is_stats_set());
ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min());
ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max());
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
ASSERT_EQ(0, rg1_column1->statistics()->null_count());
ASSERT_EQ(0, rg1_column2->statistics()->null_count());
ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count());
ASSERT_EQ(stats_float.Min(), rg1_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.Max(), rg1_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.Min(), rg1_column2->encoded_statistics()->Min());
ASSERT_EQ(stats_float.Max(), rg1_column2->encoded_statistics()->Max());
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
ASSERT_EQ(0, rg1_column1->statistics()->NullCount());
ASSERT_EQ(0, rg1_column2->statistics()->NullCount());
ASSERT_EQ(nrows, rg1_column1->statistics()->DistinctCount());
ASSERT_EQ(nrows, rg1_column2->statistics()->DistinctCount());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
ASSERT_EQ(nrows / 2, rg1_column1->num_values());
Expand Down Expand Up @@ -205,18 +205,18 @@ TEST(Metadata, TestBuildAccess) {
auto rg2_column2 = rg2_accessor->ColumnChunk(1);
ASSERT_EQ(true, rg2_column1->is_stats_set());
ASSERT_EQ(true, rg2_column2->is_stats_set());
ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min());
ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max());
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
ASSERT_EQ(0, rg2_column1->statistics()->null_count());
ASSERT_EQ(0, rg2_column2->statistics()->null_count());
ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());
ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count());
ASSERT_EQ(stats_float.Min(), rg2_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.Max(), rg2_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.Min(), rg2_column2->encoded_statistics()->Min());
ASSERT_EQ(stats_float.Max(), rg2_column2->encoded_statistics()->Max());
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
ASSERT_EQ(0, rg2_column1->statistics()->NullCount());
ASSERT_EQ(0, rg2_column2->statistics()->NullCount());
ASSERT_EQ(nrows, rg2_column1->statistics()->DistinctCount());
ASSERT_EQ(nrows, rg2_column2->statistics()->DistinctCount());
ASSERT_EQ(nrows / 2, rg2_column1->num_values());
ASSERT_EQ(nrows / 2, rg2_column2->num_values());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/page_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder {
column_index_.null_pages.emplace_back(true);
column_index_.min_values.emplace_back("");
column_index_.max_values.emplace_back("");
} else if (stats.has_min && stats.has_max) {
} else if (stats.HasMin() && stats.HasMax()) {
const size_t page_ordinal = column_index_.null_pages.size();
non_null_page_indices_.emplace_back(page_ordinal);
column_index_.min_values.emplace_back(stats.min());
column_index_.max_values.emplace_back(stats.max());
column_index_.min_values.emplace_back(stats.Min().value());
column_index_.max_values.emplace_back(stats.Max().value());
column_index_.null_pages.emplace_back(false);
} else {
/// This is a non-null page but it lacks meaningful min/max values.
Expand All @@ -524,8 +524,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder {
return;
}

if (column_index_.__isset.null_counts && stats.has_null_count) {
column_index_.null_counts.emplace_back(stats.null_count);
if (column_index_.__isset.null_counts && stats.HasNullCount()) {
column_index_.null_counts.emplace_back(stats.null_count.value());
} else {
column_index_.__isset.null_counts = false;
column_index_.null_counts.clear();
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/parquet/page_index_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -546,8 +546,8 @@ void TestWriteTypedColumnIndex(schema::NodePtr node,

for (size_t i = 0; i < num_pages; ++i) {
ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]);
ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]);
ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]);
ASSERT_EQ(page_stats[i].Min().value_or(""), column_index->encoded_min_values()[i]);
ASSERT_EQ(page_stats[i].Max().value_or(""), column_index->encoded_max_values()[i]);
if (has_null_counts) {
ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]);
}
Expand Down Expand Up @@ -813,11 +813,11 @@ class PageIndexBuilderTest : public ::testing::Test {
ASSERT_NE(nullptr, column_index);
ASSERT_EQ(size_t{1}, column_index->null_pages().size());
ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]);
ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]);
ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]);
ASSERT_EQ(stats.has_null_count, column_index->has_null_counts());
if (stats.has_null_count) {
ASSERT_EQ(stats.null_count, column_index->null_counts()[0]);
ASSERT_EQ(stats.Min(), column_index->encoded_min_values()[0]);
ASSERT_EQ(stats.Max(), column_index->encoded_max_values()[0]);
ASSERT_EQ(stats.HasNullCount(), column_index->has_null_counts());
if (stats.HasNullCount()) {
ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]);
}
}

Expand Down
Loading
Loading