diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 8b9158d8f67a7d..b1e7949d8c17b3 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1164,6 +1164,7 @@ DEFINE_mBool(variant_use_cloud_schema_dict_cache, "true"); DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048"); DEFINE_mInt32(variant_max_json_key_length, "255"); DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); +DEFINE_mBool(variant_enable_duplicate_json_path_check, "false"); DEFINE_mBool(enable_vertical_compact_variant_subcolumns, "true"); DEFINE_mBool(enable_variant_doc_sparse_write_subcolumns, "true"); // Maximum depth of nested arrays to track with NestedGroup diff --git a/be/src/common/config.h b/be/src/common/config.h index 930e9f67fb6ffe..2814f0539ff993 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1425,6 +1425,8 @@ DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column); DECLARE_mInt32(variant_max_json_key_length); // Treat invalid json format str as string, instead of throwing exception if false DECLARE_mBool(variant_throw_exeception_on_invalid_json); +// Enable duplicate path check when parsing json into variant subcolumns/jsonb. +DECLARE_mBool(variant_enable_duplicate_json_path_check); // Enable vertical compact subcolumns of variant column DECLARE_mBool(enable_vertical_compact_variant_subcolumns); DECLARE_mBool(enable_variant_doc_sparse_write_subcolumns); diff --git a/be/src/core/data_type_serde/data_type_variant_serde.cpp b/be/src/core/data_type_serde/data_type_variant_serde.cpp index 31538f9fcc5205..93ec3edfdfb359 100644 --- a/be/src/core/data_type_serde/data_type_variant_serde.cpp +++ b/be/src/core/data_type_serde/data_type_variant_serde.cpp @@ -21,6 +21,7 @@ #include #include "common/cast_set.h" +#include "common/config.h" #include "common/exception.h" #include "common/status.h" #include "core/assert_cast.h" @@ -107,10 +108,11 @@ Status DataTypeVariantSerDe::serialize_one_cell_to_json(const IColumn& column, i Status DataTypeVariantSerDe::deserialize_one_cell_from_json(IColumn& column, Slice& slice, const FormatOptions& options) const { - ParseConfig config; + ParseConfig parse_config; + parse_config.check_duplicate_json_path = config::variant_enable_duplicate_json_path_check; StringRef json_ref(slice.data, slice.size); RETURN_IF_CATCH_EXCEPTION( - variant_util::parse_json_to_variant(column, json_ref, nullptr, config)); + variant_util::parse_json_to_variant(column, json_ref, nullptr, parse_config)); return Status::OK(); } diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index ac97563f94d25d..75cfdf3b1fec77 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -2016,16 +2016,26 @@ void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length, } }; + auto is_plain_path = [](const PathInData& path) { + for (const auto& part : path.get_parts()) { + if (part.is_nested || part.anonymous_array_level != 0) { + return false; + } + } + return true; + }; + auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint, const FieldInfo& field_info) -> ColumnVariant::Subcolumn* { - if (column_variant.get_subcolumn(path, index_hint) == nullptr) { + auto* subcolumn = column_variant.get_subcolumn(path, index_hint); + if (subcolumn == nullptr) { if (path.has_nested_part()) { column_variant.add_nested_subcolumn(path, field_info, old_num_rows); } else { column_variant.add_sub_column(path, old_num_rows); } + subcolumn = column_variant.get_subcolumn(path, index_hint); } - auto* subcolumn = column_variant.get_subcolumn(path, index_hint); if (!subcolumn) { throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}", path.get_path()); @@ -2033,6 +2043,13 @@ void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length, return subcolumn; }; + auto normalize_plain_path = [&](const PathInData& path) { + if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) { + return path; + } + return PathInData(path.get_path()); + }; + auto insert_into_subcolumn = [&](size_t i, bool check_size_mismatch) -> ColumnVariant::Subcolumn* { FieldInfo field_info; @@ -2040,12 +2057,13 @@ void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length, if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) { return nullptr; } - auto* subcolumn = get_or_create_subcolumn(paths[i], i, field_info); + auto path = normalize_plain_path(paths[i]); + auto* subcolumn = get_or_create_subcolumn(path, i, field_info); flush_defaults(subcolumn); if (check_size_mismatch && subcolumn->size() != old_num_rows) { throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "subcolumn {} size missmatched, may contains duplicated entry", - paths[i].get_path()); + path.get_path()); } subcolumn->insert(std::move(values[i]), std::move(field_info)); return subcolumn; @@ -2309,6 +2327,7 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group. configs[i].deprecated_enable_flatten_nested = tablet_schema.deprecated_variant_flatten_nested(); + configs[i].check_duplicate_json_path = config::variant_enable_duplicate_json_path_check; const auto& column = tablet_schema.column(variant_schema_pos[i]); if (!column.is_variant_type()) { return Status::InternalError("column is not variant type, column name: {}", diff --git a/be/src/util/json/json_parser.cpp b/be/src/util/json/json_parser.cpp index 8814258034604b..f6336783db0912 100644 --- a/be/src/util/json/json_parser.cpp +++ b/be/src/util/json/json_parser.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include "common/cast_set.h" // IWYU pragma: keep @@ -46,6 +47,7 @@ std::optional JSONDataParser::parse(const char* begin, // deprecated_enable_flatten_nested controls nested path traversal // NestedGroup expansion is now handled at storage layer context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested; + context.check_duplicate_json_path = config.check_duplicate_json_path; context.is_top_array = document.isArray(); traverse(document, context); ParseResult result; @@ -72,25 +74,39 @@ void JSONDataParser::traverse(const Element& element, ParseContext& // Parse nested arrays to JsonbField JsonbWriter writer; traverseArrayAsJsonb(element.getArray(), writer); - ctx.paths.push_back(ctx.builder.get_parts()); - ctx.values.push_back(Field::create_field( - JsonbField(writer.getOutput()->getBuffer(), writer.getOutput()->getSize()))); + appendValueIfNotDuplicate( + ctx, ctx.builder.get_parts(), + Field::create_field(JsonbField(writer.getOutput()->getBuffer(), + writer.getOutput()->getSize()))); } else { traverseArray(element.getArray(), ctx); } // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array ctx.has_nested_in_flatten = false; } else { - ctx.paths.push_back(ctx.builder.get_parts()); - ctx.values.push_back(getValueAsField(element)); + appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), getValueAsField(element)); } } + +template +void JSONDataParser::appendValueIfNotDuplicate(ParseContext& ctx, + const PathInData::Parts& path, + Field&& value) { + if (ctx.check_duplicate_json_path) { + PathInData path_in_data(path); + if (!ctx.visited_path_names.emplace(path_in_data.get_path()).second) { + return; + } + } + ctx.paths.push_back(path); + ctx.values.push_back(std::move(value)); +} + template void JSONDataParser::traverseObject(const JSONObject& object, ParseContext& ctx) { ctx.paths.reserve(ctx.paths.size() + object.size()); ctx.values.reserve(ctx.values.size() + object.size()); - for (auto it = object.begin(); it != object.end(); ++it) { - const auto& [key, value] = *it; + auto check_key_length = [](const auto& key) { const size_t max_key_length = cast_set(config::variant_max_json_key_length); if (key.size() > max_key_length) { throw doris::Exception( @@ -98,9 +114,17 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC fmt::format("Key length exceeds maximum allowed size of {} bytes.", max_key_length)); } + }; + auto traverse_object_member = [&](const auto& key, const auto& value) { + check_key_length(key); ctx.builder.append(key, false); traverse(value, ctx); ctx.builder.pop_back(); + }; + + for (auto it = object.begin(); it != object.end(); ++it) { + const auto& [key, value] = *it; + traverse_object_member(key, value); } } @@ -176,6 +200,7 @@ void JSONDataParser::traverseArray(const JSONArray& array, ParseCont ParseArrayContext array_ctx; array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; array_ctx.is_top_array = ctx.is_top_array; + array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path; array_ctx.total_size = array.size(); for (auto it = array.begin(); it != array.end(); ++it) { traverseArrayElement(*it, array_ctx); @@ -183,16 +208,17 @@ void JSONDataParser::traverseArray(const JSONArray& array, ParseCont } auto&& arrays_by_path = array_ctx.arrays_by_path; if (arrays_by_path.empty()) { - ctx.paths.push_back(ctx.builder.get_parts()); - ctx.values.push_back(Field::create_field(Array())); + appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), + Field::create_field(Array())); } else { ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size()); ctx.values.reserve(ctx.values.size() + arrays_by_path.size()); for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) { auto&& [path, path_array] = it->second; /// Merge prefix path and path of array element. - ctx.paths.push_back(ctx.builder.append(path, true).get_parts()); - ctx.values.push_back(Field::create_field(std::move(path_array))); + ctx.builder.append(path, true); + appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), + Field::create_field(std::move(path_array))); ctx.builder.pop_back(path.size()); } } @@ -204,10 +230,12 @@ void JSONDataParser::traverseArrayElement(const Element& element, ParseContext element_ctx; element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; + element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path; traverse(element, element_ctx); - auto& [_, paths, values, deprecated_flatten_nested, __, is_top_array] = element_ctx; + auto& paths = element_ctx.paths; + auto& values = element_ctx.values; - if (element_ctx.has_nested_in_flatten && is_top_array) { + if (element_ctx.has_nested_in_flatten && element_ctx.is_top_array) { checkAmbiguousStructure(ctx, paths); } diff --git a/be/src/util/json/json_parser.h b/be/src/util/json/json_parser.h index 4b49259588b623..c4a165e899546f 100644 --- a/be/src/util/json/json_parser.h +++ b/be/src/util/json/json_parser.h @@ -101,6 +101,7 @@ void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { struct ParseConfig { bool deprecated_enable_flatten_nested = false; + bool check_duplicate_json_path = false; enum class ParseTo { OnlySubcolumns = 0, OnlyDocValueColumn = 1, @@ -127,7 +128,9 @@ class JSONDataParser { PathInDataBuilder builder; std::vector paths; std::vector values; + phmap::flat_hash_set visited_path_names; bool deprecated_enable_flatten_nested = false; + bool check_duplicate_json_path = false; bool has_nested_in_flatten = false; bool is_top_array = false; }; @@ -141,10 +144,12 @@ class JSONDataParser { KeyToSizes nested_sizes_by_key; bool has_nested_in_flatten = false; bool is_top_array = false; + bool check_duplicate_json_path = false; }; void traverse(const Element& element, ParseContext& ctx); void traverseObject(const JSONObject& object, ParseContext& ctx); void traverseArray(const JSONArray& array, ParseContext& ctx); + void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& path, Field&& value); void traverseArrayElement(const Element& element, ParseArrayContext& ctx); void checkAmbiguousStructure(const ParseArrayContext& ctx, const std::vector& paths); diff --git a/be/test/storage/segment/variant_util_test.cpp b/be/test/storage/segment/variant_util_test.cpp index 597623c1018bc9..902bf9c843b115 100644 --- a/be/test/storage/segment/variant_util_test.cpp +++ b/be/test/storage/segment/variant_util_test.cpp @@ -23,11 +23,13 @@ #include #include +#include "common/config.h" #include "core/block/block.h" #include "core/column/column_string.h" #include "core/column/column_variant.h" #include "core/data_type/data_type_variant.h" #include "core/field.h" +#include "core/value/jsonb_value.h" #include "exec/common/variant_util.h" #include "gtest/gtest.h" #include "storage/tablet/tablet_schema.h" @@ -42,6 +44,20 @@ static ColumnString::MutablePtr _make_json_column(const std::vector jsons = { R"({"a":1,"b":"x"})", // @@ -225,6 +241,196 @@ TEST(VariantUtilTest, ParseOnlyDocValueColumn_SerializesMixedTypes) { EXPECT_EQ(f.field.get(), "y"); } +TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstValue) { + ScopedDuplicateJsonPathCheck check_guard(true); + const std::vector jsons = { + R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})", R"({"a.b":1,"a":{"b":2}})", + R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})", + }; + + auto variant = ColumnVariant::create(0, false); + auto json_col = _make_json_column(jsons); + + ParseConfig cfg; + cfg.deprecated_enable_flatten_nested = false; + cfg.check_duplicate_json_path = true; + cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns; + parse_json_to_variant(*variant, *json_col, cfg); + ASSERT_TRUE(variant->sanitize().ok()); + + const auto* sub_a = variant->get_subcolumn(PathInData("a")); + const auto* sub_ab = variant->get_subcolumn(PathInData("a.b")); + const auto* sub_ac = variant->get_subcolumn(PathInData("a.c")); + ASSERT_NE(sub_a, nullptr); + ASSERT_NE(sub_ab, nullptr); + ASSERT_NE(sub_ac, nullptr); + + FieldWithDataType f; + sub_a->get(0, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 42); + sub_a->get(1, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 123); + + sub_ab->get(0, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 42); + sub_ab->get(1, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL); + sub_ab->get(2, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 1); + sub_ab->get(3, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 3); + sub_ab->get(4, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 5); + + sub_ac->get(4, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 6); +} + +TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstArrayOrScalarValue) { + ScopedDuplicateJsonPathCheck check_guard(true); + const std::vector jsons = { + R"({"a":[1],"a":2})", + R"({"a":2,"a":[1]})", + }; + + auto variant = ColumnVariant::create(0, false); + auto json_col = _make_json_column(jsons); + + ParseConfig cfg; + cfg.deprecated_enable_flatten_nested = false; + cfg.check_duplicate_json_path = true; + cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns; + parse_json_to_variant(*variant, *json_col, cfg); + ASSERT_TRUE(variant->sanitize().ok()); + + const auto* sub_a = variant->get_subcolumn(PathInData("a")); + ASSERT_NE(sub_a, nullptr); + + FieldWithDataType f; + sub_a->get(0, f); + ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB); + const auto& first = f.field.get(); + EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(), first.get_size()), "[1]"); + + sub_a->get(1, f); + ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB); + const auto& second = f.field.get(); + EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(), second.get_size()), "2"); +} + +TEST(VariantUtilTest, ParseDuplicateJsonPathsInDocModeKeepsFirstValue) { + ScopedDuplicateJsonPathCheck check_guard(true); + const std::vector jsons = { + R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})", R"({"a.b":1,"a":{"b":2}})", + R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})", + }; + + auto variant = ColumnVariant::create(0, true); + auto json_col = _make_json_column(jsons); + + ParseConfig cfg; + cfg.deprecated_enable_flatten_nested = false; + cfg.check_duplicate_json_path = true; + cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn; + parse_json_to_variant(*variant, *json_col, cfg); + ASSERT_TRUE(variant->sanitize().ok()); + + auto subcolumns = materialize_docs_to_subcolumns_map(*variant); + ASSERT_TRUE(subcolumns.contains("a")); + ASSERT_TRUE(subcolumns.contains("a.b")); + ASSERT_TRUE(subcolumns.contains("a.c")); + + auto& sub_a = subcolumns.at("a"); + auto& sub_ab = subcolumns.at("a.b"); + auto& sub_ac = subcolumns.at("a.c"); + sub_a.finalize(); + sub_ab.finalize(); + sub_ac.finalize(); + + FieldWithDataType f; + sub_a.get(0, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 42); + sub_a.get(1, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 123); + + sub_ab.get(0, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 42); + sub_ab.get(1, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL); + sub_ab.get(2, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 1); + sub_ab.get(3, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 3); + sub_ab.get(4, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 5); + + sub_ac.get(4, f); + EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT); + EXPECT_EQ(f.field.get(), 6); +} + +TEST(VariantUtilTest, ParseDuplicateJsonPathsInDocModeKeepsFirstArrayOrScalarValue) { + ScopedDuplicateJsonPathCheck check_guard(true); + const std::vector jsons = { + R"({"a":[1],"a":2})", + R"({"a":2,"a":[1]})", + }; + + auto variant = ColumnVariant::create(0, true); + auto json_col = _make_json_column(jsons); + + ParseConfig cfg; + cfg.deprecated_enable_flatten_nested = false; + cfg.check_duplicate_json_path = true; + cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn; + parse_json_to_variant(*variant, *json_col, cfg); + ASSERT_TRUE(variant->sanitize().ok()); + + auto subcolumns = materialize_docs_to_subcolumns_map(*variant); + ASSERT_TRUE(subcolumns.contains("a")); + + auto& sub_a = subcolumns.at("a"); + sub_a.finalize(); + + FieldWithDataType f; + sub_a.get(0, f); + ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB); + const auto& first = f.field.get(); + EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(), first.get_size()), "[1]"); + + sub_a.get(1, f); + ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB); + const auto& second = f.field.get(); + EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(), second.get_size()), "2"); +} + +TEST(VariantUtilTest, ParseDuplicateJsonPathsCheckDisabledByDefault) { + ScopedDuplicateJsonPathCheck check_guard(false); + const std::vector jsons = { + R"({"a":123,"a":"123"})", + }; + + auto variant = ColumnVariant::create(0, false); + auto json_col = _make_json_column(jsons); + + ParseConfig cfg; + cfg.deprecated_enable_flatten_nested = false; + EXPECT_THROW(parse_json_to_variant(*variant, *json_col, cfg), Exception); +} + TEST(VariantUtilTest, ParseVariantColumns_ScalarJsonStringToSubcolumns) { TabletSchemaPB schema_pb; schema_pb.set_keys_type(KeysType::DUP_KEYS); diff --git a/regression-test/data/variant_p0/duplicate_json_path.json b/regression-test/data/variant_p0/duplicate_json_path.json new file mode 100644 index 00000000000000..e065c9b23314b6 --- /dev/null +++ b/regression-test/data/variant_p0/duplicate_json_path.json @@ -0,0 +1,7 @@ +{"k":8,"v":{"a":42,"a":{"b":42}}} +{"k":9,"v":{"a":123,"a":"123"}} +{"k":10,"v":{"a.b":8,"a":{"b":9}}} +{"k":11,"v":{"a":{"b":10},"a.b":11}} +{"k":12,"v":{"a":{"b":11},"a":{"c":12}}} +{"k":13,"v":{"a":[13],"a":14}} +{"k":14,"v":{"a":14,"a":[13]}} diff --git a/regression-test/suites/variant_p0/duplicate_json_path.groovy b/regression-test/suites/variant_p0/duplicate_json_path.groovy new file mode 100644 index 00000000000000..0c6802f461e7d9 --- /dev/null +++ b/regression-test/suites/variant_p0/duplicate_json_path.groovy @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("duplicate_json_path", "p0") { + def customBeConfig = [ + variant_enable_duplicate_json_path_check: true + ] + setBeConfigTemporary(customBeConfig) { + sql "DROP TABLE IF EXISTS duplicate_json_path" + sql """ + CREATE TABLE duplicate_json_path ( + k int, + v variant + ) + DUPLICATE KEY(k) + DISTRIBUTED BY HASH(k) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "group_commit_interval_ms" = "2000", + "disable_auto_compaction" = "true" + ); + """ + + sql """insert into duplicate_json_path values (1, '{"a":42,"a":{"b":42}}')""" + sql """insert into duplicate_json_path values (2, '{"a" : 123, "a" : "123"}')""" + sql """insert into duplicate_json_path values (3, '{"a.b":1,"a":{"b":2}}')""" + sql """insert into duplicate_json_path values (4, '{"a":{"b":3},"a.b":4}')""" + sql """insert into duplicate_json_path values (5, '{"a":{"b":5},"a":{"c":6}}')""" + sql """insert into duplicate_json_path values (6, '{"a":[1],"a":2}')""" + sql """insert into duplicate_json_path values (7, '{"a":2,"a":[1]}')""" + + streamLoad { + table "duplicate_json_path" + set 'read_json_by_line', 'true' + set 'format', 'json' + set 'group_commit', 'async_mode' + unset 'label' + file 'duplicate_json_path.json' + time 10000 + + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + assertEquals(7, json.NumberTotalRows) + assertEquals(7, json.NumberLoadedRows) + } + } + + for (int i = 0; i < 30; i++) { + def count = sql "select count(*) from duplicate_json_path" + if (count[0][0] == 14) { + break + } + sleep(1000) + } + def totalRows = sql "select count(*) from duplicate_json_path" + assertEquals(14, totalRows[0][0]) + + // When duplicate path check is enabled, duplicate Variant paths keep the first value. + def expectedResult = [ + [1, "{\"b\":42}", "42", null], + [2, "123", null, null], + [3, "{\"b\":1}", "1", null], + [4, "{\"b\":3}", "3", null], + [5, "{\"b\":5,\"c\":6}", "5", "6"], + [6, "[1]", null, null], + [7, "2", null, null], + [8, "{\"b\":42}", "42", null], + [9, "123", null, null], + [10, "{\"b\":8}", "8", null], + [11, "{\"b\":10}", "10", null], + [12, "{\"b\":11,\"c\":12}", "11", "12"], + [13, "[13]", null, null], + [14, "14", null, null] + ] + + def queryResult = { + sql """ + select k, cast(v['a'] as string), cast(v['a']['b'] as string), cast(v['a']['c'] as string) + from duplicate_json_path + order by k + """ + } + assertEquals(expectedResult, queryResult()) + + trigger_and_wait_compaction("duplicate_json_path", "full") + assertEquals(expectedResult, queryResult()) + } +}