diff --git a/be/src/vec/functions/function_jsonb_transform.cpp b/be/src/vec/functions/function_jsonb_transform.cpp
index d57ad314bc9839..7d0eaaa59ed972 100644
--- a/be/src/vec/functions/function_jsonb_transform.cpp
+++ b/be/src/vec/functions/function_jsonb_transform.cpp
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include
 #include
 
 #include "runtime/primitive_type.h"
@@ -62,6 +63,76 @@ void sort_json_object_keys(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_va
     }
 }
 
+// True when a key of `len` bytes survives the narrowing cast applied at the
+// writeKey call below, i.e. it would not be silently truncated.
+// NOTE(review): assumes the writer takes the key length as uint8_t (the cast
+// target was not visible in the reviewed text) — confirm in jsonb_writer.h.
+static bool fits_jsonb_key_len(size_t len) {
+    return static_cast<size_t>(static_cast<uint8_t>(len)) == len;
+}
+
+// True when every member of `obj` can still be keyed as "<parent>.<member>"
+// for a parent path that is `parent_len` bytes long.
+static bool children_fit_under(const ObjectVal* obj, size_t parent_len) {
+    for (auto it = obj->begin(); it != obj->end(); ++it) {
+        // +1 is the '.' separator inserted between parent path and member key.
+        if (!fits_jsonb_key_len(parent_len + 1 + it->klen())) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Walk a JSONB object recursively and emit flat "<dot.path>": value entries
+// directly into `jsonb_writer`. A member whose value is a non-empty object is
+// descended into; every other shape (scalars, arrays, null literals, empty
+// objects) is emitted as an opaque leaf at its dot-joined path. An object is
+// also kept as an opaque leaf when any of its members would get a path too
+// long for the key-length cast: the subtree then stays nested but intact,
+// instead of being written under truncated (renamed, possibly colliding) keys.
+// `prefix` is the path of the enclosing object ("" at the top level); it is
+// extended for each member and restored before the next one, so one string
+// buffer serves the whole walk.
+void flatten_json_object_into(JsonbWriter& jsonb_writer, const ObjectVal* obj,
+                              std::string& prefix) {
+    for (auto it = obj->begin(); it != obj->end(); ++it) {
+        const auto* val = it->value();
+        const size_t saved = prefix.size();
+        if (!prefix.empty()) {
+            prefix.push_back('.');
+        }
+        prefix.append(it->getKeyStr(), it->klen());
+
+        const ObjectVal* child = val->isObject() ? val->unpack<ObjectVal>() : nullptr;
+        if (child != nullptr && child->numElem() > 0 &&
+            children_fit_under(child, prefix.size())) {
+            flatten_json_object_into(jsonb_writer, child, prefix);
+        } else {
+            jsonb_writer.writeKey(prefix.data(), static_cast<uint8_t>(prefix.size()));
+            jsonb_writer.writeValue(val);
+        }
+        prefix.resize(saved);
+    }
+}
+
+// json_object_flatten: turn a nested JSONB object into a single-level JSONB
+// object whose keys are the dot-joined paths to each leaf (NiFi FlattenJson
+// "keep-arrays" semantics — arrays / scalars / nulls / empty objects stay as
+// opaque leaf values; only objects are walked).
+//   {"a":{"b":2}}   -> {"a.b":2}
+//   {"a":[{"b":1}]} -> {"a":[{"b":1}]}
+// Top-level non-object values pass through unchanged.
+void flatten_json_object(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_value) {
+    if (!jsonb_value->isObject()) {
+        jsonb_writer.writeValue(jsonb_value);
+        return;
+    }
+    jsonb_writer.writeStartObject();
+    std::string prefix;
+    flatten_json_object_into(jsonb_writer, jsonb_value->unpack<ObjectVal>(), prefix);
+    jsonb_writer.writeEndObject();
+}
+
 // Convert all numeric types in JSON to double type
 void normalize_json_numbers_to_double(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_value) {
     if (jsonb_value->isObject()) {
@@ -167,12 +238,21 @@ struct NormalizeJsonNumbersToDouble {
     }
 };
 
+struct JsonObjectFlatten {
+    static constexpr auto name = "json_object_flatten";
+    static void transform(JsonbWriter& writer, const JsonbValue* value) {
+        flatten_json_object(writer, value);
+    }
+};
+
 using FunctionSortJsonObjectKeys = FunctionJsonbTransform;
 using FunctionNormalizeJsonNumbersToDouble = FunctionJsonbTransform;
+using FunctionJsonObjectFlatten = FunctionJsonbTransform<JsonObjectFlatten>;
 
 void register_function_json_transform(SimpleFunctionFactory& factory) {
     factory.register_function();
     factory.register_function();
+    factory.register_function<FunctionJsonObjectFlatten>();
 
     factory.register_alias(FunctionSortJsonObjectKeys::name, "sort_jsonb_object_keys");
     factory.register_alias(FunctionNormalizeJsonNumbersToDouble::name,
diff --git a/be/test/vec/function/function_json_object_flatten_test.cpp b/be/test/vec/function/function_json_object_flatten_test.cpp
new file mode 100644
index 00000000000000..927e0917ae6e8f
--- /dev/null
+++ b/be/test/vec/function/function_json_object_flatten_test.cpp
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+
+#include
+#include
+#include
+
+#include "util/jsonb_document.h"
+#include "util/jsonb_parser_simd.h"
+#include "util/jsonb_utils.h"
+#include "util/jsonb_writer.h"
+#include "vec/functions/function_jsonb_transform.cpp"
+
+namespace doris {
+
+using vectorized::flatten_json_object;
+
+namespace {
+
+// Parse JSON text into JSONB bytes via the standard simdjson-backed parser.
+std::string json_to_jsonb(const std::string& json) {
+    JsonbWriter writer;
+    auto status = JsonbParser::parse(json.data(), json.size(), writer);
+    EXPECT_TRUE(status.ok()) << "parse failed: " << json << " -> " << status.to_string();
+    return std::string(writer.getOutput()->getBuffer(), writer.getOutput()->getSize());
+}
+
+// Parse `json_in`, run flatten_json_object on it and render the result back to
+// JSON text. Returns "" after recording a failure if the input is not valid JSONB.
+std::string flatten(const std::string& json_in) {
+    const std::string in_bytes = json_to_jsonb(json_in);
+    const JsonbDocument* doc = nullptr;
+    auto status = JsonbDocument::checkAndCreateDocument(in_bytes.data(), in_bytes.size(), &doc);
+    // EXPECT_* is non-fatal, so bail out here rather than dereference a null doc.
+    if (!status.ok() || doc == nullptr) {
+        ADD_FAILURE() << "invalid JSONB for input: " << json_in << " -> " << status.to_string();
+        return {};
+    }
+    JsonbWriter writer;
+    flatten_json_object(writer, doc->getValue());
+    return JsonbToJson::jsonb_to_json_string(writer.getOutput()->getBuffer(),
+                                             writer.getOutput()->getSize());
+}
+
+void check(const std::string& input, const std::string& expected) {
+    EXPECT_EQ(flatten(input), expected) << "input: " << input;
+}
+
+} // namespace
+
+TEST(function_json_object_flatten_test, two_level) {
+    check(R"({"a":{"b":2}})", R"({"a.b":2})");
+}
+
+TEST(function_json_object_flatten_test, three_level) {
+    check(R"({"a":{"b":{"c":3}}})", R"({"a.b.c":3})");
+}
+
+TEST(function_json_object_flatten_test, already_flat) {
+    check(R"({"a":1,"b":"hi"})", R"({"a":1,"b":"hi"})");
+}
+
+TEST(function_json_object_flatten_test, empty_top_level_object) {
+    check("{}", "{}");
+}
+
+TEST(function_json_object_flatten_test, empty_nested_object_is_leaf) {
+    check(R"({"a":{}})", R"({"a":{}})");
+}
+
+TEST(function_json_object_flatten_test, deep_nesting) {
+    check(R"({"a":{"b":{"c":{"d":{"e":{"f":{"g":{"h":{"i":{"j":{"k":1}}}}}}}}}}})",
+          R"({"a.b.c.d.e.f.g.h.i.j.k":1})");
+}
+
+TEST(function_json_object_flatten_test, prefix_buffer_is_reset_across_siblings) {
+    check(R"({"a":{"x":1},"b":{"y":2}})", R"({"a.x":1,"b.y":2})");
+}
+
+TEST(function_json_object_flatten_test, array_of_scalars_under_nested_path_stays_opaque) {
+    check(R"({"a":{"b":[1,2,3]}})", R"({"a.b":[1,2,3]})");
+}
+
+TEST(function_json_object_flatten_test, array_of_objects_under_nested_path_stays_opaque) {
+    // keep-arrays semantics: the array is a leaf value under "a.b"; the
+    // inner object's key "d" must NOT show up at the flat level.
+    check(R"({"a":{"b":[{"d":1},{"d":2}]}})", R"({"a.b":[{"d":1},{"d":2}]})");
+}
+
+TEST(function_json_object_flatten_test, mixed_object_scalar_and_array_leaves) {
+    check(R"({"x":{"s":1,"a":[1,2],"o":{"k":"v"}}})", R"({"x.s":1,"x.a":[1,2],"x.o.k":"v"})");
+}
+
+TEST(function_json_object_flatten_test, null_leaf_at_top) {
+    check(R"({"a":null})", R"({"a":null})");
+}
+
+TEST(function_json_object_flatten_test, null_leaf_nested) {
+    check(R"({"a":{"b":null}})", R"({"a.b":null})");
+}
+
+TEST(function_json_object_flatten_test, top_level_scalar_pass_through) {
+    check("42", "42");
+    check("\"hello\"", "\"hello\"");
+    check("null", "null");
+    check("true", "true");
+}
+
+TEST(function_json_object_flatten_test, top_level_array_pass_through) {
+    check(R"([1,2,3])", R"([1,2,3])");
+    check(R"([{"x":1}])", R"([{"x":1}])");
+}
+
+TEST(function_json_object_flatten_test, literal_dotted_key_round_trips) {
+    // A literal '.' inside a key collapses with real nesting at the flat layer
+    // — the same documented-lossy collision NiFi FlattenJson accepts.
+    check(R"({"a.b":2})", R"({"a.b":2})");
+}
+
+} // namespace doris
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index c9ba5ccc61a6a3..c9b929893e1754 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -285,6 +285,7 @@
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonKeys;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonLength;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObject;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObjectFlatten;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonQuote;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonRemove;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonReplace;
@@ -840,6 +841,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
             scalar(JsonArray.class, "json_array", "jsonb_array"),
             scalar(JsonArrayIgnoreNull.class, "json_array_ignore_null", "jsonb_array_ignore_null"),
             scalar(JsonObject.class, "json_object", "jsonb_object"),
+            scalar(JsonObjectFlatten.class, "json_object_flatten"),
             scalar(JsonQuote.class, "json_quote"),
             scalar(JsonUnQuote.class, "json_unquote"),
             scalar(JsonExtractNoQuotes.class, "json_extract_no_quotes"),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonObjectFlatten.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonObjectFlatten.java
new file mode 100644
index 00000000000000..d7ddd09ae09964
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonObjectFlatten.java
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation
(ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.JsonType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'json_object_flatten': takes one JSON argument and returns JSON.
+ * A nested object becomes a single-level object keyed by the dot-joined path to
+ * each leaf; arrays are not walked (NiFi FlattenJson "keep-arrays" semantics):
+ *     {"a":{"b":2}}   -> {"a.b":2}
+ *     {"a":[{"b":1}]} -> {"a":[{"b":1}]}
+ * A VARIANT can be converted explicitly: json_object_flatten(to_json(v)).
+ */
+public class JsonObjectFlatten extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {
+
+    public static final List SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(JsonType.INSTANCE).args(JsonType.INSTANCE));
+
+    public JsonObjectFlatten(Expression arg0) {
+        super("json_object_flatten", arg0);
+    }
+
+    private JsonObjectFlatten(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    @Override
+    public JsonObjectFlatten withChildren(List children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new JsonObjectFlatten(getFunctionParams(children));
+    }
+
+    @Override
+    public List getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public R accept(ExpressionVisitor visitor, C context) {
+        return visitor.visitJsonObjectFlatten(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 520e6193f3fc47..2edfc133152a4f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -305,6 +305,7 @@
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonKeys;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonLength;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObject;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObjectFlatten;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonQuote;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonRemove;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonReplace;
@@ -1710,6 +1711,10 @@ default R visitJsonObject(JsonObject jsonObject, C context) {
         return visitScalarFunction(jsonObject, context);
     }
 
+    default R visitJsonObjectFlatten(JsonObjectFlatten jsonObjectFlatten, C context) {
+        return visitScalarFunction(jsonObjectFlatten, context);
+    }
+
     default R visitJsonExtractNoQuotes(JsonExtractNoQuotes jsonExtract, C context) {
         return visitScalarFunction(jsonExtract, context);
     }
diff --git a/regression-test/data/variant_p0/test_json_object_flatten.out b/regression-test/data/variant_p0/test_json_object_flatten.out
new file mode 100644
index 00000000000000..39673a715ac95d
--- /dev/null
+++ b/regression-test/data/variant_p0/test_json_object_flatten.out
@@ -0,0 +1,39 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql_jsonb --
+1 {"a.b":2}
+2 {"a.b.c":3}
+3 {"a":1,"b":"hi"}
+4 {}
+5 {"a":{}}
+6 {"a":null}
+7 {"a.b":null}
+8 {"a.b.c.d.e.f.g.h.i.j.k.l":1}
+9 \N
+10 42
+11 "hello"
+12 {"a":[1,2,3]}
+13 {"a":[{"b":1},{"b":2}]}
+14 {"a.b":[1,2,3]}
+15 {"a.b":[{"c":1},{"c":2}]}
+16 [1,2,{"x":3}]
+17 {"x.s":1,"x.a":[1,2],"x.o.k":"v"}
+
+-- !sql_variant --
+1 {"a.b":2}
+2 {"a.b.c":3}
+3 {"a":1,"b":"hi"}
+4 \N
+5 \N
+6 \N
+7 \N
+8 {"a.b.c.d.e.f.g.h.i.j.k.l":1}
+9 \N
+10 42
+11 "hello"
+12 {"a":[1,2,3]}
+13 {"a":[{"b":1},{"b":2}]}
+14 {"a.b":[1,2,3]}
+15 {"a.b":[{"c":1},{"c":2}]}
+16 [1,2,{"x":3}]
+17 {"x.a":[1,2],"x.o.k":"v","x.s":1}
+
diff --git a/regression-test/suites/variant_p0/test_json_object_flatten.groovy b/regression-test/suites/variant_p0/test_json_object_flatten.groovy
new file mode 100644
index 00000000000000..02784d0dff12e9
--- /dev/null
+++ b/regression-test/suites/variant_p0/test_json_object_flatten.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("regression_test_json_object_flatten", "p0") {
+    // 1) JSONB column: json_object_flatten is applied to the stored JSONB value directly.
+    sql """DROP TABLE IF EXISTS json_object_flatten_jsonb_t"""
+    sql """
+        CREATE TABLE json_object_flatten_jsonb_t (
+            k bigint,
+            j jsonb
+        )
+        DUPLICATE KEY(k)
+        DISTRIBUTED BY HASH(k) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = "true");
+    """
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (1, '{"a": {"b": 2}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (2, '{"a": {"b": {"c": 3}}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (3, '{"a": 1, "b": "hi"}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (4, '{}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (5, '{"a": {}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (6, '{"a": null}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (7, '{"a": {"b": null}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (8, '{"a":{"b":{"c":{"d":{"e":{"f":{"g":{"h":{"i":{"j":{"k":{"l":1}}}}}}}}}}}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (9, NULL)"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (10, '42')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (11, '"hello"')"""
+    // Array-leaf cases (keep-arrays semantics): an array stays an opaque value at
+    // its dot-path, so keys of objects inside it must NOT appear in the flat output.
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (12, '{"a": [1, 2, 3]}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (13, '{"a": [{"b": 1}, {"b": 2}]}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (14, '{"a": {"b": [1, 2, 3]}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (15, '{"a": {"b": [{"c": 1}, {"c": 2}]}}')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (16, '[1, 2, {"x": 3}]')"""
+    sql """INSERT INTO json_object_flatten_jsonb_t VALUES (17, '{"x": {"s": 1, "a": [1, 2], "o": {"k": "v"}}}')"""
+
+    qt_sql_jsonb """SELECT k, json_object_flatten(j) FROM json_object_flatten_jsonb_t ORDER BY k"""
+
+    // 2) VARIANT column passed directly; Nereids is expected to insert the Variant -> JSONB cast.
+    // Recorded output differs from (1): rows 4-7 come back NULL and row 17's keys are reordered.
+    sql """DROP TABLE IF EXISTS json_object_flatten_variant_t"""
+    sql """
+        CREATE TABLE json_object_flatten_variant_t (
+            k bigint,
+            v variant
+        )
+        DUPLICATE KEY(k)
+        DISTRIBUTED BY HASH(k) BUCKETS 1
+        PROPERTIES ("replication_num" = "1", "disable_auto_compaction" = "true");
+    """
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (1, '{"a": {"b": 2}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (2, '{"a": {"b": {"c": 3}}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (3, '{"a": 1, "b": "hi"}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (4, '{}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (5, '{"a": {}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (6, '{"a": null}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (7, '{"a": {"b": null}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (8, '{"a":{"b":{"c":{"d":{"e":{"f":{"g":{"h":{"i":{"j":{"k":{"l":1}}}}}}}}}}}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (9, NULL)"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (10, '42')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (11, '"hello"')"""
+    // Same array-leaf cases, exercised through the variant -> jsonb cast path.
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (12, '{"a": [1, 2, 3]}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (13, '{"a": [{"b": 1}, {"b": 2}]}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (14, '{"a": {"b": [1, 2, 3]}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (15, '{"a": {"b": [{"c": 1}, {"c": 2}]}}')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (16, '[1, 2, {"x": 3}]')"""
+    sql """INSERT INTO json_object_flatten_variant_t VALUES (17, '{"x": {"s": 1, "a": [1, 2], "o": {"k": "v"}}}')"""
+
+    qt_sql_variant """SELECT k, json_object_flatten(v) FROM json_object_flatten_variant_t ORDER BY k"""
+}