Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions be/src/vec/functions/function_jsonb_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <string>
#include <vector>

#include "runtime/primitive_type.h"
Expand Down Expand Up @@ -62,6 +63,50 @@ void sort_json_object_keys(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_va
}
}

// Walk a JSONB object recursively and emit flat "<dot.path>": value entries
// directly into `writer`. Members whose value is a non-empty object recurse;
// every other shape (scalars, arrays, null literals, empty objects) is emitted
// as an opaque leaf at its dot-joined path. The `prefix` buffer is reused
// across the whole row — appended on descent and truncated on return — so no
// path segment is ever re-allocated outside this single growing string.
void flatten_json_object_into(JsonbWriter& jsonb_writer, const ObjectVal* obj,
std::string& prefix) {
for (auto it = obj->begin(); it != obj->end(); ++it) {
const auto* val = it->value();
const size_t saved = prefix.size();
if (!prefix.empty()) {
prefix.push_back('.');
}
prefix.append(it->getKeyStr(), it->klen());

if (val->isObject() && val->unpack<ObjectVal>()->numElem() > 0) {
flatten_json_object_into(jsonb_writer, val->unpack<ObjectVal>(), prefix);
} else {
jsonb_writer.writeKey(prefix.data(), static_cast<uint8_t>(prefix.size()));
jsonb_writer.writeValue(val);
}
prefix.resize(saved);
}
}

// json_object_flatten entry point. Rewrites a nested JSONB object as a
// single-level JSONB object keyed by the dot-joined path to each leaf (NiFi
// FlattenJson "keep-arrays" semantics: only objects are walked, while arrays,
// scalars, nulls and empty objects remain opaque leaf values).
//   {"a":{"b":2}}   -> {"a.b":2}
//   {"a":[{"b":1}]} -> {"a":[{"b":1}]}
// A top-level value that is not an object is written through unchanged.
void flatten_json_object(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_value) {
    if (jsonb_value->isObject()) {
        // One path buffer for the whole document; the recursive walk grows and
        // shrinks it in place.
        std::string path;
        jsonb_writer.writeStartObject();
        flatten_json_object_into(jsonb_writer, jsonb_value->unpack<ObjectVal>(), path);
        jsonb_writer.writeEndObject();
    } else {
        jsonb_writer.writeValue(jsonb_value);
    }
}

// Convert all numeric types in JSON to double type
void normalize_json_numbers_to_double(JsonbWriter& jsonb_writer, const JsonbValue* jsonb_value) {
if (jsonb_value->isObject()) {
Expand Down Expand Up @@ -167,12 +212,21 @@ struct NormalizeJsonNumbersToDouble {
}
};

// Transform policy for FunctionJsonbTransform: carries the SQL-visible
// function name and forwards each value to flatten_json_object.
struct JsonObjectFlatten {
    static constexpr auto name = "json_object_flatten";

    static void transform(JsonbWriter& out, const JsonbValue* in) {
        flatten_json_object(out, in);
    }
};

// Concrete function types: FunctionJsonbTransform instantiated once per
// transform policy struct.
using FunctionSortJsonObjectKeys = FunctionJsonbTransform<SortJsonObjectKeys>;
using FunctionNormalizeJsonNumbersToDouble = FunctionJsonbTransform<NormalizeJsonNumbersToDouble>;
using FunctionJsonObjectFlatten = FunctionJsonbTransform<JsonObjectFlatten>;

void register_function_json_transform(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSortJsonObjectKeys>();
factory.register_function<FunctionNormalizeJsonNumbersToDouble>();
factory.register_function<FunctionJsonObjectFlatten>();

factory.register_alias(FunctionSortJsonObjectKeys::name, "sort_jsonb_object_keys");
factory.register_alias(FunctionNormalizeJsonNumbersToDouble::name,
Expand Down
136 changes: 136 additions & 0 deletions be/test/vec/function/function_json_object_flatten_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include <string>
#include <utility>
#include <vector>

#include "util/jsonb_document.h"
#include "util/jsonb_parser_simd.h"
#include "util/jsonb_utils.h"
#include "util/jsonb_writer.h"
#include "vec/functions/function_jsonb_transform.cpp"

namespace doris {

using vectorized::flatten_json_object;

namespace {

// Encode JSON text as JSONB bytes using JsonbParser. A parse error is reported
// as a non-fatal gtest failure; whatever the writer holds is still returned.
std::string json_to_jsonb(const std::string& json) {
    JsonbWriter writer;
    auto parse_status = JsonbParser::parse(json.data(), json.size(), writer);
    EXPECT_TRUE(parse_status.ok())
            << "parse failed: " << json << " -> " << parse_status.to_string();
    auto* encoded = writer.getOutput();
    return std::string(encoded->getBuffer(), encoded->getSize());
}

// Run flatten_json_object end-to-end starting from a JSON text input and
// returning the flattened result rendered back to JSON text. The parse →
// flatten → re-render trip exercises the same code path the SQL function
// follows: ColumnString(JSONB) -> flatten_json_object -> ColumnString(JSONB).
std::string flatten(const std::string& json_in) {
const std::string in_bytes = json_to_jsonb(json_in);
const JsonbDocument* doc = nullptr;
auto status = JsonbDocument::checkAndCreateDocument(in_bytes.data(), in_bytes.size(), &doc);
EXPECT_TRUE(status.ok()) << status.to_string();
EXPECT_NE(doc, nullptr);

JsonbWriter writer;
flatten_json_object(writer, doc->getValue());
return JsonbToJson::jsonb_to_json_string(writer.getOutput()->getBuffer(),
writer.getOutput()->getSize());
}

// Flatten `input` and expect the rendered JSON text to equal `expected`; the
// input is appended to the failure message for context.
void check(const std::string& input, const std::string& expected) {
    const std::string actual = flatten(input);
    EXPECT_EQ(actual, expected) << "input: " << input;
}

} // namespace

// One level of nesting: the child key is joined to its parent with '.'.
TEST(function_json_object_flatten_test, two_level) {
check(R"({"a":{"b":2}})", R"({"a.b":2})");
}

// Two levels of nesting produce a three-segment path.
TEST(function_json_object_flatten_test, three_level) {
check(R"({"a":{"b":{"c":3}}})", R"({"a.b.c":3})");
}

// An object with only scalar members comes back unchanged.
TEST(function_json_object_flatten_test, already_flat) {
check(R"({"a":1,"b":"hi"})", R"({"a":1,"b":"hi"})");
}

// An empty top-level object stays an empty object.
TEST(function_json_object_flatten_test, empty_top_level_object) {
check("{}", "{}");
}

// An empty nested object is kept as a leaf value rather than disappearing.
TEST(function_json_object_flatten_test, empty_nested_object_is_leaf) {
check(R"({"a":{}})", R"({"a":{}})");
}

// Eleven single-member levels collapse into one eleven-segment key.
TEST(function_json_object_flatten_test, deep_nesting) {
check(R"({"a":{"b":{"c":{"d":{"e":{"f":{"g":{"h":{"i":{"j":{"k":1}}}}}}}}}}})",
R"({"a.b.c.d.e.f.g.h.i.j.k":1})");
}

// The path built for "a" must not leak into the path built for sibling "b".
TEST(function_json_object_flatten_test, prefix_buffer_is_reset_across_siblings) {
check(R"({"a":{"x":1},"b":{"y":2}})", R"({"a.x":1,"b.y":2})");
}

// An array under a nested path is emitted whole at that path.
TEST(function_json_object_flatten_test, array_of_scalars_under_nested_path_stays_opaque) {
check(R"({"a":{"b":[1,2,3]}})", R"({"a.b":[1,2,3]})");
}

TEST(function_json_object_flatten_test, array_of_objects_under_nested_path_stays_opaque) {
// keep-arrays semantics: the array is a leaf value under "a.b"; the
// inner object's key "d" must NOT show up at the flat level.
check(R"({"a":{"b":[{"d":1},{"d":2}]}})", R"({"a.b":[{"d":1},{"d":2}]})");
}

// Scalar, array and object members side by side under one parent; only the
// object member is walked further.
TEST(function_json_object_flatten_test, mixed_object_scalar_and_array_leaves) {
check(R"({"x":{"s":1,"a":[1,2],"o":{"k":"v"}}})", R"({"x.s":1,"x.a":[1,2],"x.o.k":"v"})");
}

// A null member at the top level is kept as a null leaf.
TEST(function_json_object_flatten_test, null_leaf_at_top) {
check(R"({"a":null})", R"({"a":null})");
}

// A null member inside a nested object is kept as a null leaf at its path.
TEST(function_json_object_flatten_test, null_leaf_nested) {
check(R"({"a":{"b":null})", R"({"a.b":null})");
}

// Top-level values that are not objects pass through unchanged.
TEST(function_json_object_flatten_test, top_level_scalar_pass_through) {
check("42", "42");
check("\"hello\"", "\"hello\"");
check("null", "null");
check("true", "true");
}

// A top-level array passes through unchanged, including objects inside it.
TEST(function_json_object_flatten_test, top_level_array_pass_through) {
check(R"([1,2,3])", R"([1,2,3])");
check(R"([{"x":1}])", R"([{"x":1}])");
}

TEST(function_json_object_flatten_test, literal_dotted_key_round_trips) {
// A literal '.' inside a key collapses with real nesting at the flat layer
// — the same documented-lossy collision NiFi FlattenJson accepts.
check(R"({"a.b":2})", R"({"a.b":2})");
}

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonKeys;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonLength;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObject;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObjectFlatten;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonQuote;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonRemove;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonReplace;
Expand Down Expand Up @@ -840,6 +841,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(JsonArray.class, "json_array", "jsonb_array"),
scalar(JsonArrayIgnoreNull.class, "json_array_ignore_null", "jsonb_array_ignore_null"),
scalar(JsonObject.class, "json_object", "jsonb_object"),
scalar(JsonObjectFlatten.class, "json_object_flatten"),
scalar(JsonQuote.class, "json_quote"),
scalar(JsonUnQuote.class, "json_unquote"),
scalar(JsonExtractNoQuotes.class, "json_extract_no_quotes"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.JsonType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Scalar function {@code json_object_flatten}: takes one JSON argument and
 * returns JSON. A nested JSONB object becomes a single-level object whose keys
 * are the dot-joined paths to each leaf (NiFi FlattenJson "keep-arrays"
 * semantics, so arrays stay as opaque values):
 * <pre>
 *   {"a":{"b":2}}   -> {"a.b":2}
 *   {"a":[{"b":1}]} -> {"a":[{"b":1}]}
 * </pre>
 * To flatten a VARIANT, wrap it with {@code to_json}:
 * {@code json_object_flatten(to_json(v))}.
 */
public class JsonObjectFlatten extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** The only supported signature: JSON in, JSON out. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(JsonType.INSTANCE).args(JsonType.INSTANCE));

    /** Builds the function call over a single argument expression. */
    public JsonObjectFlatten(Expression json) {
        super("json_object_flatten", json);
    }

    /** Rebuilds the function from existing params; used by {@link #withChildren}. */
    private JsonObjectFlatten(ScalarFunctionParams functionParams) {
        super(functionParams);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitJsonObjectFlatten(this, context);
    }

    /** Returns a copy of this function over {@code newChildren}, which must hold exactly one expression. */
    @Override
    public JsonObjectFlatten withChildren(List<Expression> newChildren) {
        Preconditions.checkArgument(newChildren.size() == 1);
        return new JsonObjectFlatten(getFunctionParams(newChildren));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonKeys;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonLength;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObject;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonObjectFlatten;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonQuote;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonRemove;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonReplace;
Expand Down Expand Up @@ -1710,6 +1711,10 @@ default R visitJsonObject(JsonObject jsonObject, C context) {
return visitScalarFunction(jsonObject, context);
}

/** Visits a {@code json_object_flatten} call; by default it is handled as a generic scalar function. */
default R visitJsonObjectFlatten(JsonObjectFlatten flatten, C context) {
    return visitScalarFunction(flatten, context);
}

default R visitJsonExtractNoQuotes(JsonExtractNoQuotes jsonExtract, C context) {
return visitScalarFunction(jsonExtract, context);
}
Expand Down
39 changes: 39 additions & 0 deletions regression-test/data/variant_p0/test_json_object_flatten.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql_jsonb --
1 {"a.b":2}
2 {"a.b.c":3}
3 {"a":1,"b":"hi"}
4 {}
5 {"a":{}}
6 {"a":null}
7 {"a.b":null}
8 {"a.b.c.d.e.f.g.h.i.j.k.l":1}
9 \N
10 42
11 "hello"
12 {"a":[1,2,3]}
13 {"a":[{"b":1},{"b":2}]}
14 {"a.b":[1,2,3]}
15 {"a.b":[{"c":1},{"c":2}]}
16 [1,2,{"x":3}]
17 {"x.s":1,"x.a":[1,2],"x.o.k":"v"}

-- !sql_variant --
1 {"a.b":2}
2 {"a.b.c":3}
3 {"a":1,"b":"hi"}
4 \N
5 \N
6 \N
7 \N
8 {"a.b.c.d.e.f.g.h.i.j.k.l":1}
9 \N
10 42
11 "hello"
12 {"a":[1,2,3]}
13 {"a":[{"b":1},{"b":2}]}
14 {"a.b":[1,2,3]}
15 {"a.b":[{"c":1},{"c":2}]}
16 [1,2,{"x":3}]
17 {"x.a":[1,2],"x.o.k":"v","x.s":1}

Loading
Loading