diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index dcd81797dcf..f2d750d93b7 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -46,6 +46,36 @@ Tensor& sum_dim_out(
   ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
 
+  // Fast path: contiguous tensor, single innermost dim reduction, same dtype.
+  // Bypasses generic MapReduceOverDimListPlan to use a tight vectorizable loop.
+  if (in.numel() > 0 && dim_list.has_value() && dim_list.value().size() == 1 &&
+      !executorch::runtime::isComplexType(in.scalar_type()) &&
+      in.scalar_type() == out.scalar_type()) {
+    const int64_t d = dim_list.value()[0] < 0 ? dim_list.value()[0] + in.dim()
+                                              : dim_list.value()[0];
+    if (d >= 0 && d < in.dim() && d == in.dim() - 1 &&
+        tensor_is_contiguous(in)) {
+      const int64_t reduce_size = in.size(d);
+      const int64_t outer_size = in.numel() / reduce_size;
+
+      // @lint-ignore CLANGTIDY facebook-hte-CArray
+      static constexpr const char op_name[] = "sum.IntList_out";
+      ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        const CTYPE* in_data = in.const_data_ptr<CTYPE>();
+        CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
+        for (int64_t i = 0; i < outer_size; i++) {
+          const CTYPE* row = in_data + i * reduce_size;
+          CTYPE acc = 0;
+          for (int64_t j = 0; j < reduce_size; j++) {
+            acc += row[j];
+          }
+          out_data[i] = acc;
+        }
+      });
+      return out;
+    }
+  }
+
   std::optional<MapReduceOverDimListPlan> plan;
   if (in.numel() > 0) {
     plan.emplace(in, dim_list);
   }
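
For context, here is a minimal standalone sketch of the row-wise reduction loop the fast path relies on, written against plain buffers rather than ExecuTorch tensors. The `sum_last_dim` helper, the sample shape, and the values are illustrative only and are not part of the patch.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative only: mirrors the fast-path loop in the diff on raw buffers.
// A contiguous [outer_size, reduce_size] layout reduces its last dim by
// summing each row independently; the inner loop is trivially vectorizable.
template <typename CTYPE>
void sum_last_dim(
    const CTYPE* in_data,
    CTYPE* out_data,
    int64_t outer_size,
    int64_t reduce_size) {
  for (int64_t i = 0; i < outer_size; i++) {
    const CTYPE* row = in_data + i * reduce_size;
    CTYPE acc = 0;
    for (int64_t j = 0; j < reduce_size; j++) {
      acc += row[j];
    }
    out_data[i] = acc;
  }
}

int main() {
  // A 2x4 "tensor": rows sum to 10 and 26.
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(2);
  sum_last_dim(in.data(), out.data(), 2, 4);
  std::printf("%g %g\n", out[0], out[1]); // prints: 10 26
  return 0;
}
```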