From 515fd99fcd32354db5d32b89d4769bc3bb6932db Mon Sep 17 00:00:00 2001
From: Michael Vartanian
Date: Mon, 2 Feb 2026 07:20:11 -0800
Subject: [PATCH] Add modified masking routine to Quantized Softmax Operator
 (#16907)

Summary:
This diff introduces a modified masking method. Instead of consuming a full
mask tensor, the operator takes a position tensor and attends only to values
up to the given position. The current implementation reads just the first
value of the position tensor and increments the position by one for each
subsequent row.

Pull Request: https://github.com/pytorch/executorch/pull/16907

Differential Revision: D88997196
---
 .../operators/op_quantized_softmax.cpp        |  302 +++-
 .../generic/operators/op_quantized_softmax.h  |   35 +
 backends/cadence/generic/operators/tests/BUCK |   24 +
 .../tests/test_op_quantized_softmax.cpp       | 1540 +++++++++++++++++
 4 files changed, 1835 insertions(+), 66 deletions(-)
 create mode 100644 backends/cadence/generic/operators/tests/BUCK
 create mode 100644 backends/cadence/generic/operators/tests/test_op_quantized_softmax.cpp

diff --git a/backends/cadence/generic/operators/op_quantized_softmax.cpp b/backends/cadence/generic/operators/op_quantized_softmax.cpp
index 61037f22167..1656402d8f9 100644
--- a/backends/cadence/generic/operators/op_quantized_softmax.cpp
+++ b/backends/cadence/generic/operators/op_quantized_softmax.cpp
@@ -11,6 +11,8 @@
 #include
 #include
 #include
+#include <memory>
+#include <vector>
 #include
 #include

@@ -29,11 +31,95 @@
 using ::executorch::runtime::KernelRuntimeContext;
 using ::impl::generic::kernels::dequantize;
 using ::impl::generic::kernels::quantize;

+/**
+ * @brief Compute position mask for incremental causal masking
+ *
+ * Mask semantics: maskArray[i] = true means mask out (don't attend to
+ * position i). For posValue = P, elements 0..P are attended (false),
+ * elements P+1.. are masked (true).
+ *
+ * @param maskArray Output mask array to populate
+ * @param size Size of the mask array (softmax dimension size)
+ * @param posValue Current position value (elements 0..posValue are attended)
+ */
+void computePositionMask(bool* maskArray, size_t size, int64_t posValue) {
+  for (size_t i = 0; i < size; ++i) {
+    maskArray[i] = (static_cast<int64_t>(i) > posValue);
+  }
+}
+
+/**
+ * @brief Update position mask incrementally for the next row
+ *
+ * This is an amortized O(1) operation per row instead of an O(n) full
+ * recomputation. Unmasks positions (lastUnmaskedPos + 1) through newPosValue.
+ *
+ * @param maskArray Mask array to update in-place
+ * @param size Size of the mask array
+ * @param lastUnmaskedPos Reference to track highest unmasked position
+ * @param newPosValue New position value to unmask up to
+ */
+void updatePositionMaskIncremental(
+    bool* maskArray,
+    size_t size,
+    int64_t& lastUnmaskedPos,
+    int64_t newPosValue) {
+  // Clamp to a local variable to maintain clear semantics and avoid modifying
+  // the parameter, which could cause confusion about caller-side effects.
+  const int64_t clampedPosValue =
+      std::min(newPosValue, static_cast<int64_t>(size) - 1);
+
+  while (lastUnmaskedPos < clampedPosValue) {
+    lastUnmaskedPos++;
+    if (lastUnmaskedPos >= 0 && lastUnmaskedPos < static_cast<int64_t>(size)) {
+      maskArray[lastUnmaskedPos] = false;
+    }
+  }
+}
+
+/**
+ * @brief Core implementation of quantized softmax with optional causal
+ * masking.
+ *
+ * Algorithm Overview:
+ * ===================
+ * This function computes softmax on quantized input tensors with support for
+ * position-based causal masking, commonly used in transformer attention
+ * layers.
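+ *
+ * For example, with lastDimSize = 4 and pos[0] = 1, row 0 attends positions
+ * {0, 1}, row 1 attends {0, 1, 2}, and rows 2 and beyond attend the whole
+ * row.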
+ *
+ * Softmax Formula (numerically stable version):
+ *   softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x))) for all j
+ *
+ * The computation proceeds in these phases:
+ * 1. Dequantize: Convert quantized input to float using in_scale and
+ *    in_zero_point
+ * 2. Find max: Compute max over unmasked positions (for numerical stability)
+ * 3. Exp & sum: Compute exp(x - max) and accumulate sum for unmasked
+ *    positions
+ * 4. Normalize: Divide by sum to get probabilities
+ * 5. Quantize: Convert back to quantized output using out_scale and
+ *    out_zero_point
+ *
+ * Causal Masking (mask_type == 1):
+ * ================================
+ * Implements incremental causal attention where each row can attend to
+ * progressively more positions. For base position P and row index i:
+ * - Positions 0 to (P + i) are attended (included in softmax)
+ * - Positions (P + i + 1) onwards are masked (set to 0 probability)
+ *
+ * This creates a lower-triangular attention pattern commonly used in
+ * autoregressive language models to prevent attending to future tokens.
+ *
+ * Memory Layout:
+ * ==============
+ * Input is treated as a 2D tensor of shape [outerSize, lastDimSize] where:
+ * - outerSize = total_elements / lastDimSize (number of rows)
+ * - lastDimSize = size of the last dimension (softmax is computed over this)
+ *
+ * @tparam T Quantized data type (int8, uint8, int16, etc.)
+ */
 template <typename T>
 void quantized_softmax_per_tensor_(
     const Tensor& input,
     ET_UNUSED const Tensor& mask,
     int64_t dim,
+    int64_t mask_type,
+    const Tensor& pos,
     const float in_scale,
     const int64_t in_zero_point,
     const float out_scale,
@@ -46,81 +132,155 @@ void quantized_softmax_per_tensor_(
   if (dim < 0) {
     dim += input.dim();
   }
+
+  const size_t num_dims = input.dim();
+  const size_t lastDimSize = input.size(num_dims - 1);
+  const size_t outerSize = input.numel() / lastDimSize;
+
+  // Validate dimension: this implementation only supports softmax over the
+  // last dimension. The dim parameter after normalization should equal
+  // (num_dims - 1).
+  ET_DCHECK_MSG(
+      dim == static_cast<int64_t>(num_dims - 1),
+      "quantized_softmax_per_tensor_ only supports softmax over the last "
+      "dimension. Got dim=%ld, expected dim=%zu",
+      static_cast<long>(dim),
+      num_dims - 1);
+
   const int64_t input_size = input.numel();
-  float* x = new float[input_size];
-
-  torch::executor::apply_over_dim(
-      [in_data,
-       out_data,
-       x,
-       in_scale,
-       in_zero_point,
-       out_inv_scale,
-       out_zero_point](
-          const size_t size, const size_t stride, const size_t base) {
-        // Dequantize the input tensor
-        torch::executor::apply_unary_map_fn(
-            [in_scale, in_zero_point](const float val_in) {
-              return dequantize(
-                  val_in, in_scale, static_cast<int32_t>(in_zero_point));
-            },
-            in_data + base,
-            x + base,
-            size,
-            stride);
-
-        // Subtract max(X) from input tensor
-        float max_in = torch::executor::apply_unary_reduce_fn(
-            [](const float val_in, float val_accum) {
-              return std::max(val_in, val_accum);
-            },
-            x + base,
-            size,
-            stride);
-
-        // Compute exp(X - max(X))
-        torch::executor::apply_unary_map_fn(
-            [max_in](const float val_in) { return std::exp(val_in - max_in); },
-            x + base,
-            x + base,
-            size,
-            stride);
-
-        // Compute sum(exp(X - max(X))
-        float temp_sum = torch::executor::apply_unary_reduce_fn(
-            [](const float val_in, float val_accum) {
-              return val_accum + val_in;
-            },
-            x + base,
-            size,
-            stride);
-
-        // Compute exp(X - max(X)) / sum(exp(X - max(X)) and quantize the
-        float recip = 1.0 / temp_sum;
-        torch::executor::apply_unary_map_fn(
-            [recip, out_inv_scale, out_zero_point](const float val_in) {
-              float res = val_in * recip;
-              return quantize<T>(
-                  res, out_inv_scale, static_cast<int32_t>(out_zero_point));
-            },
-            x + base,
-            out_data + base,
-            size,
-            stride);
-      },
-      input,
-      dim);
+  std::vector<float> x(input_size); // Working buffer for dequantized values
+
+  // ========================================================================
+  // Mask Initialization (for mask_type == 1: position-based causal masking)
+  // ========================================================================
+  // positionMask[i] = true means position i is masked (excluded from softmax)
+  // positionMask[i] = false means position i is attended (included in
+  // softmax)
+  //
+  // Initial state based on basePosValue (from pos tensor):
+  //   - If basePosValue < 0: all positions masked (edge case)
+  //   - If basePosValue >= lastDimSize: no positions masked
+  //   - Otherwise: positions 0..basePosValue unmasked, rest masked
+  // ========================================================================
+  std::unique_ptr<bool[]> positionMask;
+  int64_t lastUnmaskedPos =
+      -1; // Tracks highest unmasked index for incremental updates
+  int64_t basePosValue = 0;
+
+  if (mask_type == 1 && pos.numel() > 0) {
+    positionMask = std::make_unique<bool[]>(lastDimSize);
+
+    if (pos.scalar_type() == ::executorch::aten::ScalarType::Short) {
+      basePosValue = static_cast<int64_t>(pos.const_data_ptr<int16_t>()[0]);
+    } else {
+      basePosValue = pos.const_data_ptr<int64_t>()[0];
+    }
+
+    if (basePosValue < 0) {
+      std::fill(positionMask.get(), positionMask.get() + lastDimSize, true);
+      lastUnmaskedPos = -1;
+    } else if (basePosValue >= static_cast<int64_t>(lastDimSize)) {
+      std::fill(positionMask.get(), positionMask.get() + lastDimSize, false);
+      lastUnmaskedPos = static_cast<int64_t>(lastDimSize) - 1;
+    } else {
+      computePositionMask(positionMask.get(), lastDimSize, basePosValue);
+      lastUnmaskedPos = basePosValue;
+    }
+  }

-  delete[] x;
+  // Determine if incremental mask updates are needed. This is true only when:
+  // - mask_type == 1 (position-based causal masking is enabled)
+  // - positionMask was allocated (pos tensor has elements)
+  // - basePosValue >= 0 (not all positions are masked from the start)
+  // By computing this once outside the loop, we avoid redundant checks on
+  // every iteration since basePosValue doesn't change during the loop.
+  const bool needsIncrementalMaskUpdate =
+      (mask_type == 1 && positionMask && basePosValue >= 0);
+
+  // ========================================================================
+  // Main Loop: Process each row independently
+  // ========================================================================
+  // For each row idx in [0, outerSize):
+  //   1. Update mask if using incremental causal masking
+  //   2. Dequantize input values
+  //   3. Find max over unmasked positions (numerical stability)
+  //   4. Compute exp(x - max) for unmasked, 0 for masked positions
+  //   5. Normalize by sum to get probabilities
+  //   6. Quantize and store output
+  // ========================================================================
+  for (size_t idx = 0; idx < outerSize; ++idx) {
+    const size_t base = idx * lastDimSize;
+
+    // Step 1: Incremental mask update for causal attention
+    // For row idx, unmask positions up to (basePosValue + idx)
+    // This is O(1) amortized per row instead of O(n) full recomputation
+    if (needsIncrementalMaskUpdate) {
+      int64_t newPosValue = basePosValue + static_cast<int64_t>(idx);
+      updatePositionMaskIncremental(
+          positionMask.get(), lastDimSize, lastUnmaskedPos, newPosValue);
+    }
+
+    // Step 2: Dequantize input values
+    // x_float = (x_quant - zero_point) * scale
+    for (size_t i = 0; i < lastDimSize; ++i) {
+      x[base + i] = dequantize(
+          in_data[base + i], in_scale, static_cast<int32_t>(in_zero_point));
+    }
+
+    // Step 3: Find max over unmasked positions for numerical stability
+    // Subtracting max prevents exp() overflow for large values
+    float max_in = -std::numeric_limits<float>::infinity();
+    for (size_t i = 0; i < lastDimSize; ++i) {
+      bool isMasked =
+          (mask_type == 1 && positionMask) ? positionMask[i] : false;
+      if (!isMasked) {
+        max_in = std::max(max_in, x[base + i]);
+      }
+    }
+
+    // Handle edge case: all positions masked (use 0 as neutral max)
+    if (max_in == -std::numeric_limits<float>::infinity()) {
+      max_in = 0.0f;
+    }
+
+    // Step 4: Compute exp(x - max) and accumulate sum
+    // Masked positions get 0, unmasked positions get exp(x - max)
+    float temp_sum = 0.0f;
+    for (size_t i = 0; i < lastDimSize; ++i) {
+      bool isMasked =
+          (mask_type == 1 && positionMask) ? positionMask[i] : false;
+      if (isMasked) {
+        x[base + i] = 0.0f; // Masked positions contribute 0 probability
+      } else {
+        x[base + i] = std::exp(x[base + i] - max_in);
+        temp_sum += x[base + i];
+      }
+    }
+
+    // Step 5 & 6: Normalize and quantize output
+    // softmax_i = exp_i / sum, then quantize to output type
+    float recip = (temp_sum > 0.0f) ? (1.0f / temp_sum) : 0.0f;
+    for (size_t i = 0; i < lastDimSize; ++i) {
+      float res = x[base + i] * recip;
+      out_data[base + i] =
+          quantize<T>(res, out_inv_scale, static_cast<int32_t>(out_zero_point));
+    }
+  }
 }

-// Compute quantized softmax. The current implementation assumes that the
-// input is per-tensor quantized.
+/**
+ * @brief Wrapper that extracts quantization parameters from tensors.
+ *
+ * This function extracts scalar quantization parameters from input tensors
+ * and delegates to quantized_softmax_per_tensor_ for the actual computation.
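+ * (For example, an in_scale tensor of shape {1} holding 0.1f yields an
+ * input scale of 0.1f.)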
+ * Used when quantization parameters are provided as single-element tensors + * rather than scalar values. + */ template void quantized_softmax_( const Tensor& input, const Tensor& mask, const int64_t dim, + int64_t mask_type, + const Tensor& pos, const Tensor& in_scale, const Tensor& in_zero_point, const Tensor& out_scale, @@ -135,6 +295,8 @@ void quantized_softmax_( input, mask, dim, + mask_type, + pos, input_scale, input_zero_point, output_scale, @@ -149,6 +311,8 @@ Tensor& quantized_softmax_out( const Tensor& input, const Tensor& mask, int64_t dim, + int64_t mask_type, + const Tensor& pos, const Tensor& in_scale, const Tensor& in_zero_point, const Tensor& out_scale, @@ -160,6 +324,8 @@ Tensor& quantized_softmax_out( input, \ mask, \ dim, \ + mask_type, \ + pos, \ in_scale, \ in_zero_point, \ out_scale, \ @@ -185,6 +351,8 @@ Tensor& quantized_softmax_per_tensor_out( const Tensor& input, const Tensor& mask, int64_t dim, + int64_t mask_type, + const Tensor& pos, double in_scale, int64_t in_zero_point, double out_scale, @@ -196,6 +364,8 @@ Tensor& quantized_softmax_per_tensor_out( input, \ mask, \ dim, \ + mask_type, \ + pos, \ in_scale, \ in_zero_point, \ out_scale, \ diff --git a/backends/cadence/generic/operators/op_quantized_softmax.h b/backends/cadence/generic/operators/op_quantized_softmax.h index 485113851a3..4ce6a966f77 100644 --- a/backends/cadence/generic/operators/op_quantized_softmax.h +++ b/backends/cadence/generic/operators/op_quantized_softmax.h @@ -15,11 +15,44 @@ namespace impl { namespace generic { namespace native { +/** + * @brief Compute quantized softmax with optional masking support. + * + * Computes softmax on the input tensor along the specified dimension, + * with support for different masking strategies controlled by mask_type. + * + * @param ctx Kernel runtime context (unused) + * @param input Input quantized tensor + * @param mask Mask tensor (currently unused, reserved for future mask types) + * @param dim Dimension along which to compute softmax. Only the last dimension + * is currently supported (dim == -1 or dim == input.dim() - 1) + * @param mask_type Masking strategy to use: + * - 0: No masking. Standard softmax is computed over all + * elements in the dimension. + * - 1: Position-based causal masking. Uses the pos tensor to + * determine which positions to attend to. For each row i, + * positions 0 to (pos[0] + i) are attended, and positions + * beyond that are masked out (set to 0 probability). + * This implements incremental causal attention where + * each subsequent row can attend to one additional + * position. + * @param pos Position tensor for causal masking (used when mask_type == 1). + * Contains the base position value. Supports int16 (Short) or + * int64 (Long) scalar types. 
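+ *            For example, with pos = {3} and rows of length 16, row 0
+ *            attends columns 0-3, row 1 attends columns 0-4, and so on
+ *            (mirroring the causal-masking unit tests below).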
+ * @param in_scale Input quantization scale tensor + * @param in_zero_point Input quantization zero point tensor + * @param out_scale Output quantization scale tensor + * @param out_zero_point Output quantization zero point tensor + * @param out Output tensor to store the result + * @return Reference to the output tensor + */ ::executorch::aten::Tensor& quantized_softmax_out( __ET_UNUSED ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& mask, int64_t dim, + int64_t mask_type, + const ::executorch::aten::Tensor& pos, const ::executorch::aten::Tensor& in_scale, const ::executorch::aten::Tensor& in_zero_point, const ::executorch::aten::Tensor& out_scale, @@ -31,6 +64,8 @@ ::executorch::aten::Tensor& quantized_softmax_per_tensor_out( const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& mask, int64_t dim, + int64_t mask_type, + const ::executorch::aten::Tensor& pos, double in_scale, int64_t in_zero_point, double out_scale, diff --git a/backends/cadence/generic/operators/tests/BUCK b/backends/cadence/generic/operators/tests/BUCK new file mode 100644 index 00000000000..ff3d42344ec --- /dev/null +++ b/backends/cadence/generic/operators/tests/BUCK @@ -0,0 +1,24 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") +load("@fbcode//on_device_ai/Assistant/Jarvis/build:cxx_wrapper.bzl", "jarvis_wrapper") + +oncall("executorch") + +jarvis_wrapper.cxx_test( + name = "test_op_quantized_softmax", + srcs = [ + "test_op_quantized_softmax.cpp", + ], + compatible_backends = ["generic"], + labels = [ci.linux(ci.mode("fbsource//arvr/mode/platform010/dev"))], + platforms = CXX, + visibility = [ + "fbsource//xplat/executorch/backends/cadence/...", + "fbcode//executorch/backends/cadence/...", + ], + deps = [ + "fbsource//xplat/executorch/backends/cadence/generic/operators:op_quantized_softmax", + "fbsource//xplat/executorch/kernels/test:gtest_utils", + "fbsource//xplat/executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], +) diff --git a/backends/cadence/generic/operators/tests/test_op_quantized_softmax.cpp b/backends/cadence/generic/operators/tests/test_op_quantized_softmax.cpp new file mode 100644 index 00000000000..f08af743191 --- /dev/null +++ b/backends/cadence/generic/operators/tests/test_op_quantized_softmax.cpp @@ -0,0 +1,1540 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace generic { +namespace native { +namespace { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::testing::TensorFactory; + +class GenericQuantizedSoftmaxTest : public OperatorTest { + public: + protected: + // Helper that accepts explicit mask_type value + Tensor& quantized_softmax_per_tensor_out( + const Tensor& input, + const Tensor& mask, + int64_t dim, + int64_t mask_type, + const Tensor& pos, + double in_scale, + int64_t in_zero_point, + double out_scale, + int64_t out_zero_point, + Tensor& output) { + return impl::generic::native::quantized_softmax_per_tensor_out( + context_, + input, + mask, + dim, + mask_type, + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + } +}; + +// Test basic softmax without masking (mask_type = 0) +// Uses a 4x16 input tensor with explicit data values +TEST_F(GenericQuantizedSoftmaxTest, BasicSoftmaxInt8NoMask) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: 4 rows x 16 cols (64 elements) + // Row 0: values 10-25 (dequantized: 1.0 to 2.5) + // Row 1: values 20-35 (dequantized: 2.0 to 3.5) + // Row 2: uniform values (dequantized: all 5.0) + // Row 3: alternating pattern + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 1 + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + // Row 2 + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + // Row 3 + 10, 30, 10, 30, 10, 30, 10, 30, 10, 30, 10, 30, 10, 30, 10, 30 + }); + // clang-format on + + // ============================================================ + // Output tensor: 4 rows x 16 cols, initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused when mask_type = 0) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused when mask_type = 0) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Expected output tensor + // Softmax is computed on dequantized values, then requantized. 
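+  // Hand-checked sanity value: in row 0 the dequantized max is 2.5, so the
+  // largest term is exp(2.5 - 2.5) = 1.0; the row sum is
+  // sum_{k=0..15} exp(-0.1 * k) ≈ 8.39, giving a probability of
+  // 1 / 8.39 ≈ 0.119, which quantizes to round(0.119 * 127) = 15.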
+ // Row 0: dequantized [1.0, 1.1, ..., 2.5], softmax applied + // Row 1: dequantized [2.0, 2.1, ..., 3.5], softmax applied + // Row 2: dequantized [5.0, 5.0, ..., 5.0], uniform -> ~8 each (127/16) + // Row 3: dequantized alternating [1.0, 3.0, ...], bimodal distribution + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: increasing values [1.0-2.5], softmax gives increasing probs + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15, + // Row 1: increasing values [2.0-3.5], softmax gives increasing probs + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15, + // Row 2: uniform input -> uniform output (~8 each, 127/16 ≈ 7.9) + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + // Row 3: alternating [1.0, 3.0] -> bimodal [low, high, low, high, ...] + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14 + }); + // clang-format on + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, // dim = last dimension + 0, // mask_type = 0 (no masking) + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with uint8 input type using larger tensor +TEST_F(GenericQuantizedSoftmaxTest, BasicSoftmaxUInt8NoMask) { + TensorFactory tf_uint8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 8; + constexpr int kNumCols = 32; + + // ============================================================ + // Input tensor: 8 rows x 32 cols (256 elements) + // Pattern: (i % 32) + 10 for each element + // ============================================================ + // clang-format off + Tensor input = tf_uint8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 1 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 2 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 3 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 4 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 5 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 6 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Row 7 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 + }); + // clang-format on + + // ============================================================ + // Output tensor: 8 rows x 32 cols, initialized to zeros + // ============================================================ + Tensor output = tf_uint8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single 
element (unused) + // ============================================================ + Tensor mask = tf_uint8.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 255.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Expected output tensor + // All rows have the same increasing pattern, softmax distribution + // ============================================================ + // clang-format off + Tensor expected = tf_uint8.make( + {kNumRows, kNumCols}, + { + // Row 0: increasing values [1.0-4.1], softmax distribution + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 1 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 2 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 3 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 4 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 5 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 6 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25, + // Row 7 + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 13, 14, 15, 17, 19, 21, 23, 25 + }); + // clang-format on + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 0, // mask_type = 0 + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with int16 input type +TEST_F(GenericQuantizedSoftmaxTest, BasicSoftmaxInt16NoMask) { + TensorFactory tf_int16; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: 4 rows x 16 cols + // Pattern: increasing values per row + // ============================================================ + // clang-format off + Tensor input = tf_int16.make( + {kNumRows, kNumCols}, + { + // Row 0: 100, 110, 120, ..., 250 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 1 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 2 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 3 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + }); + // clang-format on + + // ============================================================ + // Output tensor: 4 rows x 16 cols, 
initialized to zeros + // ============================================================ + Tensor output = tf_int16.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int16.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.01; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 32767.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 0, + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // ============================================================ + // Expected output tensor + // All rows have the same increasing pattern, softmax distribution + // With scale=0.00003 and zero_point=0, values are in int16 range + // ============================================================ + // clang-format off + Tensor expected = tf_int16.make( + {kNumRows, kNumCols}, + { + // Row 0: increasing values [1.0-2.5], softmax distribution + 872, 963, 1065, 1177, 1301, 1437, 1588, 1756, 1940, 2144, 2370, 2619, 2894, 3199, 3535, 3907, + // Row 1 + 872, 963, 1065, 1177, 1301, 1437, 1588, 1756, 1940, 2144, 2370, 2619, 2894, 3199, 3535, 3907, + // Row 2 + 872, 963, 1065, 1177, 1301, 1437, 1588, 1756, 1940, 2144, 2370, 2619, 2894, 3199, 3535, 3907, + // Row 3 + 872, 963, 1065, 1177, 1301, 1437, 1588, 1756, 1940, 2144, 2370, 2619, 2894, 3199, 3535, 3907 + }); + // clang-format on + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with position-based causal masking (mask_type = 1) +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxWithCausalMaskingInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions: 8 rows x 16 cols (simulating attention matrix) + // ============================================================ + constexpr int kNumRows = 8; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: attention scores + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 1 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 2 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 3 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 4 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 5 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 6 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 7 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 + }); + // clang-format on + + // ============================================================ + // Output tensor: 
8 rows x 16 cols, initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused when pos is used) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: start attending from position 3 + // ============================================================ + Tensor pos = tf_int64.make({1}, {3}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // mask_type = 1 (position-based causal masking) + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // ============================================================ + // Expected output tensor + // Causal masking with pos=3: row i attends to positions 0..(3+i) + // Row 0 (pos=3): attend 0-3, mask 4-15 + // Row 1 (pos=4): attend 0-4, mask 5-15 + // Row 2 (pos=5): attend 0-5, mask 6-15 + // Row 3 (pos=6): attend 0-6, mask 7-15 + // Row 4 (pos=7): attend 0-7, mask 8-15 + // Row 5 (pos=8): attend 0-8, mask 9-15 + // Row 6 (pos=9): attend 0-9, mask 10-15 + // Row 7 (pos=10): attend 0-10, mask 11-15 + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: attend cols 0-3, mask cols 4-15 + 27, 30, 33, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 1: attend cols 0-4, mask cols 5-15 + 21, 23, 25, 28, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 2: attend cols 0-5, mask cols 6-15 + 16, 18, 20, 22, 24, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 3: attend cols 0-6, mask cols 7-15 + 13, 15, 16, 18, 20, 22, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 4: attend cols 0-7, mask cols 8-15 + 11, 12, 13, 15, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 5: attend cols 0-8, mask cols 9-15 + 9, 10, 11, 12, 14, 15, 17, 18, 20, 0, 0, 0, 0, 0, 0, 0, + // Row 6: attend cols 0-9, mask cols 10-15 + 8, 9, 9, 10, 12, 13, 14, 16, 17, 19, 0, 0, 0, 0, 0, 0, + // Row 7: attend cols 0-10, mask cols 11-15 + 7, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 0, 0, 0, 0, 0 + }); + // clang-format on + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with uniform input (all same values) +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxUniformInputInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: all same values (uniform) = 50 + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + // Row 1 + 50, 50, 
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + // Row 2 + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + // Row 3 + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 0, + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // ============================================================ + // Expected output tensor + // Uniform input: all outputs should be approximately equal (1/16 per element) + // Expected value: 127 / 16 ≈ 8 + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: uniform output + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + // Row 1 + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + // Row 2 + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + // Row 3 + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 + }); + // clang-format on + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with negative position value (all positions masked) +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxAllMaskedInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: various values + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, + // Row 1 + 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, + // Row 2 + 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, + // Row 3 + 10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 60, 10, 20, 30, 40 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // 
============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: -1 means all positions masked + // ============================================================ + Tensor pos = tf_int64.make({1}, {-1}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // mask_type = 1 + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // ============================================================ + // Expected output tensor + // All positions are masked, so all outputs should be zero + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: all masked + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 3 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }); + // clang-format on + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with large position value (no positions masked) +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxNoneMaskedInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: increasing pattern + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 1 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 2 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + // Row 3 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: 1000 > size, so no masking + // ============================================================ + Tensor pos = tf_int64.make({1}, {1000}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // 
============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // mask_type = 1 + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // ============================================================ + // Expected output tensor + // No masking (mask_type=0), all positions attended + // Same pattern as BasicSoftmaxInt8NoMask + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: increasing values, softmax distribution + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15, + // Row 1 + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15, + // Row 2 + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15, + // Row 3 + 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 14, 15 + }); + // clang-format on + + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with single element per row +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxSingleElementInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions: multiple rows, single column + // ============================================================ + constexpr int kNumRows = 8; + constexpr int kNumCols = 1; + + // ============================================================ + // Input tensor: various single values + // ============================================================ + Tensor input = + tf_int8.make({kNumRows, kNumCols}, {10, 20, 30, 40, 50, 60, 70, 80}); + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 0, + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // Single element softmax should output 1.0 (quantized as 127 for all rows) + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + 127, // Row 0 + 127, // Row 1 + 127, // Row 2 + 127, // Row 3 + 127, // Row 4 + 127, // Row 5 + 127, // Row 6 + 127, // Row 7 + }); + // clang-format on + EXPECT_TENSOR_EQ(output, expected); +} + +// Test softmax with int16 position tensor for causal masking +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxCausalMaskingInt16Pos) { + TensorFactory tf_int16; + + // ============================================================ 
+ // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 16; + + // ============================================================ + // Input tensor: increasing values + // ============================================================ + // clang-format off + Tensor input = tf_int16.make( + {kNumRows, kNumCols}, + { + // Row 0 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 1 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 2 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, + // Row 3 + 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int16.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int16.make({1}, {0}); + + // ============================================================ + // Position tensor as int16: start from position 5 + // ============================================================ + Tensor pos = tf_int16.make({1}, {5}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.01; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 32767.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // mask_type = 1 + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // Compare with expected output tensor + // Row 0 (pos=5): positions 0-5 attended (exponential growth), positions 6-15 + // masked (0) Row 1 (pos=5+1=6): positions 0-6 attended, positions 7-15 masked + // Row 2 (pos=5+2=7): positions 0-7 attended, positions 8-15 masked + // Row 3 (pos=5+3=8): positions 0-8 attended, positions 9-15 masked + // clang-format off + Tensor expected = tf_int16.make( + {kNumRows, kNumCols}, + { + // Row 0: positions 0-5 attended (exp growth), 6-15 masked + 4192, 4633, 5120, 5658, 6253, 6911, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 1: positions 0-6 attended, 7-15 masked + 3399, 3757, 4152, 4589, 5071, 5605, 6194, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 2: positions 0-7 attended, 8-15 masked + 2812, 3108, 3434, 3796, 4195, 4636, 5124, 5663, 0, 0, 0, 0, 0, 0, 0, 0, + // Row 3: positions 0-8 attended, 9-15 masked + 2361, 2609, 2884, 3187, 3522, 3893, 4302, 4754, 5255, 0, 0, 0, 0, 0, 0, 0, + }); + // clang-format on + EXPECT_TENSOR_EQ(output, expected); +} + +// Test numerical accuracy: verify softmax with known values +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxNumericalAccuracyKnown) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions: 2 rows x 4 cols + // ============================================================ + constexpr int kNumRows = 2; + constexpr int kNumCols = 4; + + // 
============================================================ + // Input tensor: [0, 1, 2, 3] repeated + // softmax([0, 1, 2, 3]) = [0.0321, 0.0871, 0.2369, 0.6439] + // ============================================================ + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + {// Row 0 + 0, + 1, + 2, + 3, + // Row 1 + 0, + 1, + 2, + 3}); + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: single element (unused) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // With out_scale = 0.01, expected quantized values: [3, 9, 24, 64] + // ============================================================ + const double in_scale = 1.0; + const int64_t in_zero_point = 0; + const double out_scale = 0.01; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 0, + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // Compare with expected output tensor + // Both rows have identical input [0, 1, 2, 3], so both have the same + // softmax distribution: approx [0.032, 0.087, 0.237, 0.644] + // Quantized with scale=0.01, zero_point=0: [3, 9, 24, 64] + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: softmax([0,1,2,3]) quantized + 3, 9, 24, 64, + // Row 1: identical distribution + 3, 9, 24, 64, + }); + // clang-format on + EXPECT_TENSOR_EQ(output, expected); +} + +// Test causal masking with larger sequence for attention-like patterns +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxCausalMaskingLargeSequence) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions: simulating multi-head attention + // ============================================================ + constexpr int kNumHeads = 4; + constexpr int kSeqLen = 32; + + // ============================================================ + // Input tensor: attention scores (128 elements) + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumHeads, kSeqLen}, + { + // Head 0 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Head 1 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Head 2 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + // Head 3 + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 + }); + // clang-format on + + // 
============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({kNumHeads, kSeqLen}); + + // ============================================================ + // Mask tensor: single element (unused) + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: start with position 7 + // ============================================================ + Tensor pos = tf_int64.make({1}, {7}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // Causal masking + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + // Compare with expected output tensor + // Head 0 (pos=7): positions 0-7 attended, positions 8-31 masked + // Head 1 (pos=8): positions 0-8 attended, positions 9-31 masked + // Head 2 (pos=9): positions 0-9 attended, positions 10-31 masked + // Head 3 (pos=10): positions 0-10 attended, positions 11-31 masked + // clang-format off + Tensor expected = tf_int8.make( + {kNumHeads, kSeqLen}, + { + // Head 0: positions 0-7 attended, 8-31 masked + 11, 12, 13, 15, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Head 1: positions 0-8 attended, 9-31 masked + 9, 10, 11, 12, 14, 15, 17, 18, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Head 2: positions 0-9 attended, 10-31 masked + 8, 9, 9, 10, 12, 13, 14, 16, 17, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Head 3: positions 0-10 attended, 11-31 masked + 7, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }); + // clang-format on + EXPECT_TENSOR_EQ(output, expected); +} + +// ============================================================ +// Test softmax with basePosValue = 0 (only first position attended) +// ============================================================ +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxFirstPositionOnlyInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumRows = 4; + constexpr int kNumCols = 8; + + // ============================================================ + // Input tensor: 4 rows x 8 cols + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0 + 50, 40, 30, 20, 10, 10, 10, 10, + // Row 1 + 50, 40, 30, 20, 10, 10, 10, 10, + // Row 2 + 50, 40, 30, 20, 10, 10, 10, 10, + // Row 3 + 50, 40, 30, 20, 10, 10, 10, 10 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = 
tf_int8.zeros({kNumRows, kNumCols}); + + // ============================================================ + // Mask tensor: unused + // ============================================================ + Tensor mask = tf_int8.make({1}, {0}); + + // ============================================================ + // Position tensor: 0 means only first position attended + // Row 0: pos=0 (only col 0 attended) + // Row 1: pos=1 (cols 0-1 attended) + // Row 2: pos=2 (cols 0-2 attended) + // Row 3: pos=3 (cols 0-3 attended) + // ============================================================ + Tensor pos = tf_int64.make({1}, {0}); + + // ============================================================ + // Quantization parameters + // ============================================================ + const double in_scale = 0.1; + const int64_t in_zero_point = 0; + const double out_scale = 1.0 / 127.0; + const int64_t out_zero_point = 0; + + // ============================================================ + // Expected output tensor + // Row 0: pos=0, only col 0 attended -> softmax([5.0]) = [1.0] -> [127] + // Row 1: pos=1, cols 0-1 attended -> softmax([5.0, 4.0]) + // = exp([5,4]) / sum = [148.4, 54.6] / 203 = [0.73, 0.27] + // Quantized at scale=1/127: [93, 34] + // Row 2: pos=2, cols 0-2 attended -> softmax([5.0, 4.0, 3.0]) + // = exp([5,4,3]) / sum = [148.4, 54.6, 20.1] / 223.1 = [0.665, 0.245, + // 0.09] Quantized at scale=1/127: [84, 31, 11] + // Row 3: pos=3, cols 0-3 attended -> softmax([5.0, 4.0, 3.0, 2.0]) + // = exp([5,4,3,2]) / sum = [148.4, 54.6, 20.1, 7.4] / 230.5 = [0.644, + // 0.237, 0.087, 0.032] Quantized at scale=1/127: [82, 30, 11, 4] + // Masked positions get out_zero_point = 0 + // ============================================================ + // clang-format off + Tensor expected = tf_int8.make( + {kNumRows, kNumCols}, + { + // Row 0: only col 0 attended + 127, 0, 0, 0, 0, 0, 0, 0, + // Row 1: cols 0-1 attended + 93, 34, 0, 0, 0, 0, 0, 0, + // Row 2: cols 0-2 attended + 84, 31, 11, 0, 0, 0, 0, 0, + // Row 3: cols 0-3 attended + 82, 30, 11, 4, 0, 0, 0, 0 + }); + // clang-format on + + // ============================================================ + // Execute softmax + // ============================================================ + quantized_softmax_per_tensor_out( + input, + mask, + -1, + 1, // mask_type = 1 + pos, + in_scale, + in_zero_point, + out_scale, + out_zero_point, + output); + + EXPECT_TENSOR_EQ(output, expected); +} + +// ============================================================ +// Test softmax with single row +// ============================================================ +TEST_F(GenericQuantizedSoftmaxTest, SoftmaxSingleRowInt8) { + TensorFactory tf_int8; + TensorFactory tf_int64; + + // ============================================================ + // Tensor dimensions + // ============================================================ + constexpr int kNumCols = 8; + + // ============================================================ + // Input tensor: 1 row x 8 cols + // ============================================================ + // clang-format off + Tensor input = tf_int8.make( + {1, kNumCols}, + { + 100, 90, 80, 70, 60, 50, 40, 30 + }); + // clang-format on + + // ============================================================ + // Output tensor: initialized to zeros + // ============================================================ + Tensor output = tf_int8.zeros({1, kNumCols}); + + // ============================================================ + // Mask tensor: unused 
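+  // (mask_type = 1 in this test drives masking from the pos tensor alone,
+  //  so this mask is just a placeholder)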
+
+// ============================================================
+// Test softmax with Int16 position tensor
+// ============================================================
+TEST_F(GenericQuantizedSoftmaxTest, SoftmaxWithInt16PositionTensor) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Short> tf_int16;
+
+  // ============================================================
+  // Tensor dimensions
+  // ============================================================
+  constexpr int kNumRows = 2;
+  constexpr int kNumCols = 8;
+
+  // ============================================================
+  // Input tensor: 2 rows x 8 cols
+  // ============================================================
+  // clang-format off
+  Tensor input = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          // Row 0
+          10, 20, 30, 40, 50, 60, 70, 80,
+          // Row 1
+          10, 20, 30, 40, 50, 60, 70, 80
+      });
+  // clang-format on
+
+  // ============================================================
+  // Output tensor: initialized to zeros
+  // ============================================================
+  Tensor output = tf_int8.zeros({kNumRows, kNumCols});
+
+  // ============================================================
+  // Mask tensor: unused
+  // ============================================================
+  Tensor mask = tf_int8.make({1}, {0});
+
+  // ============================================================
+  // Position tensor: Int16 type with value 2
+  // Row 0: pos=2 (cols 0-2 attended)
+  // Row 1: pos=3 (cols 0-3 attended)
+  // ============================================================
+  Tensor pos = tf_int16.make({1}, {2});
+
+  // ============================================================
+  // Quantization parameters
+  // ============================================================
+  const double in_scale = 0.1;
+  const int64_t in_zero_point = 0;
+  const double out_scale = 1.0 / 127.0;
+  const int64_t out_zero_point = 0;
+
+  // ============================================================
+  // Expected output tensor
+  // Row 0: pos=2, cols 0-2 attended
+  //   Input dequantized values: [1.0, 2.0, 3.0, ...]
+  //   softmax([1.0, 2.0, 3.0]) = [0.090, 0.245, 0.665]
+  //   Quantized at out_scale = 1/127: [11, 31, 84]
+  // Row 1: pos=3, cols 0-3 attended
+  //   softmax([1.0, 2.0, 3.0, 4.0]) = [0.032, 0.087, 0.237, 0.644]
+  //   Quantized at out_scale = 1/127: [4, 11, 30, 82]
+  // Masked positions get out_zero_point = 0
+  // ============================================================
+  // clang-format off
+  Tensor expected = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          // Row 0: cols 0-2 attended
+          11, 31, 84, 0, 0, 0, 0, 0,
+          // Row 1: cols 0-3 attended
+          4, 11, 30, 82, 0, 0, 0, 0
+      });
+  // clang-format on
+
+  // ============================================================
+  // Execute softmax
+  // ============================================================
+  quantized_softmax_per_tensor_out(
+      input,
+      mask,
+      -1,
+      1, // mask_type = 1
+      pos,
+      in_scale,
+      in_zero_point,
+      out_scale,
+      out_zero_point,
+      output);
+
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// ============================================================
+// Test softmax with non-zero output zero_point
+// ============================================================
+TEST_F(GenericQuantizedSoftmaxTest, SoftmaxWithNonZeroOutputZeroPoint) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Long> tf_int64;
+
+  // ============================================================
+  // Tensor dimensions
+  // ============================================================
+  constexpr int kNumRows = 2;
+  constexpr int kNumCols = 8;
+
+  // ============================================================
+  // Input tensor
+  // ============================================================
+  // clang-format off
+  Tensor input = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          // Row 0
+          10, 20, 30, 40, 50, 60, 70, 80,
+          // Row 1
+          10, 20, 30, 40, 50, 60, 70, 80
+      });
+  // clang-format on
+
+  // ============================================================
+  // Output tensor: initialized to zeros
+  // ============================================================
+  Tensor output = tf_int8.zeros({kNumRows, kNumCols});
+
+  // ============================================================
+  // Mask and position tensors
+  // Base pos = -2, so with the per-row increment row 0 sees
+  // pos=-2 and row 1 sees pos=-1: every position in both rows
+  // is masked. (A base of -1 would leave row 1 attending col 0.)
+  // ============================================================
+  Tensor mask = tf_int8.make({1}, {0});
+  Tensor pos = tf_int64.make({1}, {-2}); // All rows fully masked
+
+  // ============================================================
+  // Quantization parameters with non-zero output zero_point
+  // ============================================================
+  const double in_scale = 0.1;
+  const int64_t in_zero_point = 0;
+  const double out_scale = 1.0 / 127.0;
+  const int64_t out_zero_point = 64; // Non-zero zero_point
+
+  // ============================================================
+  // Expected output tensor
+  // All positions masked -> all outputs should be out_zero_point = 64
+  // ============================================================
+  // clang-format off
+  Tensor expected = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          // Row 0: all masked
+          64, 64, 64, 64, 64, 64, 64, 64,
+          // Row 1: all masked
+          64, 64, 64, 64, 64, 64, 64, 64
+      });
+  // clang-format on
+
+  // ============================================================
+  // Execute softmax
+  // ============================================================
+  quantized_softmax_per_tensor_out(
+      input,
+      mask,
+      -1,
+      1, // mask_type = 1
+      pos,
+      in_scale,
+      in_zero_point,
+      out_scale,
+      out_zero_point,
+      output);
+
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// ============================================================
+// Test full causal masking pattern (simulates transformer attention)
+// ============================================================
+TEST_F(GenericQuantizedSoftmaxTest, CausalMaskingSimulationInt8) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Long> tf_int64;
+
+  // ============================================================
+  // Tensor dimensions - 8x8 simulates 8 token positions
+  // ============================================================
+  constexpr int kNumRows = 8;
+  constexpr int kNumCols = 8;
+
+  // ============================================================
+  // Input tensor: uniform values (no preference for any position)
+  // All values are the same to make the expected output predictable
+  // ============================================================
+  // clang-format off
+  Tensor input = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50,
+          50, 50, 50, 50, 50, 50, 50, 50
+      });
+  // clang-format on
+
+  // ============================================================
+  // Output tensor: initialized to zeros
+  // ============================================================
+  Tensor output = tf_int8.zeros({kNumRows, kNumCols});
+
+  // ============================================================
+  // Mask tensor: unused
+  // ============================================================
+  Tensor mask = tf_int8.make({1}, {0});
+
+  // ============================================================
+  // Position tensor: 0 means first token
+  // Row 0: pos=0, col 0 attended
+  // Row 1: pos=1, cols 0-1 attended
+  // ...
+  // Row 7: pos=7, cols 0-7 attended
+  // ============================================================
+  Tensor pos = tf_int64.make({1}, {0});
+
+  // ============================================================
+  // Quantization parameters
+  // ============================================================
+  const double in_scale = 0.1;
+  const int64_t in_zero_point = 0;
+  const double out_scale = 1.0 / 127.0;
+  const int64_t out_zero_point = 0;
+
+  // ============================================================
+  // Expected output tensor
+  // With uniform inputs, softmax distributes probability evenly
+  // among the attended positions:
+  // Row 0: 1 attended -> [127, 0, 0, 0, 0, 0, 0, 0]
+  // Row 1: 2 attended -> [64, 64, 0, 0, 0, 0, 0, 0]
+  // Row 2: 3 attended -> [42, 42, 42, 0, 0, 0, 0, 0]
+  // Row 3: 4 attended -> [32, 32, 32, 32, 0, 0, 0, 0]
+  // Row 4: 5 attended -> [25, 25, 25, 25, 25, 0, 0, 0]
+  // Row 5: 6 attended -> [21, 21, 21, 21, 21, 21, 0, 0]
+  // Row 6: 7 attended -> [18, 18, 18, 18, 18, 18, 18, 0]
+  // Row 7: 8 attended -> [16, 16, 16, 16, 16, 16, 16, 16]
+  // Note: these are exact quantized values, round(127 / n)
+  // ============================================================
+  // clang-format off
+  Tensor expected = tf_int8.make(
+      {kNumRows, kNumCols},
+      {
+          // Row 0: 1 attended (127/1 = 127)
+          127, 0, 0, 0, 0, 0, 0, 0,
+          // Row 1: 2 attended (127/2 = 63.5, rounds to 64)
+          64, 64, 0, 0, 0, 0, 0, 0,
+          // Row 2: 3 attended (127/3 ≈ 42)
+          42, 42, 42, 0, 0, 0, 0, 0,
+          // Row 3: 4 attended (127/4 ≈ 32)
+          32, 32, 32, 32, 0, 0, 0, 0,
+          // Row 4: 5 attended (127/5 ≈ 25)
+          25, 25, 25, 25, 25, 0, 0, 0,
+          // Row 5: 6 attended (127/6 ≈ 21)
+          21, 21, 21, 21, 21, 21, 0, 0,
+          // Row 6: 7 attended (127/7 ≈ 18)
+          18, 18, 18, 18, 18, 18, 18, 0,
+          // Row 7: 8 attended (127/8 ≈ 16)
+          16, 16, 16, 16, 16, 16, 16, 16
+      });
+  // clang-format on
+
+  // ============================================================
+  // Execute softmax
+  // ============================================================
+  quantized_softmax_per_tensor_out(
+      input,
+      mask,
+      -1,
+      1, // mask_type = 1
+      pos,
+      in_scale,
+      in_zero_point,
+      out_scale,
+      out_zero_point,
+      output);
+
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+} // namespace
+} // namespace native
+} // namespace generic
+} // namespace impl