From e94d7105287ecc509845932fd03fd601522bf139 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Fri, 12 Jun 2026 12:57:01 -0700
Subject: [PATCH] llm_runner: add Engine and Session interfaces

The LLM serving path needs a stable contract between generic serving code and model-specific execution code. TextLLM, Qwen, Gemma, CUDA, and future backends all differ in how they own weights and mutable state, but the server should not know those details or grow a new Python binding for every model.

This introduces the minimal runner-level split needed for that contract. LLMEngine represents the loaded physical model and its serving capacity; LLMSession represents one logical conversation state and exposes reset/prefill/decode-style operations. That shape lets a worker drive different model implementations through one interface while keeping KV/recurrent/cache ownership inside C++.

This commit is only the interface and build export. It deliberately does not add a concrete adapter or change existing runner behavior, so model migrations and serving can be reviewed as downstream uses of the contract rather than hidden side effects.
---
 extension/llm/runner/CMakeLists.txt |   7 ++
 extension/llm/runner/llm_session.h  | 163 ++++++++++++++++++++++++++++
 extension/llm/runner/targets.bzl    |  13 +++
 3 files changed, 183 insertions(+)
 create mode 100644 extension/llm/runner/llm_session.h
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 43b89f0a908..9e50513062b 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -28,6 +28,12 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
 #
 executorch_load_build_variables()
 
+add_library(extension_llm_session INTERFACE) # header-only target
+target_link_libraries(extension_llm_session INTERFACE executorch_core)
+target_include_directories(
+  extension_llm_session INTERFACE ${_common_include_directories}
+)
+
 # build llm runner library
 list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
@@ -81,6 +87,7 @@ if(EXECUTORCH_BUILD_CUDA)
   endif()
 endif()
 
+install(TARGETS extension_llm_session EXPORT ExecuTorchTargets)
 install(
   TARGETS extension_llm_runner
   EXPORT ExecuTorchTargets
diff --git a/extension/llm/runner/llm_session.h b/extension/llm/runner/llm_session.h
new file mode 100644
index 00000000000..b562cc958e7
--- /dev/null
+++ b/extension/llm/runner/llm_session.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Model-agnostic Engine/Session interfaces. Model-specific execution lives in
+// adapters that implement these (e.g. TextLLMSession over TextLLMRunner or
+// Gemma4Session, added in follow-up changes); the serving code (HTTP control
+// plane + C++ worker binaries) depends only on these interfaces, never on a
+// concrete runner. This is a lower-level token-step contract than
+// IRunner/GenerationConfig, which remain the higher-level generation API.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/compiler.h>
+
+namespace executorch::extension::llm {
+
+/// Per-decode sampling parameters. An adapter applies the fields it supports
+/// and rejects non-default values of the rest rather than silently ignoring
+/// them (today only temperature is plumbed). -1 temperature means model
+/// default.
+struct ET_EXPERIMENTAL SamplingConfig {
+  float temperature = -1.0f; // -1 = use the model's default
+  float top_p = 1.0f; // 1.0 presumably = disabled; TODO confirm
+  int32_t top_k = 0; // 0 = disabled
+  uint64_t seed = 0; // 0 = unset
+};
+
+/// One decoded step: the exact sampled token id (for prefix-cache id tracking
+/// and batching) and its decoded text piece (raw bytes; may be a partial UTF-8
+/// sequence the caller assembles).
+///
+/// `is_eos` is literal: the sampled token is an end-of-sequence token (use it
+/// for the "stop" finish reason, metrics, cache/accounting). `is_terminal` is
+/// the loop signal: generation ended at this step — either EOS or a cooperative
+/// stop() took effect. A decode loop should end when is_terminal is set; every
+/// EOS step is also terminal, but a stop step is terminal without being EOS.
+///
+/// For a cooperative stop step (requested via stop()), no token is forwarded,
+/// position() must not advance, `token_id` must be 0, and `text_piece` must be
+/// empty.
+struct ET_EXPERIMENTAL DecodeResult {
+  uint64_t token_id = 0; // exact sampled id; 0 on a cooperative stop step
+  std::string text_piece; // raw bytes, possibly a partial UTF-8 sequence
+  bool is_eos = false; // sampled token is an end-of-sequence token
+  bool is_terminal = false; // generation ended at this step (EOS or stop())
+};
+
+/// How many physical sessions an engine can host, so the server admits logical
+/// requests without silently multiplying model memory. This is a *serving
+/// capacity* concern (engine-level), distinct from how a session advances a
+/// conversation (LLMSession) — keep backend memory flags off LLMSession.
+struct ET_EXPERIMENTAL LLMServingCapacity {
+  /// Physical sessions creatable without duplicating packed weights.
+  /// Conservatively 1 (some backends repack weights per runtime, so extra
+  /// sessions would copy the whole model); raise only on a backend proven to
+  /// share packed weights.
+  int32_t max_physical_sessions_without_weight_duplication = 1;
+  /// Planned bytes one session adds (KV + activations); 0 = unknown. Reported
+  /// for a FUTURE memory-budget admission policy and NOT yet enforced:
+  /// admission is currently by session COUNT only (--max-sessions), so
+  /// over-provisioning fails at the first execute (e.g. cudaMalloc) of the
+  /// over-committed session, not at admit time.
+  int64_t estimated_bytes_per_session = 0;
+};
+
+/// One conversation's mutable state (KV cache, position cursor). Created by an
+/// LLMEngine; conversation/cache-scoped (kept warm across requests for prefix
+/// reuse), not request-scoped.
+class ET_EXPERIMENTAL LLMSession {
+ public:
+  virtual ~LLMSession() = default;
+
+  /// Prefill pre-tokenized input at the current position (call seek() first for
+  /// prefix reuse). `tokens` must be non-empty and fit the context window.
+  ///
+  /// `initial_sampling` (optional): the sampling config for the FIRST generated
+  /// token, for backends that sample during prefill (e.g. in-graph sampling).
+  /// Pass it so the first token uses the request's sampling instead of a stale
+  /// default. Backends that only sample in decode_one() ignore it. NOTE: on a
+  /// backend that samples the first token here, that token does NOT pass
+  /// through decode_one()'s logit processors -- a grammar/tool mask that must
+  /// constrain the opening token is not applied to it (a known limitation for
+  /// grammar-constrained serving).
+  ///
+  /// ERROR CONTRACT: an error may be returned AFTER backend state has already
+  /// mutated. On any error from prefill_tokens()/decode_one(), the session is
+  /// POISONED -- position() may no longer agree with the resident KV. The
+  /// caller must call reset() (and only proceed once it returns Ok) before any
+  /// further prefill/decode; it must NOT retry the failed call. The serving
+  /// worker enforces this (marks the session dirty and forces a reset next
+  /// request).
+  virtual ::executorch::runtime::Error prefill_tokens(
+      std::vector<uint64_t> tokens,
+      const SamplingConfig* initial_sampling = nullptr) = 0;
+
+  /// Decode one token from the pending state; looping reproduces a full
+  /// generation while returning exact sampled token ids. A normal decode_one()
+  /// runs one forward pass and is not interruptible mid-call. If stop() is
+  /// pending, decode_one() instead returns the synthetic terminal stop result
+  /// documented on DecodeResult without forwarding a token.
+  /// On error the session is poisoned -- see the error contract on
+  /// prefill_tokens() (reset() before any further use; never retry).
+  virtual ::executorch::runtime::Result<DecodeResult> decode_one(
+      const SamplingConfig& sampling) = 0;
+
+  /// Rewind the KV cache to `pos` (prefix reuse). Valid for full-KV models.
+  /// Returns InvalidArgument if `pos` is outside [0, position()]. Returns
+  /// NotSupported for models whose state cannot be safely rewound (for example,
+  /// non-KV-cache, sliding-window, or recurrent-state models); callers should
+  /// fall back to reset() + full prefill.
+  virtual ::executorch::runtime::Error seek(int64_t pos) = 0;
+
+  /// Number of tokens with resident KV (upper bound for seek()).
+  virtual int64_t position() const = 0;
+
+  /// Clear KV cache/position for a fresh conversation; clears pending stop().
+  virtual ::executorch::runtime::Error reset() = 0;
+
+  /// Request that a decode_one() loop stop. This is a TOKEN-BOUNDARY,
+  /// cooperative stop: it is safe to call from another thread, but it does not
+  /// abort a decode_one() that is already running. It takes effect at the next
+  /// decode_one(), which then returns a terminal step (is_terminal set, is_eos
+  /// false) without forwarding a new token. For that synthetic step, token_id
+  /// is 0, text_piece is empty, and position() does not advance. The stop is
+  /// cleared by the next prefill_tokens() or reset().
+  virtual void stop() = 0;
+};
+
+/// Holds the immutable model resources (program, tokenizer, metadata) once and
+/// creates sessions that reuse them while isolating their own KV state. How
+/// many sessions avoid duplicating packed weights is backend-dependent — see
+/// serving_capacity(). metadata() exposes the metadata as name -> int64 pairs.
+class ET_EXPERIMENTAL LLMEngine {
+ public:
+  virtual ~LLMEngine() = default;
+
+  /// Build a new session that reuses this engine's program/resources when the
+  /// backend supports it, with its own KV cache. serving_capacity() is the
+  /// authority on how many physical sessions are safe without weight
+  /// duplication.
+  virtual ::executorch::runtime::Result<std::unique_ptr<LLMSession>>
+  create_session() = 0;
+
+  /// How many physical sessions this engine can host without duplicating
+  /// weights (+ optional per-session memory estimate); the server clamps the
+  /// number of physical sessions it creates to this.
+  virtual LLMServingCapacity serving_capacity() const = 0;
+  virtual const std::unordered_map<std::string, int64_t>& metadata() const = 0;
+};
+
+} // namespace executorch::extension::llm
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index d3e12266adc..9af2597b4f2 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -17,6 +17,18 @@ def define_common_targets():
         visibility = ["PUBLIC"],
     )
 
+    runtime.cxx_library(
+        name = "llm_session",  # header-only Engine/Session interfaces
+        exported_headers = [
+            "llm_session.h",
+        ],
+        visibility = ["PUBLIC"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
+
     for aten in get_aten_mode_options():
         aten_suffix = "_aten" if aten else ""
 
@@ -128,6 +140,7 @@ def define_common_targets():
             exported_deps = [
                 ":image_prefiller" + aten_suffix,
                 ":irunner",
+                ":llm_session",
                 ":multimodal_runner_lib" + aten_suffix,
                 ":text_decoder_runner" + aten_suffix,
                 ":text_prefiller" + aten_suffix,