From e94d7105287ecc509845932fd03fd601522bf139 Mon Sep 17 00:00:00 2001
From: Mergen Nachin
Date: Fri, 12 Jun 2026 12:57:01 -0700
Subject: [PATCH] llm_runner: add Engine and Session interfaces

The LLM serving path needs a stable contract between generic serving code
and model-specific execution code. TextLLM, Qwen, Gemma, CUDA, and future
backends all differ in how they own weights and mutable state, but the
server should not know those details or grow a new Python binding for
every model.

This introduces the minimal runner-level split needed for that contract.
LLMEngine represents the loaded physical model and its serving capacity;
LLMSession represents one logical conversation state and exposes
reset/prefill/decode-style operations. That shape lets a worker drive
different model implementations through one interface while keeping
KV/recurrent/cache ownership inside C++.

This commit is only the interface and build export. It deliberately does
not add a concrete adapter or change existing runner behavior, so model
migrations and serving can be reviewed as downstream uses of the contract
rather than hidden side effects.
---
 extension/llm/runner/CMakeLists.txt |   7 ++
 extension/llm/runner/llm_session.h  | 163 ++++++++++++++++++++++++++++
 extension/llm/runner/targets.bzl    |  13 +++
 3 files changed, 183 insertions(+)
 create mode 100644 extension/llm/runner/llm_session.h

diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 43b89f0a908..9e50513062b 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -28,6 +28,12 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
 #
 executorch_load_build_variables()
 
+add_library(extension_llm_session INTERFACE)
+target_link_libraries(extension_llm_session INTERFACE executorch_core)
+target_include_directories(
+  extension_llm_session INTERFACE ${_common_include_directories}
+)
+
 # build llm runner library
 list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
@@ -81,6 +87,7 @@ if(EXECUTORCH_BUILD_CUDA)
   endif()
 endif()
 
+install(TARGETS extension_llm_session EXPORT ExecuTorchTargets)
 install(
   TARGETS extension_llm_runner
   EXPORT ExecuTorchTargets
diff --git a/extension/llm/runner/llm_session.h b/extension/llm/runner/llm_session.h
new file mode 100644
index 00000000000..b562cc958e7
--- /dev/null
+++ b/extension/llm/runner/llm_session.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Model-agnostic Engine/Session interfaces. Model-specific execution lives in
+// adapters that implement these (e.g. a TextLLMSession over TextLLMRunner or a
+// Gemma4Session; this header itself ships no adapter). Serving code (HTTP
+// control plane + C++ worker binaries) depends only on these interfaces, never
+// on a concrete runner. This is a lower-level token-step contract than
+// IRunner/GenerationConfig, which remain the higher-level generation API.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace executorch::extension::llm {
+
+/// Per-decode sampling parameters. An adapter applies the fields it supports
+/// and rejects non-default values of the rest rather than silently ignoring
+/// them (today only temperature is plumbed). -1 temperature means model
+/// default.
+struct ET_EXPERIMENTAL SamplingConfig {
+  float temperature = -1.0f;
+  float top_p = 1.0f;
+  int32_t top_k = 0; // 0 = disabled
+  uint64_t seed = 0; // 0 = unset
+};
+
+/// One decoded step: the exact sampled token id (for prefix-cache id tracking
+/// and batching) and its decoded text piece (raw bytes; may be a partial UTF-8
+/// sequence the caller assembles).
+///
+/// `is_eos` is literal: the sampled token is an end-of-sequence token (use it
+/// for the "stop" finish reason, metrics, cache/accounting). `is_terminal` is
+/// the loop signal: generation ended at this step — either EOS or a cooperative
+/// stop() took effect. A decode loop should end when is_terminal is set; every
+/// EOS step is also terminal, but a stop step is terminal without being EOS.
+///
+/// For a cooperative stop step (requested via stop()), no token is forwarded,
+/// position() must not advance, `token_id` must be 0, and `text_piece` must be
+/// empty.
+struct ET_EXPERIMENTAL DecodeResult {
+  uint64_t token_id = 0;
+  std::string text_piece;
+  bool is_eos = false;
+  bool is_terminal = false;
+};
+
+/// How many physical sessions an engine can host, so the server admits logical
+/// requests without silently multiplying model memory. This is a *serving
+/// capacity* concern (engine-level), distinct from how a session advances a
+/// conversation (LLMSession) — keep backend memory flags off LLMSession.
+struct ET_EXPERIMENTAL LLMServingCapacity {
+  // Physical sessions creatable without duplicating packed weights.
+  // Conservatively 1 (some backends repack weights per runtime, so extra
+  // sessions would copy the whole model); raise only on a backend proven to
+  // share packed weights.
+  int32_t max_physical_sessions_without_weight_duplication = 1;
+  // Planned bytes one session adds (KV + activations). Reported for a FUTURE
+  // memory-budget admission policy; NOT yet enforced -- admission is currently
+  // by session COUNT only (--max-sessions). Over-provisioning therefore fails
+  // at the first execute (cudaMalloc) of the over-committed session, not at
+  // admit time. 0 = unknown.
+  int64_t estimated_bytes_per_session = 0;
+};
+
+/// One conversation's mutable state (KV cache, position cursor). Created by an
+/// LLMEngine; conversation/cache-scoped (kept warm across requests for prefix
+/// reuse), not request-scoped.
+class ET_EXPERIMENTAL LLMSession {
+ public:
+  virtual ~LLMSession() = default;
+
+  /// Prefill pre-tokenized input at the current position (call seek() first for
+  /// prefix reuse). Must be non-empty and fit the context window.
+  ///
+  /// `initial_sampling` (optional): the sampling config for the FIRST generated
+  /// token, for backends that sample during prefill (e.g. in-graph sampling).
+  /// Pass it so the first token uses the request's sampling instead of a stale
+  /// default. Backends that only sample in decode_one() ignore it. NOTE:
+  /// because the first token is sampled here, it does NOT pass through
+  /// decode_one()'s logit processors -- a grammar/tool mask that must constrain
+  /// the opening token is not applied to it (a known limitation for
+  /// grammar-constrained serving).
+  ///
+  /// ERROR CONTRACT: an error may be returned AFTER backend state has already
+  /// mutated. On any error from prefill_tokens()/decode_one(), the session is
+  /// POISONED -- position() may no longer agree with the resident KV. The
+  /// caller must call reset() (and only proceed once it returns Ok) before any
+  /// further prefill/decode; it must NOT retry the failed call. The serving
+  /// worker enforces this (marks the session dirty and forces a reset next
+  /// request).
+  virtual ::executorch::runtime::Error prefill_tokens(
+      std::vector tokens,
+      const SamplingConfig* initial_sampling = nullptr) = 0;
+
+  /// Decode one token from the pending state; looping reproduces a full
+  /// generation while returning exact sampled token ids. A normal decode_one()
+  /// runs one forward pass and is not interruptible mid-call. If stop() is
+  /// pending, decode_one() instead returns the synthetic terminal stop result
+  /// documented on DecodeResult without forwarding a token.
+  /// On error the session is poisoned -- see the error contract on
+  /// prefill_tokens() (reset() before any further use; never retry).
+  virtual ::executorch::runtime::Result decode_one(
+      const SamplingConfig& sampling) = 0;
+
+  /// Rewind the KV cache to `pos` (prefix reuse). Valid for full-KV models.
+  /// Returns InvalidArgument if `pos` is outside [0, position()]. Returns
+  /// NotSupported for models whose state cannot be safely rewound (for example,
+  /// non-KV-cache, sliding-window, or recurrent-state models); callers should
+  /// fall back to reset() + full prefill.
+  virtual ::executorch::runtime::Error seek(int64_t pos) = 0;
+
+  /// Number of tokens with resident KV (upper bound for seek()).
+  virtual int64_t position() const = 0;
+
+  /// Clear the KV cache / position for a fresh conversation. Also clears a
+  /// pending stop(); required before reuse after a failed prefill/decode.
+  virtual ::executorch::runtime::Error reset() = 0;
+
+  /// Request that a decode_one() loop stop. This is a TOKEN-BOUNDARY,
+  /// cooperative stop: safe to call from another thread, but it does not abort
+  /// a decode_one() already running. It takes effect at the next decode_one(),
+  /// which returns a terminal step (is_terminal set, is_eos false) without
+  /// forwarding a token: token_id is 0, text_piece is empty, and position()
+  /// does not advance. Cleared by the next prefill_tokens() or reset().
+  virtual void stop() = 0;
+};
+
+/// Holds the immutable model resources (program, tokenizer, metadata) once and
+/// creates sessions that reuse them while isolating their own KV state. How
+/// many sessions can be created without duplicating packed weights is backend-
+/// dependent — see serving_capacity().
+class ET_EXPERIMENTAL LLMEngine {
+ public:
+  virtual ~LLMEngine() = default;
+
+  /// Build a new session with its own KV cache that reuses this engine's
+  /// program/resources when the backend supports it. serving_capacity() is the
+  /// authority on how many sessions are safe without weight duplication.
+  virtual ::executorch::runtime::Result>
+  create_session() = 0;
+
+  /// Physical sessions this engine can host without duplicating weights (plus
+  /// an optional per-session memory estimate); the server clamps to this.
+  virtual LLMServingCapacity serving_capacity() const = 0;
+
+  /// Engine-held model metadata. NOTE(review): keys/lifetime -- confirm.
+  virtual const std::unordered_map& metadata() const = 0;
+};
+
+} // namespace executorch::extension::llm
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index d3e12266adc..9af2597b4f2 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -17,6 +17,18 @@ def define_common_targets():
         visibility = ["PUBLIC"],
     )
 
+    runtime.cxx_library(
+        name = "llm_session",
+        exported_headers = [
+            "llm_session.h",
+        ],
+        visibility = ["PUBLIC"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
+
     for aten in get_aten_mode_options():
         aten_suffix = "_aten" if aten else ""
 
@@ -128,6 +140,7 @@ def define_common_targets():
             exported_deps = [
                 ":image_prefiller" + aten_suffix,
                 ":irunner",
+                ":llm_session",
                 ":multimodal_runner_lib" + aten_suffix,
                 ":text_decoder_runner" + aten_suffix,
                 ":text_prefiller" + aten_suffix,