
Commit e39fd8e

[Serve.llm] Mitigate the serve.llm streaming overhead by properly batching stream chunks (#52766)
Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
Signed-off-by: kouroshhakha <[email protected]>
1 parent 0617445 commit e39fd8e

File tree

8 files changed (+995, -568 lines)

python/ray/llm/_internal/serve/deployments/llm/llm_server.py

Lines changed: 293 additions & 254 deletions
Large diffs are not rendered by default.
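The llm_server.py diff is collapsed in this view, so the exact server-side wiring is not shown. As a rough, hypothetical sketch of the direction this commit describes (batching moved out of the engine and up into the server layer), the server's streaming path could wrap its OpenAI chunk generator in the new batcher, honoring the stream_batching_interval_ms experimental config and the MODEL_RESPONSE_BATCH_TIMEOUT_MS default that the removed vllm_engine.py code used to read. The function name below is made up for illustration; only the import paths and the config key come from the diff.

# Hypothetical sketch, not the actual llm_server.py change (that diff is not rendered here).
from ray.llm._internal.serve.configs.constants import MODEL_RESPONSE_BATCH_TIMEOUT_MS
from ray.llm._internal.serve.deployments.utils.batcher import OpenAIResponseBatcher


async def batch_openai_stream(openai_chunk_gen, experimental_configs: dict):
    """Flush OpenAI stream chunks in interval-sized batches instead of one at a time."""
    interval_ms = experimental_configs.get(
        "stream_batching_interval_ms", MODEL_RESPONSE_BATCH_TIMEOUT_MS
    )
    batcher = OpenAIResponseBatcher(openai_chunk_gen, interval_ms=interval_ms)
    # Each item yielded by stream() is a list of chunks collected during one
    # interval; the router serializes the whole list in a single SSE write.
    async for chunk_batch in batcher.stream():
        yield chunk_batch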

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 0 additions & 25 deletions
@@ -49,12 +49,10 @@
 from ray.llm._internal.serve.configs.constants import (
     RAYLLM_ENABLE_REQUEST_PROMPT_LOGS,
     RAYLLM_GUIDED_DECODING_BACKEND,
-    MODEL_RESPONSE_BATCH_TIMEOUT_MS,
     MIN_NUM_TOPLOGPROBS_ALLOWED,
     MAX_NUM_TOPLOGPROBS_ALLOWED,
 )
 from ray.llm._internal.utils import try_import
-from ray.llm._internal.serve.deployments.utils.batcher import LLMRawResponsesBatcher
 
 from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine
 
@@ -519,30 +517,7 @@ async def prepare_request(
         vllm_request = VLLMGenerationRequest(**request_params)
         return vllm_request
 
-    def _get_batch_interval_ms(self, stream: bool = True) -> int:
-        """Calculate the batching interval for responses."""
-        stream_batching_interval_ms = self.llm_config.experimental_configs.get(
-            "stream_batching_interval_ms"
-        )
-        if stream_batching_interval_ms is None:
-            stream_batching_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS
-        return stream_batching_interval_ms if stream else None
-
     async def generate(
-        self,
-        request: GenerationRequest,
-    ) -> AsyncGenerator[LLMRawResponse, None]:
-        # TODO (genesu): Responses batching logics should be common to all
-        # engines and belongs to the LLMServer level instead of the engine
-        # level here. Refactor the entire batching logics up.
-        response_stream = LLMRawResponsesBatcher(
-            self._generate(request),
-            interval_ms=self._get_batch_interval_ms(request.stream),
-        )
-        async for response in response_stream.stream():
-            yield response
-
-    async def _generate(
         self, request: GenerationRequest
     ) -> AsyncGenerator[LLMRawResponse, None]:
         """Generate an LLMRawResponse stream

python/ray/llm/_internal/serve/deployments/routers/router.py

Lines changed: 46 additions & 65 deletions
@@ -10,11 +10,9 @@
     Dict,
     List,
     Optional,
-    Tuple,
     Union,
 )
 
-
 from fastapi import FastAPI, HTTPException, status
 from fastapi.middleware.cors import CORSMiddleware
 from ray import serve
@@ -125,8 +123,11 @@ def _apply_openai_json_format(
     The converted strings are concatenated and returned:
         data: <response-json1>\n\ndata: <response-json2>\n\n...
     """
-
-    return "".join(f"data: {response.model_dump_json()}\n\n")
+    if isinstance(response, list):
+        return "".join(f"data: {r.model_dump_json()}\n\n" for r in response)
+    if hasattr(response, "model_dump_json"):
+        return f"data: {response.model_dump_json()}\n\n"
+    raise ValueError(f"Unexpected response type: {type(response)}")
 
 
 async def _openai_json_wrapper(
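Illustration (not part of the diff): with the list branch above, a batch of stream chunks collapses into one string carrying several data: frames, so the router performs a single write per batch rather than one per chunk. Chunk below is a stand-in pydantic model for the real stream-response types, and the helper mirrors the updated router logic under that assumption.

# Stand-in sketch; Chunk mimics the pydantic stream-response models used by the router.
from pydantic import BaseModel


class Chunk(BaseModel):
    id: str
    delta: str


def apply_openai_json_format(response):
    # Same shape as the updated helper: a list becomes several concatenated SSE
    # frames in one string, a single model becomes one frame.
    if isinstance(response, list):
        return "".join(f"data: {r.model_dump_json()}\n\n" for r in response)
    return f"data: {response.model_dump_json()}\n\n"


batch = [Chunk(id="c1", delta="Hel"), Chunk(id="c1", delta="lo")]
print(apply_openai_json_format(batch))
# Roughly:
# data: {"id":"c1","delta":"Hel"}
#
# data: {"id":"c1","delta":"lo"}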
@@ -147,29 +148,16 @@ async def _openai_json_wrapper(
     Yields:
         Concatenated JSON strings that represent CompletionStreamResponse.
     """
-    yield _apply_openai_json_format(first_response)
+    packet = _apply_openai_json_format(first_response)
+    yield packet
 
     async for response in generator:
-        yield _apply_openai_json_format(response)
+        packet = _apply_openai_json_format(response)
+        yield packet
 
     yield "data: [DONE]\n\n"
 
 
-async def _peek_at_openai_json_generator(
-    generator: Union[LLMChatResponse, LLMCompletionsResponse],
-) -> Tuple[
-    Union[ChatCompletionStreamResponse, CompletionStreamResponse, ErrorResponse],
-    AsyncGenerator[str, None],
-]:
-    """Runs one iteration of the underlying generator
-    and returns the result, alongside the generator itself (with the
-    first iteration still there).
-    """
-    first_response = await generator.__anext__()
-
-    return first_response, _openai_json_wrapper(generator, first_response)
-
-
 class LLMRouter:
     def __init__(
         self,
@@ -347,6 +335,41 @@ async def model_data(self, model: str) -> ModelData:
         )
         return model_data
 
+    async def _process_llm_request(
+        self, body: Union[CompletionRequest, ChatCompletionRequest], is_chat: bool
+    ) -> Response:
+        NoneStreamingResponseType = (
+            ChatCompletionResponse if is_chat else CompletionResponse
+        )
+        call_method = "chat" if is_chat else "completions"
+
+        async with timeout(RAYLLM_ROUTER_HTTP_TIMEOUT):
+
+            gen = self._get_response(body=body, call_method=call_method)
+
+            first_response = await gen.__anext__()
+
+            # In case of streaming the first response can be batched.
+            if body.stream and isinstance(first_response, list):
+                first_response = first_response[0]
+
+            if isinstance(first_response, ErrorResponse):
+                raise OpenAIHTTPException(
+                    message=first_response.message,
+                    status_code=first_response.code,
+                    type=first_response.type,
+                )
+
+            if isinstance(first_response, NoneStreamingResponseType):
+                # Not streaming
+                return JSONResponse(content=first_response.model_dump())
+
+            openai_stream_generator = _openai_json_wrapper(gen, first_response)
+
+            return StreamingResponse(
+                openai_stream_generator, media_type="text/event-stream"
+            )
+
     @fastapi_router_app.post("/v1/completions")
     async def completions(self, body: CompletionRequest) -> Response:
         """Given a prompt, the model will return one or more predicted completions,
@@ -355,28 +378,7 @@ async def completions(self, body: CompletionRequest) -> Response:
         Returns:
             A response object with completions.
         """
-        async with timeout(RAYLLM_ROUTER_HTTP_TIMEOUT):
-            results = self._get_response(body=body, call_method="completions")
-            if body.stream:
-                first_response, wrapper = await _peek_at_openai_json_generator(results)
-                if isinstance(first_response, ErrorResponse):
-                    raise OpenAIHTTPException(
-                        message=first_response.message,
-                        status_code=first_response.code,
-                        type=first_response.type,
-                    )
-                return StreamingResponse(wrapper, media_type="text/event-stream")
-
-            result = await results.__anext__()
-            if isinstance(result, ErrorResponse):
-                raise OpenAIHTTPException(
-                    message=result.message,
-                    status_code=result.code,
-                    type=result.type,
-                )
-
-            if isinstance(result, CompletionResponse):
-                return JSONResponse(content=result.model_dump())
+        return await self._process_llm_request(body, is_chat=False)
 
     @fastapi_router_app.post("/v1/chat/completions")
     async def chat(self, body: ChatCompletionRequest) -> Response:
@@ -387,28 +389,7 @@ async def chat(self, body: ChatCompletionRequest) -> Response:
             A response object with completions.
         """
 
-        async with timeout(RAYLLM_ROUTER_HTTP_TIMEOUT):
-            results = self._get_response(body=body, call_method="chat")
-            if body.stream:
-                first_response, wrapper = await _peek_at_openai_json_generator(results)
-                if isinstance(first_response, ErrorResponse):
-                    raise OpenAIHTTPException(
-                        message=first_response.message,
-                        status_code=first_response.code,
-                        type=first_response.type,
-                    )
-                return StreamingResponse(wrapper, media_type="text/event-stream")
-
-            result = await results.__anext__()
-            if isinstance(result, ErrorResponse):
-                raise OpenAIHTTPException(
-                    message=result.message,
-                    status_code=result.code,
-                    type=result.type,
-                )
-
-            if isinstance(result, ChatCompletionResponse):
-                return JSONResponse(content=result.model_dump())
+        return await self._process_llm_request(body, is_chat=True)
 
     @classmethod
     def as_deployment(
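Both endpoints stay OpenAI-compatible, so the server-side batching is invisible to clients: each SSE frame still decodes to a single chunk. For reference, a minimal streaming client; the base URL and model id below are placeholders, not values from this commit.

# Placeholder base_url / model id; any OpenAI-compatible client works unchanged.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

stream = client.chat.completions.create(
    model="my-served-model",  # placeholder model id
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    # The client still sees one ChatCompletionChunk per SSE frame, even though
    # the server may flush several frames per network write.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")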

python/ray/llm/_internal/serve/deployments/utils/batcher.py

Lines changed: 22 additions & 5 deletions
@@ -1,5 +1,5 @@
 import asyncio
-from typing import AsyncGenerator, Optional
+from typing import AsyncGenerator, Optional, Iterable, List, TypeVar, Generic
 
 
 from ray.llm._internal.serve.observability.logging import get_logger
@@ -15,8 +15,10 @@
 
 logger = get_logger(__name__)
 
+T = TypeVar("T")
 
-class LLMRawResponsesBatcher:
+
+class Batcher(Generic[T]):
     """This class batches multiple LLMRawResponses from a generator into a
     single response, at some time interval.
 
@@ -30,7 +32,7 @@ class LLMRawResponsesBatcher:
 
     def __init__(
         self,
-        generator: AsyncGenerator[LLMRawResponse, None],
+        generator: AsyncGenerator[T, None],
        interval_ms: Optional[float] = MODEL_RESPONSE_BATCH_TIMEOUT_MS,
     ):
         self.generator = generator
@@ -46,7 +48,10 @@ def __init__(
         # We are okay with this task getting cancelled (to propagate cancellations)
         self.read_task = asyncio.create_task(self.read())
 
-    async def stream(self) -> AsyncGenerator[BatchedLLMRawResponse, None]:
+    def _merge_results(self, results: List[T]) -> Iterable[T]:
+        return results
+
+    async def stream(self) -> AsyncGenerator[Iterable[T], None]:
         """Drain from the queue every interval_ms and yield the merged results"""
         try:
             while True:
@@ -67,7 +72,7 @@ async def stream(self) -> AsyncGenerator[BatchedLLMRawResponse, None]:
 
                 # If there are results, merge and yield them
                 if results:
-                    output: BatchedLLMRawResponse = BatchedLLMRawResponse.merge_stream(*results)  # type: ignore
+                    output = self._merge_results(results)
                     yield output
 
                 # If the read task is done, exit the stream task
@@ -101,3 +106,15 @@ def drain_queue(self):
         except asyncio.QueueEmpty:
             pass
         return results
+
+
+class LLMRawResponseBatcher(Batcher):
+    """This class batches multiple LLMRawResponses into a single BatchedLLMRawResponse."""
+
+    def _merge_results(self, results: List[LLMRawResponse]) -> BatchedLLMRawResponse:
+        output: BatchedLLMRawResponse = BatchedLLMRawResponse.merge_stream(*results)  # type: ignore
+        return output
+
+
+class OpenAIResponseBatcher(Batcher):
+    """This class batches multiple OpenAI responses into a single OpenAI response."""

python/ray/llm/_internal/serve/deployments/utils/server_utils.py

Lines changed: 4 additions & 1 deletion
@@ -125,7 +125,10 @@ def get_response_for_error(
 
 def get_serve_request_id() -> str:
     """Get request id from serve request context."""
-    return serve.context._serve_request_context.get().request_id
+    context = serve.context._serve_request_context.get()
+    if context is not None:
+        return context.request_id
+    return ""
 
 
 def get_model_request_id(model: str):
