From 89762067a3f8d1192b792b1310c5264358fadf33 Mon Sep 17 00:00:00 2001 From: yuvrajangadsingh Date: Thu, 19 Mar 2026 17:04:43 +0530 Subject: [PATCH 1/4] feat: support BaseLlm instances as judge_model in evaluation Allow JudgeModelOptions.judge_model to accept Union[str, BaseLlm] instead of only str. This enables custom/self-hosted models (e.g. LiteLlm with custom base_url) to be used as judge models for evaluation without requiring LLMRegistry registration. Follows the same pattern used by LlmAgent.model which already accepts Union[str, BaseLlm]. Fixes #3400 --- src/google/adk/evaluation/eval_metrics.py | 7 +++++-- src/google/adk/evaluation/hallucinations_v1.py | 11 +++++++---- src/google/adk/evaluation/llm_as_judge.py | 12 ++++++++---- .../simulation/llm_backed_user_simulator.py | 10 +++++++--- .../simulation/per_turn_user_simulator_quality_v1.py | 8 +++++--- 5 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index 50c3473c3a..1a6cdec8ee 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -28,6 +28,8 @@ from pydantic.json_schema import SkipJsonSchema from typing_extensions import TypeAlias +from google.adk.models.base_llm import BaseLlm + from .common import EvalBaseModel from .eval_case import Invocation from .eval_rubrics import Rubric @@ -75,10 +77,11 @@ class PrebuiltMetrics(Enum): class JudgeModelOptions(EvalBaseModel): """Options for an eval metric's judge model.""" - judge_model: str = Field( + judge_model: Union[str, BaseLlm] = Field( default="gemini-2.5-flash", description=( - "The judge model to use for evaluation. It can be a model name." + "The judge model to use for evaluation. It can be a model name" + " string or a BaseLlm instance for custom/self-hosted models." ), ) diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py index 0b97c6c54a..4e1837563c 100644 --- a/src/google/adk/evaluation/hallucinations_v1.py +++ b/src/google/adk/evaluation/hallucinations_v1.py @@ -294,17 +294,20 @@ def __init__(self, eval_metric: EvalMetric): self._judge_model = self._setup_auto_rater() self.segmenter_prompt = _HALLUCINATIONS_V1_SEGMENTER_PROMPT self.sentence_validator_prompt = _HALLUCINATIONS_V1_VALIDATOR_PROMPT - self._model = self._judge_model_options.judge_model + judge_model = self._judge_model_options.judge_model + self._model = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model self._model_config = ( self._judge_model_options.judge_model_config or genai_types.GenerateContentConfig() ) def _setup_auto_rater(self) -> BaseLlm: - model_id = self._judge_model_options.judge_model + judge_model = self._judge_model_options.judge_model + if isinstance(judge_model, BaseLlm): + return judge_model llm_registry = LLMRegistry() - llm_class = llm_registry.resolve(model_id) - return llm_class(model=model_id) + llm_class = llm_registry.resolve(judge_model) + return llm_class(model=judge_model) def _create_context_for_step( self, diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py index de832395ab..41c0a8aa43 100644 --- a/src/google/adk/evaluation/llm_as_judge.py +++ b/src/google/adk/evaluation/llm_as_judge.py @@ -136,8 +136,10 @@ async def evaluate_invocations( per_invocation_results = [] for actual, expected in zip(actual_invocations, expected_invocations): auto_rater_prompt = self.format_auto_rater_prompt(actual, expected) + judge_model = self._judge_model_options.judge_model + model_str = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model llm_request = LlmRequest( - model=self._judge_model_options.judge_model, + model=model_str, contents=[ genai_types.Content( parts=[genai_types.Part(text=auto_rater_prompt)], @@ -181,7 +183,9 @@ async def evaluate_invocations( return EvaluationResult() def _setup_auto_rater(self) -> BaseLlm: - model_id = self._judge_model_options.judge_model + judge_model = self._judge_model_options.judge_model + if isinstance(judge_model, BaseLlm): + return judge_model llm_registry = LLMRegistry() - llm_class = llm_registry.resolve(model_id) - return llm_class(model=model_id) + llm_class = llm_registry.resolve(judge_model) + return llm_class(model=judge_model) diff --git a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py index 2f11301730..ea679676d5 100644 --- a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py +++ b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py @@ -24,6 +24,7 @@ from typing_extensions import override from ...events.event import Event +from ...models.base_llm import BaseLlm from ...models.llm_request import LlmRequest from ...models.registry import LLMRegistry from ...utils.context_utils import Aclosing @@ -124,9 +125,12 @@ def __init__( super().__init__(config, config_type=LlmBackedUserSimulator.config_type) self._conversation_scenario = conversation_scenario self._invocation_count = 0 - llm_registry = LLMRegistry() - llm_class = llm_registry.resolve(self._config.model) - self._llm = llm_class(model=self._config.model) + if isinstance(self._config.model, BaseLlm): + self._llm = self._config.model + else: + llm_registry = LLMRegistry() + llm_class = llm_registry.resolve(self._config.model) + self._llm = llm_class(model=self._config.model) self._user_persona = self._conversation_scenario.user_persona @classmethod diff --git a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py index a95eb87d88..038cd394dc 100644 --- a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py +++ b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py @@ -198,10 +198,12 @@ async def evaluate_invocations( return self._aggregate_conversation_results(results) def _setup_llm(self) -> BaseLlm: - model_id = self._llm_options.judge_model + judge_model = self._llm_options.judge_model + if isinstance(judge_model, BaseLlm): + return judge_model llm_registry = LLMRegistry() - llm_class = llm_registry.resolve(model_id) - return llm_class(model=model_id) + llm_class = llm_registry.resolve(judge_model) + return llm_class(model=judge_model) def _format_llm_prompt( self, From 012dde609379892dc0facf666d12d1ca48a9a896 Mon Sep 17 00:00:00 2001 From: yuvrajangadsingh Date: Thu, 19 Mar 2026 18:11:11 +0530 Subject: [PATCH 2/4] fix: address gemini-code-assist review feedback - Update LlmBackedUserSimulatorConfig.model type to Union[str, BaseLlm] - Use isinstance check for model string extraction in LlmRequest instead of accessing .model on potentially mocked objects --- src/google/adk/evaluation/hallucinations_v1.py | 2 +- src/google/adk/evaluation/llm_as_judge.py | 2 +- .../evaluation/simulation/llm_backed_user_simulator.py | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py index 4e1837563c..fd0793fb14 100644 --- a/src/google/adk/evaluation/hallucinations_v1.py +++ b/src/google/adk/evaluation/hallucinations_v1.py @@ -295,7 +295,7 @@ def __init__(self, eval_metric: EvalMetric): self.segmenter_prompt = _HALLUCINATIONS_V1_SEGMENTER_PROMPT self.sentence_validator_prompt = _HALLUCINATIONS_V1_VALIDATOR_PROMPT judge_model = self._judge_model_options.judge_model - self._model = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model + self._model = judge_model if isinstance(judge_model, str) else judge_model.model self._model_config = ( self._judge_model_options.judge_model_config or genai_types.GenerateContentConfig() diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py index 41c0a8aa43..3cac52dd15 100644 --- a/src/google/adk/evaluation/llm_as_judge.py +++ b/src/google/adk/evaluation/llm_as_judge.py @@ -137,7 +137,7 @@ async def evaluate_invocations( for actual, expected in zip(actual_invocations, expected_invocations): auto_rater_prompt = self.format_auto_rater_prompt(actual, expected) judge_model = self._judge_model_options.judge_model - model_str = judge_model.model if isinstance(judge_model, BaseLlm) else judge_model + model_str = judge_model if isinstance(judge_model, str) else judge_model.model llm_request = LlmRequest( model=model_str, contents=[ diff --git a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py index ea679676d5..78836a59d6 100644 --- a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py +++ b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py @@ -17,6 +17,7 @@ import logging from typing import ClassVar from typing import Optional +from typing import Union from google.genai import types as genai_types from pydantic import Field @@ -48,9 +49,12 @@ class LlmBackedUserSimulatorConfig(BaseUserSimulatorConfig): """Contains configurations required by an LLM backed user simulator.""" - model: str = Field( + model: Union[str, BaseLlm] = Field( default="gemini-2.5-flash", - description="The model to use for user simulation.", + description=( + "The model to use for user simulation. It can be a model name" + " string or a BaseLlm instance for custom/self-hosted models." + ), ) model_configuration: genai_types.GenerateContentConfig = Field( From 7c1b9c9af4d3fed88e402f206cd88d1afd92d2f7 Mon Sep 17 00:00:00 2001 From: yuvrajangadsingh Date: Fri, 20 Mar 2026 02:00:29 +0530 Subject: [PATCH 3/4] fix: extract model string for LlmRequest in simulator and per-turn evaluator LlmRequest.model expects str but self._config.model and self._llm_options.judge_model can now be BaseLlm instances. Extract .model string before passing to LlmRequest. --- .../adk/evaluation/simulation/llm_backed_user_simulator.py | 4 +++- .../simulation/per_turn_user_simulator_quality_v1.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py index 78836a59d6..235b7154fc 100644 --- a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py +++ b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py @@ -180,8 +180,10 @@ async def _get_llm_response( user_persona=self._user_persona, ) + config_model = self._config.model + model_str = config_model if isinstance(config_model, str) else config_model.model llm_request = LlmRequest( - model=self._config.model, + model=model_str, config=self._config.model_configuration, contents=[ genai_types.Content( diff --git a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py index 038cd394dc..e93aff695a 100644 --- a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py +++ b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py @@ -327,8 +327,10 @@ async def _evaluate_intermediate_turn( previous_invocations=invocation_history, ) + judge_model = self._llm_options.judge_model + model_str = judge_model if isinstance(judge_model, str) else judge_model.model llm_request = LlmRequest( - model=self._llm_options.judge_model, + model=model_str, contents=[ genai_types.Content( parts=[genai_types.Part(text=auto_rater_prompt)], From 3f346820b23d396b14a48a2f5f65dbc187c81755 Mon Sep 17 00:00:00 2001 From: yuvrajangadsingh Date: Sat, 21 Mar 2026 16:28:04 +0530 Subject: [PATCH 4/4] style: run autoformat.sh (isort + pyink) --- src/google/adk/evaluation/eval_metrics.py | 3 +-- src/google/adk/evaluation/hallucinations_v1.py | 4 +++- src/google/adk/evaluation/llm_as_judge.py | 4 +++- .../adk/evaluation/simulation/llm_backed_user_simulator.py | 4 +++- .../simulation/per_turn_user_simulator_quality_v1.py | 4 +++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index 1a6cdec8ee..2b11925555 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -19,6 +19,7 @@ from typing import Optional from typing import Union +from google.adk.models.base_llm import BaseLlm from google.genai import types as genai_types from pydantic import alias_generators from pydantic import BaseModel @@ -28,8 +29,6 @@ from pydantic.json_schema import SkipJsonSchema from typing_extensions import TypeAlias -from google.adk.models.base_llm import BaseLlm - from .common import EvalBaseModel from .eval_case import Invocation from .eval_rubrics import Rubric diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py index fd0793fb14..1503e79d86 100644 --- a/src/google/adk/evaluation/hallucinations_v1.py +++ b/src/google/adk/evaluation/hallucinations_v1.py @@ -295,7 +295,9 @@ def __init__(self, eval_metric: EvalMetric): self.segmenter_prompt = _HALLUCINATIONS_V1_SEGMENTER_PROMPT self.sentence_validator_prompt = _HALLUCINATIONS_V1_VALIDATOR_PROMPT judge_model = self._judge_model_options.judge_model - self._model = judge_model if isinstance(judge_model, str) else judge_model.model + self._model = ( + judge_model if isinstance(judge_model, str) else judge_model.model + ) self._model_config = ( self._judge_model_options.judge_model_config or genai_types.GenerateContentConfig() diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py index 3cac52dd15..3df003c8c0 100644 --- a/src/google/adk/evaluation/llm_as_judge.py +++ b/src/google/adk/evaluation/llm_as_judge.py @@ -137,7 +137,9 @@ async def evaluate_invocations( for actual, expected in zip(actual_invocations, expected_invocations): auto_rater_prompt = self.format_auto_rater_prompt(actual, expected) judge_model = self._judge_model_options.judge_model - model_str = judge_model if isinstance(judge_model, str) else judge_model.model + model_str = ( + judge_model if isinstance(judge_model, str) else judge_model.model + ) llm_request = LlmRequest( model=model_str, contents=[ diff --git a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py index 235b7154fc..5ae646bead 100644 --- a/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py +++ b/src/google/adk/evaluation/simulation/llm_backed_user_simulator.py @@ -181,7 +181,9 @@ async def _get_llm_response( ) config_model = self._config.model - model_str = config_model if isinstance(config_model, str) else config_model.model + model_str = ( + config_model if isinstance(config_model, str) else config_model.model + ) llm_request = LlmRequest( model=model_str, config=self._config.model_configuration, diff --git a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py index e93aff695a..ba7e7435c9 100644 --- a/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py +++ b/src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py @@ -328,7 +328,9 @@ async def _evaluate_intermediate_turn( ) judge_model = self._llm_options.judge_model - model_str = judge_model if isinstance(judge_model, str) else judge_model.model + model_str = ( + judge_model if isinstance(judge_model, str) else judge_model.model + ) llm_request = LlmRequest( model=model_str, contents=[