
Commit cef2e60

Re-introduce column-mapper arguments

Signed-off-by: Samuel Monson <[email protected]>
1 parent 9bdf00f commit cef2e60

File tree: 4 files changed, +42 -9 lines changed

src/guidellm/__main__.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -218,6 +218,12 @@ def benchmark():
         "and dynamically generates more."
     ),
 )
+@click.option(
+    "--data-column-mapper",
+    default=BenchmarkGenerativeTextArgs.get_default("data_column_mapper"),
+    callback=cli_tools.parse_json,
+    help="JSON string of column mappings to apply to the dataset.",
+)
 @click.option(
     "--data-preprocessors",
     default=BenchmarkGenerativeTextArgs.get_default("data_preprocessors"),
```
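
The new flag accepts a JSON object on the command line, and `callback=cli_tools.parse_json` turns it into a Python dict before it reaches `BenchmarkGenerativeTextArgs`. A minimal sketch of that conversion, assuming `cli_tools.parse_json` behaves like `json.loads` for non-empty option values (the column name below is purely illustrative, not guidellm's schema):

```python
import json

# Hypothetical invocation (illustrative column key):
#   guidellm benchmark --data-column-mapper '{"text_column": "question"}'
raw_option_value = '{"text_column": "question"}'

# Assumption: cli_tools.parse_json is a thin wrapper around json.loads for option values.
data_column_mapper = json.loads(raw_option_value)
print(data_column_mapper)  # {'text_column': 'question'}
```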

src/guidellm/benchmark/entrypoints.py

Lines changed: 14 additions & 1 deletion

```diff
@@ -19,6 +19,7 @@
 from transformers import PreTrainedTokenizerBase
 from typing_extensions import TypeAliasType

+from guidellm import settings
 from guidellm.backends import Backend, BackendType
 from guidellm.benchmark.benchmarker import Benchmarker
 from guidellm.benchmark.outputs import (
@@ -236,6 +237,11 @@ async def resolve_request_loader(
     data_samples: int,
     processor: ProcessorInputT | None,
     processor_args: dict[str, Any] | None,
+    data_column_mapper: (
+        DatasetPreprocessor
+        | dict[str, str | list[str]]
+        | Literal["generative_column_mapper"]
+    ),
     data_preprocessors: list[DatasetPreprocessor | dict[str, str | list[str]] | str],
     data_finalizer: (DatasetFinalizer | dict[str, Any] | str),
     data_collator: Callable | Literal["generative"] | None,
@@ -279,13 +285,20 @@ async def resolve_request_loader(
         else None
     )

+    # If no type is specified for the data column mapper, load default
+    if isinstance(data_column_mapper, dict) and "type" not in data_column_mapper:
+        data_column_mapper = {
+            "type": settings.dataset.default_column_mapper,
+            **data_column_mapper,
+        }
+
     preprocessors_list: list[DatasetPreprocessor] = [
         resolve_item_from_registry(
             DatasetPreprocessor,  # type: ignore [type-abstract]
             PreprocessorRegistry,
             preprocessor,
         )
-        for preprocessor in data_preprocessors
+        for preprocessor in ([data_column_mapper] + data_preprocessors)
     ]

     finalizer_instance = resolve_item_from_registry(
```
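
To make the resolution step concrete, here is a self-contained sketch of the default-type merge performed in `resolve_request_loader`: a dict without a `"type"` key has the configured default (`settings.dataset.default_column_mapper`, i.e. `"generative_column_mapper"`) merged in before the mapper is resolved from the registry. The helper name and the example column key are illustrative only:

```python
# Illustrative stand-in for the merge logic in resolve_request_loader.
DEFAULT_COLUMN_MAPPER = "generative_column_mapper"  # mirrors settings.dataset.default_column_mapper


def apply_default_mapper_type(data_column_mapper):
    """If no type is specified for a dict column mapper, merge in the default type."""
    if isinstance(data_column_mapper, dict) and "type" not in data_column_mapper:
        data_column_mapper = {"type": DEFAULT_COLUMN_MAPPER, **data_column_mapper}
    return data_column_mapper


print(apply_default_mapper_type({"text_column": "question"}))
# {'type': 'generative_column_mapper', 'text_column': 'question'}
print(apply_default_mapper_type("generative_column_mapper"))
# 'generative_column_mapper' -- non-dict values pass through unchanged
```

The mapper is then prepended via `[data_column_mapper] + data_preprocessors`, so column mapping always runs before the remaining preprocessors.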

src/guidellm/benchmark/schemas/generative/entrypoints.py

Lines changed: 19 additions & 8 deletions

```diff
@@ -202,11 +202,17 @@ def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any:
     data_samples: int = Field(
         default=-1, description="Number of samples to use from datasets (-1 for all)"
     )
-    # TODO: Make it easy to cutomize preprocessors without editing the full list
+    data_column_mapper: (
+        DatasetPreprocessor
+        | dict[str, str | list[str]]
+        | Literal["generative_column_mapper"]
+    ) = Field(
+        default_factory=lambda: settings.dataset.default_column_mapper,
+        description="Column mapping preprocessor for dataset fields",
+    )
     data_preprocessors: list[DatasetPreprocessor | dict[str, str | list[str]] | str] = (
         Field(
             default_factory=lambda: [  # type: ignore [arg-type]
-                "generative_column_mapper",
                 "encode_media",
             ],
             description="List of dataset preprocessors to apply in order",
@@ -335,18 +341,23 @@ def serialize_data_collator(
         """Serialize data_collator to string or None."""
         return data_collator if isinstance(data_collator, str) else None

+    @field_serializer("data_column_mapper")
+    def serialize_preprocessor(
+        self,
+        data_preprocessor: (DatasetPreprocessor | dict[str, str | list[str]] | str),
+    ) -> dict | str:
+        """Serialize a preprocessor to dict or string."""
+        return data_preprocessor if isinstance(data_preprocessor, dict | str) else {}
+
     @field_serializer("data_preprocessors")
-    def serialize_data_column_mapper(
+    def serialize_preprocessors(
         self,
         data_preprocessors: list[
             DatasetPreprocessor | dict[str, str | list[str]] | str
         ],
     ) -> list[dict | str]:
-        """Serialize data_column_mapper to dict or string."""
-        return [
-            (preprocessor if isinstance(preprocessor, dict | str) else {})
-            for preprocessor in data_preprocessors
-        ]
+        """Serialize each preprocessor to dict or string."""
+        return [self.serialize_preprocessor(p) for p in data_preprocessors]

     @field_serializer("data_finalizer")
     def serialize_data_request_formatter(
```
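
The serializer pair can be exercised in isolation. The sketch below uses a pared-down pydantic model rather than the real `BenchmarkGenerativeTextArgs`, so the field types and defaults are simplified stand-ins; only the serializer logic mirrors the diff:

```python
from pydantic import BaseModel, field_serializer


class ArgsSketch(BaseModel):
    """Pared-down stand-in for the serializer behaviour added to BenchmarkGenerativeTextArgs."""

    data_column_mapper: dict | str = "generative_column_mapper"
    data_preprocessors: list[dict | str] = ["encode_media"]

    @field_serializer("data_column_mapper")
    def serialize_preprocessor(self, data_preprocessor) -> dict | str:
        # dicts and registry-name strings pass through; anything else collapses to {}
        return data_preprocessor if isinstance(data_preprocessor, dict | str) else {}

    @field_serializer("data_preprocessors")
    def serialize_preprocessors(self, data_preprocessors) -> list[dict | str]:
        return [self.serialize_preprocessor(p) for p in data_preprocessors]


print(ArgsSketch().model_dump())
# {'data_column_mapper': 'generative_column_mapper', 'data_preprocessors': ['encode_media']}
```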

src/guidellm/settings.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -73,6 +73,9 @@ class DatasetSettings(BaseModel):
     preferred_data_splits: list[str] = Field(
         default_factory=lambda: ["test", "tst", "validation", "val", "train"]
     )
+    default_column_mapper: Literal["generative_column_mapper"] = (
+        "generative_column_mapper"
+    )


 class OpenAISettings(BaseModel):
```
