Skip to content

Commit 8dbc7f5

Browse files
committed
Re-introduce column-mapper arguments
Signed-off-by: Samuel Monson <[email protected]>
1 parent 167d84a commit 8dbc7f5

File tree

4 files changed

+42
-9
lines changed

4 files changed

+42
-9
lines changed

src/guidellm/__main__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,12 @@ def benchmark():
216216
"and dynamically generates more."
217217
),
218218
)
219+
@click.option(
220+
"--data-column-mapper",
221+
default=BenchmarkGenerativeTextArgs.get_default("data_column_mapper"),
222+
callback=cli_tools.parse_json,
223+
help="JSON string of column mappings to apply to the dataset.",
224+
)
219225
@click.option(
220226
"--data-preprocessors",
221227
default=BenchmarkGenerativeTextArgs.get_default("data_preprocessors"),

src/guidellm/benchmark/entrypoints.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from transformers import PreTrainedTokenizerBase
2020
from typing_extensions import TypeAliasType
2121

22+
from guidellm import settings
2223
from guidellm.backends import Backend, BackendType
2324
from guidellm.benchmark.benchmarker import Benchmarker
2425
from guidellm.benchmark.outputs import (
@@ -236,6 +237,11 @@ async def resolve_request_loader(
236237
data_samples: int,
237238
processor: ProcessorInputT | None,
238239
processor_args: dict[str, Any] | None,
240+
data_column_mapper: (
241+
DatasetPreprocessor
242+
| dict[str, str | list[str]]
243+
| Literal["generative_column_mapper"]
244+
),
239245
data_preprocessors: list[DatasetPreprocessor | dict[str, str | list[str]] | str],
240246
data_finalizer: (DatasetFinalizer | dict[str, Any] | str),
241247
data_collator: Callable | Literal["generative"] | None,
@@ -279,13 +285,20 @@ async def resolve_request_loader(
279285
else None
280286
)
281287

288+
# If the column mapper is a dict without a "type" key, fill in the default mapper type from settings
289+
if isinstance(data_column_mapper, dict) and "type" not in data_column_mapper:
290+
data_column_mapper = {
291+
"type": settings.dataset.default_column_mapper,
292+
**data_column_mapper,
293+
}
294+
282295
preprocessors_list: list[DatasetPreprocessor] = [
283296
resolve_item_from_registry(
284297
DatasetPreprocessor, # type: ignore [type-abstract]
285298
PreprocessorRegistry,
286299
preprocessor,
287300
)
288-
for preprocessor in data_preprocessors
301+
for preprocessor in ([data_column_mapper] + data_preprocessors)
289302
]
290303

291304
finalizer_instance = resolve_item_from_registry(

src/guidellm/benchmark/schemas/generative/entrypoints.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,11 +202,17 @@ def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any:
202202
data_samples: int = Field(
203203
default=-1, description="Number of samples to use from datasets (-1 for all)"
204204
)
205-
# TODO: Make it easy to cutomize preprocessors without editing the full list
205+
data_column_mapper: (
206+
DatasetPreprocessor
207+
| dict[str, str | list[str]]
208+
| Literal["generative_column_mapper"]
209+
) = Field(
210+
default_factory=lambda: settings.dataset.default_column_mapper,
211+
description="Column mapping preprocessor for dataset fields",
212+
)
206213
data_preprocessors: list[DatasetPreprocessor | dict[str, str | list[str]] | str] = (
207214
Field(
208215
default_factory=lambda: [ # type: ignore [arg-type]
209-
"generative_column_mapper",
210216
"encode_media",
211217
],
212218
description="List of dataset preprocessors to apply in order",
@@ -327,18 +333,23 @@ def serialize_data_collator(
327333
"""Serialize data_collator to string or None."""
328334
return data_collator if isinstance(data_collator, str) else None
329335

336+
@field_serializer("data_column_mapper")
337+
def serialize_preprocessor(
338+
self,
339+
data_preprocessor: (DatasetPreprocessor | dict[str, str | list[str]] | str),
340+
) -> dict | str:
341+
"""Serialize a preprocessor to dict or string."""
342+
return data_preprocessor if isinstance(data_preprocessor, dict | str) else {}
343+
330344
@field_serializer("data_preprocessors")
331-
def serialize_data_column_mapper(
345+
def serialize_preprocessors(
332346
self,
333347
data_preprocessors: list[
334348
DatasetPreprocessor | dict[str, str | list[str]] | str
335349
],
336350
) -> list[dict | str]:
337-
"""Serialize data_column_mapper to dict or string."""
338-
return [
339-
(preprocessor if isinstance(preprocessor, dict | str) else {})
340-
for preprocessor in data_preprocessors
341-
]
351+
"""Serialize each preprocessor to dict or string."""
352+
return [self.serialize_preprocessor(p) for p in data_preprocessors]
342353

343354
@field_serializer("data_finalizer")
344355
def serialize_data_request_formatter(

src/guidellm/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ class DatasetSettings(BaseModel):
7373
preferred_data_splits: list[str] = Field(
7474
default_factory=lambda: ["test", "tst", "validation", "val", "train"]
7575
)
76+
default_column_mapper: Literal["generative_column_mapper"] = (
77+
"generative_column_mapper"
78+
)
7679

7780

7881
class OpenAISettings(BaseModel):

0 commit comments

Comments
 (0)