From be21f914f7b855fcaa0b7b293916f81fd963ceae Mon Sep 17 00:00:00 2001 From: zhenchaoni Date: Thu, 25 Jun 2026 14:01:06 +0800 Subject: [PATCH] Remove unused data code --- src/winml/modelkit/data/__init__.py | 18 ++- src/winml/modelkit/data/data_config.py | 28 ----- src/winml/modelkit/data/dummy_dataset.py | 38 ------ .../data/image_classification_dataset.py | 82 ------------- src/winml/modelkit/data/random_dataset.py | 108 ------------------ src/winml/modelkit/data/registry.py | 74 ------------ 6 files changed, 7 insertions(+), 341 deletions(-) delete mode 100644 src/winml/modelkit/data/data_config.py delete mode 100644 src/winml/modelkit/data/dummy_dataset.py delete mode 100644 src/winml/modelkit/data/image_classification_dataset.py delete mode 100644 src/winml/modelkit/data/random_dataset.py delete mode 100644 src/winml/modelkit/data/registry.py diff --git a/src/winml/modelkit/data/__init__.py b/src/winml/modelkit/data/__init__.py index 353823069..08f849635 100644 --- a/src/winml/modelkit/data/__init__.py +++ b/src/winml/modelkit/data/__init__.py @@ -2,15 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Data loading and preprocessing components for WinML CLI.""" +"""Package data for WinML CLI. -from . import ( - dummy_dataset, - image_classification_dataset, - random_dataset, -) -from .data_config import DataConfig -from .registry import DataRegistry - - -__all__ = ["DataConfig", "DataRegistry"] +This package ships the built-in model catalog (``hub_models.json``) consumed by +the ``catalog`` command and the ``serve`` HTTP API. It intentionally contains no +importable code so that resolving the package (e.g. via +``importlib.resources.files``) stays lightweight and free of heavy optional +dependencies. +""" diff --git a/src/winml/modelkit/data/data_config.py b/src/winml/modelkit/data/data_config.py deleted file mode 100644 index f023bcbcd..000000000 --- a/src/winml/modelkit/data/data_config.py +++ /dev/null @@ -1,28 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Configuration class for dataset loading and preprocessing.""" - -from typing import Any - - -class DataConfig: - """Simple configuration container for dataset loading and preprocessing.""" - - def __init__( - self, - load_dataset_config: dict[str, Any] | None = None, - pre_process_data_config: dict[str, Any] | None = None, - model_input: str | None = None, - ) -> None: - """Initialize DataConfig. - - Args: - load_dataset_config: Parameters for dataset loading - pre_process_data_config: Parameters for preprocessing - model_input: Path to model input file - """ - self.load_dataset_config = load_dataset_config or {} - self.pre_process_data_config = pre_process_data_config or {} - self.model_input = model_input diff --git a/src/winml/modelkit/data/dummy_dataset.py b/src/winml/modelkit/data/dummy_dataset.py deleted file mode 100644 index 8024879b4..000000000 --- a/src/winml/modelkit/data/dummy_dataset.py +++ /dev/null @@ -1,38 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Dummy dataset for testing with all-ones data.""" - -from __future__ import annotations - -import numpy as np - -from .random_dataset import RandomDataset -from .registry import DataRegistry - - -@DataRegistry.register_dataset() -class DummyDataset(RandomDataset): - """Dummy dataset that generates all-ones data for testing. - - Inherits from RandomDataset but overrides data generation to use - all ones instead of random values. Useful for deterministic testing - and debugging. - """ - - def _generate_data(self, shape: list[int], dtype: np.dtype) -> np.ndarray: - """Generate all-ones data for a given shape and dtype. - - Args: - shape: Shape of the tensor to generate - dtype: NumPy dtype of the tensor - - Returns: - NumPy array filled with ones (or True for boolean) - """ - if np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer): - return np.ones(shape, dtype=dtype) - if dtype == np.bool_: - return np.ones(shape, dtype=np.bool_) - return np.ones(shape).astype(np.float32) diff --git a/src/winml/modelkit/data/image_classification_dataset.py b/src/winml/modelkit/data/image_classification_dataset.py deleted file mode 100644 index b335babbc..000000000 --- a/src/winml/modelkit/data/image_classification_dataset.py +++ /dev/null @@ -1,82 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Image classification dataset for testing quantization.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from .registry import DataRegistry - - -if TYPE_CHECKING: - from .data_config import DataConfig - - -@DataRegistry.register_dataset() -class ImageClassificationDataset: - """Image classification dataset using Hugging Face datasets. - - Loads mini-imagenet dataset and preprocesses images for model input. - """ - - def __init__(self, config: DataConfig | None = None): - """Initialize dataset. - - Args: - config: Optional DataConfig with dataset_name, split, stream, and size settings. - Defaults to "timm/mini-imagenet", split="train", stream=True, size=256 - """ - load_dataset_config = config.load_dataset_config if config else {} - - dataset_name = load_dataset_config.get("dataset_name", "timm/mini-imagenet") - split = load_dataset_config.get("split", "train") - stream = load_dataset_config.get("stream", True) - size = load_dataset_config.get("size", 256) - - from datasets import load_dataset - from torchvision import transforms - - self.dataset = load_dataset(dataset_name, split=split, streaming=stream) - - # TODO: Image preprocessing is temporarily hardcoded; - # will integrate with Hugging Face data processor - self.preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - # Cache preprocessed samples (limited by size) - self.images = [] - self.labels = [] - for i, sample in enumerate(self.dataset): - if i >= size: - break - img = sample["image"] - # Convert grayscale to RGB if needed - if img.mode != "RGB": - img = img.convert("RGB") - tensor = self.preprocess(img).unsqueeze(0) - self.images.append(tensor.numpy()) - self.labels.append(0) # Placeholder label - - def __len__(self) -> int: - """Return dataset length.""" - return min(len(self.images), len(self.labels)) - - def __getitem__(self, idx: int) -> dict[str, Any]: - """Get item by index. - - Args: - idx: Index of the sample - - Returns: - Dict with pixel_values key containing numpy array - """ - return {"pixel_values": self.images[idx]} diff --git a/src/winml/modelkit/data/random_dataset.py b/src/winml/modelkit/data/random_dataset.py deleted file mode 100644 index 9c1b2a21b..000000000 --- a/src/winml/modelkit/data/random_dataset.py +++ /dev/null @@ -1,108 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Random dataset for calibration when no specific dataset is provided.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import numpy as np -import onnx - -from ..onnx import load_onnx -from .registry import DataRegistry - - -if TYPE_CHECKING: - from .data_config import DataConfig - - -@DataRegistry.register_dataset() -class RandomDataset: - """Random dataset that generates synthetic data for calibration. - - Uses model input shape and type information to generate appropriate - random data for quantization calibration when specific datasets - aren't available or specified. - """ - - def __init__(self, config: DataConfig) -> None: - """Initialize random dataset. - - Args: - config: DataConfig object containing model_input and load_dataset_config with: - - size (int, optional): Number of random samples to generate (default: 10) - """ - if config is None: - raise ValueError("DataConfig is required for RandomDataset") - - load_config = config.load_dataset_config - model_path = config.model_input - self.size = load_config.get("size", 10) - - if not model_path: - raise ValueError("model_input must be specified in DataConfig") - - self.model_path = model_path - self.samples: list[dict[str, np.ndarray]] = [] - self._load_model_and_generate() - - def _load_model_and_generate(self) -> None: - """Load ONNX model and generate random data based on input specifications.""" - try: - model = load_onnx(self.model_path, load_weights=False, validate=False) - inputs = model.graph.input - - for _ in range(self.size): - sample: dict[str, np.ndarray] = {} - for input_info in inputs: - name = input_info.name - shape = [dim.dim_value for dim in input_info.type.tensor_type.shape.dim] - dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[ - input_info.type.tensor_type.elem_type - ] - - # Handle dynamic dimensions (replace 0 with 1) - shape = [1 if dim == 0 else dim for dim in shape] - - sample[name] = self._generate_data(shape, dtype) - - self.samples.append(sample) - - except Exception as e: - raise RuntimeError(f"Failed to load model or generate random data: {e}") from e - - def _generate_data(self, shape: list[int], dtype: np.dtype) -> np.ndarray: - """Generate data for a given shape and dtype. - - Args: - shape: Shape of the tensor to generate - dtype: NumPy dtype of the tensor - - Returns: - NumPy array with generated data - """ - if np.issubdtype(dtype, np.floating): - return np.random.rand(*shape).astype(dtype) - if np.issubdtype(dtype, np.integer): - return np.random.randint(0, 100, size=shape, dtype=dtype) - if dtype == np.bool_: - return np.random.choice([False, True], size=shape) - return np.random.rand(*shape).astype(np.float32) - - def __len__(self) -> int: - """Return dataset length.""" - return len(self.samples) - - def __getitem__(self, idx: int) -> dict[str, Any]: - """Get item by index. - - Args: - idx: Index of the sample - - Returns: - Dict containing input tensors - """ - return self.samples[idx] diff --git a/src/winml/modelkit/data/registry.py b/src/winml/modelkit/data/registry.py deleted file mode 100644 index c6d605391..000000000 --- a/src/winml/modelkit/data/registry.py +++ /dev/null @@ -1,74 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Central registry for dataset class management.""" - -from collections.abc import Callable -from typing import ClassVar, TypeVar - - -T = TypeVar("T", bound=type) - - -class DataRegistry: - """Central registry for dataset class management. - - Enables dynamic dataset instantiation from configuration using a - decorator-based registration pattern. - """ - - _datasets: ClassVar[dict[str, type]] = {} - - @classmethod - def register_dataset(cls, name: str | None = None) -> Callable[[T], T]: - """Decorator to register a dataset class. - - Args: - name: Optional name to register the dataset under. If not provided, - uses the class's __name__ attribute. - - Returns: - Decorator function that registers the class and returns it unchanged - - Example: - @DataRegistry.register_dataset() - class ImageClassificationDataset(Dataset): - pass - - @DataRegistry.register_dataset("custom_name") - class MyDataset(Dataset): - pass - """ - - def decorator(dataset_class: T) -> T: - dataset_name = name or dataset_class.__name__ - cls._datasets[dataset_name] = dataset_class - return dataset_class - - return decorator - - @classmethod - def get_component(cls, name: str) -> type: - """Retrieve registered class or function by name. - - Args: - name: Name of the registered dataset class - - Returns: - The registered dataset class - - Raises: - ValueError: If the requested dataset name is not registered - - Example: - dataset_class = DataRegistry.get_component("ImageClassificationDataset") - dataset = dataset_class(config) - """ - if name not in cls._datasets: - available = ", ".join(cls._datasets.keys()) - raise ValueError( - f"Unknown dataset: '{name}'. " - f"Available datasets: {available if available else 'none'}" - ) - return cls._datasets[name]