
Commit bcf3ff1

Speedup in normalize
1 parent 38a76fa commit bcf3ff1

File tree

5 files changed: +233 -85 lines changed


.cursor/rules/optimizations.mdc

Lines changed: 64 additions & 0 deletions (new file)

# Performance Optimization Guidelines

## OpenCV LUT (Look-Up Table) Operations

### Critical: Maintain float32 dtype for LUT arrays

When using `cv2.LUT()` with floating-point lookup tables, **always ensure the LUT array is float32, not float64**. This can have a dramatic performance impact, especially on large arrays like videos.

#### The Problem

OpenCV's statistics functions (`cv2.meanStdDev`, etc.) return float64 values. When these are used in LUT creation:

```python
# BAD: Creates float64 LUT due to numpy promotion
mean, std = cv2.meanStdDev(img)  # Returns float64
lut = (np.arange(0, 256, dtype=np.float32) - mean[0, 0]) / std[0, 0]
# lut.dtype is now float64!
```

This causes:

1. `cv2.LUT()` returns a float64 array (slower operations)
2. Subsequent operations (clip, etc.) are slower on float64
3. Often requires `.astype(np.float32)` on the large result array (very expensive)

#### The Solution

Cast the LUT array to float32 after creation:

```python
# GOOD: Maintain float32 throughout
lut = ((np.arange(0, 256, dtype=np.float32) - mean[0, 0]) / std[0, 0]).astype(np.float32)
# lut.dtype is float32
```

#### Performance Impact

For a video of shape (200, 256, 256, 3):

- With float64 LUT: ~111ms (includes expensive astype on result)
- With float32 LUT: ~55ms (2x faster!)
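The comparison above can be reproduced with a rough timing sketch like the one below (a sketch only: numbers vary by hardware and OpenCV build, the frame axis is folded into the row axis so `cv2.LUT` sees an ordinary 3-channel uint8 image, and the float64 table is built explicitly to simulate the accidental promotion):

```python
import time

import cv2
import numpy as np

video = np.random.randint(0, 256, (200, 256, 256, 3), dtype=np.uint8)
img = video.reshape(-1, 256, 3)  # (200 * 256, 256, 3)

mean, std = cv2.meanStdDev(img)
ramp = np.arange(0, 256, dtype=np.float32)
lut64 = ((ramp - mean[0, 0]) / std[0, 0]).astype(np.float64)  # the accidentally promoted table
lut32 = ((ramp - mean[0, 0]) / std[0, 0]).astype(np.float32)  # the fixed table

start = time.perf_counter()
slow = cv2.LUT(img, lut64).clip(-20, 20).astype(np.float32)  # float64 result + expensive cast
t64 = time.perf_counter() - start

start = time.perf_counter()
fast = cv2.LUT(img, lut32).clip(-20, 20)  # stays float32 end to end
t32 = time.perf_counter() - start

print(f"float64 LUT: {t64 * 1000:.1f} ms, float32 LUT: {t32 * 1000:.1f} ms")
```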
#### Best Practices

1. **For uint8 images**: LUT operations are extremely fast and should be preferred when possible
2. **Always check dtype**: Use `.astype(np.float32)` on small LUT arrays (256 elements) rather than large result arrays
3. **Avoid dtype promotion**: Be aware that numpy operations with mixed dtypes promote to the higher precision type
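The promotion in item 3 can be demonstrated directly with a tiny sketch (made-up per-channel statistics; only the small 256-row table is cast, never the full-size result):

```python
import numpy as np

arange_vals = np.arange(0, 256, dtype=np.float32)      # float32 ramp
mean = np.array([120.3, 115.9, 110.1])                 # float64, e.g. from cv2.meanStdDev
denominator = np.array([0.017, 0.018, 0.019])          # float64

luts = (arange_vals[:, np.newaxis] - mean) * denominator
print(luts.dtype)   # float64 -- mixed dtypes promoted the whole table

luts = luts.astype(np.float32)                          # cast the small (256, 3) table instead
print(luts.dtype)   # float32
```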
#### Example: Image Normalization with LUT

```python
def normalize_with_lut(img: np.ndarray) -> np.ndarray:
    """Fast normalization for uint8 images using LUT"""
    # Get statistics
    mean, std = cv2.meanStdDev(img)
    mean = mean[0, 0]
    std = std[0, 0] + 1e-4

    # Create LUT - ensure float32!
    lut = ((np.arange(0, 256, dtype=np.float32) - mean) / std).astype(np.float32)

    # Apply LUT - result will be float32
    return cv2.LUT(img, lut).clip(-20, 20)
```

This optimization applies to any LUT-based operation where floating-point precision is needed.
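A quick sanity check of the helper above (a sketch; assumes `cv2` and `numpy as np` are already imported):

```python
img = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
out = normalize_with_lut(img)
print(out.dtype, out.shape)  # float32 (256, 256, 3)
```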

albucore/functions.py

Lines changed: 154 additions & 74 deletions

@@ -276,8 +276,13 @@ def add(img: np.ndarray, value: ValueType, inplace: bool = False) -> np.ndarray:

 def normalize_numpy(img: np.ndarray, mean: float | np.ndarray, denominator: float | np.ndarray) -> np.ndarray:
     img = img.astype(np.float32, copy=False)
+    # Ensure mean and denominator are float32 to avoid dtype promotion
+    if isinstance(mean, np.ndarray):
+        mean = mean.astype(np.float32, copy=False)
+    if isinstance(denominator, np.ndarray):
+        denominator = denominator.astype(np.float32, copy=False)
     img -= mean
-    return img * denominator
+    return (img * denominator).astype(np.float32, copy=False)


 @preserve_channel_dim
@@ -286,12 +291,6 @@ def normalize_opencv(img: np.ndarray, mean: float | np.ndarray, denominator: flo
     mean_img = np.zeros_like(img, dtype=np.float32)
     denominator_img = np.zeros_like(img, dtype=np.float32)

-    # If mean or denominator are scalar, convert them to arrays
-    if isinstance(mean, (float, int)):
-        mean = np.full(img.shape, mean, dtype=np.float32)
-    if isinstance(denominator, (float, int)):
-        denominator = np.full(img.shape, denominator, dtype=np.float32)
-
     # Ensure the shapes match for broadcasting
     mean_img = (mean_img + mean).astype(np.float32, copy=False)
     denominator_img = denominator_img + denominator
@@ -307,27 +306,39 @@ def normalize_lut(img: np.ndarray, mean: float | np.ndarray, denominator: float
     num_channels = get_num_channels(img)

     if isinstance(denominator, (float, int)) and isinstance(mean, (float, int)):
-        lut = (np.arange(0, max_value + 1, dtype=np.float32) - mean) * denominator
+        lut = ((np.arange(0, max_value + 1, dtype=np.float32) - mean) * denominator).astype(np.float32)
         return cv2.LUT(img, lut)

-    if isinstance(denominator, np.ndarray) and denominator.shape != ():
-        denominator = denominator.reshape(-1, 1)
-
+    # Convert to float32 if needed
     if isinstance(mean, np.ndarray):
-        mean = mean.reshape(-1, 1)
+        mean = mean.astype(np.float32, copy=False)
+    if isinstance(denominator, np.ndarray):
+        denominator = denominator.astype(np.float32, copy=False)
+
+    # Vectorized LUT creation - shape: (256, num_channels)
+    arange_vals = np.arange(0, max_value + 1, dtype=np.float32)
+    luts = (arange_vals[:, np.newaxis] - mean) * denominator

-    luts = (np.arange(0, max_value + 1, dtype=np.float32) - mean) * denominator
+    # Pre-allocate result array
+    result = np.empty_like(img, dtype=np.float32)
+    for i in range(num_channels):
+        result[..., i] = cv2.LUT(img[..., i], luts[:, i])

-    return np.stack([cv2.LUT(img[..., i], luts[i]) for i in range(num_channels)], axis=-1)
+    return result


 def normalize(img: np.ndarray, mean: ValueType, denominator: ValueType) -> np.ndarray:
     num_channels = get_num_channels(img)
     denominator = convert_value(denominator, num_channels)
     mean = convert_value(mean, num_channels)
+
     if img.dtype == np.uint8:
         return normalize_lut(img, mean, denominator)

+    if img.dtype == np.float32:
+        return normalize_numpy(img, mean, denominator)
+
+    # Fallback to OpenCV for other dtypes
     return normalize_opencv(img, mean, denominator)


@@ -474,6 +485,66 @@ def multiply_add(img: np.ndarray, factor: ValueType, value: ValueType, inplace:
     return multiply_add_opencv(img, factor, value)


+def _compute_image_stats_opencv(img: np.ndarray) -> tuple[float, float]:
+    """Compute global mean and std for an image."""
+    eps = 1e-4
+    if img.ndim > 3:
+        # For 4D/5D arrays (video/volume), OpenCV returns global mean/std directly
+        mean, std = cv2.meanStdDev(img)
+        return float(mean[0, 0]), float(std[0, 0]) + eps
+    # For 3D images, use numpy for accurate global statistics
+    return float(img.mean()), float(img.std()) + eps
+
+
+def _compute_per_channel_stats_opencv(img: np.ndarray, spatial_axes: tuple[int, ...]) -> tuple[np.ndarray, np.ndarray]:
+    """Compute per-channel mean and std."""
+    eps = 1e-4
+    if img.ndim > 3:
+        # For 4D/5D arrays, compute per-channel statistics using numpy
+        mean = img.mean(axis=spatial_axes)
+        std = img.std(axis=spatial_axes) + eps
+    else:
+        # For 3D arrays, use OpenCV
+        mean, std = cv2.meanStdDev(img)
+        mean = mean[:, 0]
+        std = std[:, 0] + eps
+    return mean, std
+
+
+def _normalize_mean_std_opencv(img_f: np.ndarray, mean: float | np.ndarray, std: float | np.ndarray) -> np.ndarray:
+    """Apply mean-std normalization using OpenCV or NumPy based on dimensionality."""
+    if img_f.ndim > 3:
+        # Use NumPy operations for 4D/5D (faster)
+        normalized_img = (img_f - mean) / std
+    else:
+        # Use OpenCV for 3D
+        if img_f.shape[-1] > MAX_OPENCV_WORKING_CHANNELS:
+            mean = np.full_like(img_f, mean)
+            std = np.full_like(img_f, std)
+        normalized_img = cv2.divide(cv2.subtract(img_f, mean, dtype=cv2.CV_32F), std, dtype=cv2.CV_32F)
+    return np.clip(normalized_img, -20, 20, out=normalized_img)
+
+
+def _normalize_min_max_per_channel_opencv(img: np.ndarray, spatial_axes: tuple[int, ...]) -> np.ndarray:
+    """Apply per-channel min-max normalization."""
+    eps = 1e-4
+
+    img_min = img.min(axis=spatial_axes)
+    img_max = img.max(axis=spatial_axes)
+
+    if img.shape[-1] > MAX_OPENCV_WORKING_CHANNELS:
+        img_min = np.full_like(img, img_min)
+        img_max = np.full_like(img, img_max)
+
+    # Use NumPy operations for 4D/5D (faster), OpenCV for 3D
+    if img.ndim > 3:
+        normalized_img = (img - img_min) / (img_max - img_min + eps)
+    else:
+        normalized_img = cv2.divide(cv2.subtract(img, img_min), (img_max - img_min + eps), dtype=cv2.CV_32F)
+
+    return np.clip(normalized_img, -20, 20, out=normalized_img)
+
+
 @preserve_channel_dim
 def normalize_per_image_opencv(
     img: np.ndarray,
@@ -508,47 +579,27 @@ def normalize_per_image_opencv(
         - For images with >4 channels, falls back to array operations as OpenCV has limitations
         - Single channel images treated as "image" normalization when "image_per_channel" is specified
     """
-    img = img.astype(np.float32, copy=False)
-    eps = 1e-4
+    # Handle single-channel edge case
+    if img.shape[-1] == 1 and normalization == "image_per_channel":
+        normalization = "image"
+    if img.shape[-1] == 1 and normalization == "min_max_per_channel":
+        normalization = "min_max"

-    if normalization == "image" or (img.shape[-1] == 1 and normalization == "image_per_channel"):
-        mean = img.mean().item()
-        std = img.std().item() + eps
-        if img.shape[-1] > MAX_OPENCV_WORKING_CHANNELS:
-            mean = np.full_like(img, mean)
-            std = np.full_like(img, std)
-        normalized_img = cv2.divide(cv2.subtract(img, mean), std)
-        return np.clip(normalized_img, -20, 20, out=normalized_img)
+    if normalization == "image":
+        mean, std = _compute_image_stats_opencv(img)
+        img_f = img.astype(np.float32, copy=False)
+        return _normalize_mean_std_opencv(img_f, mean, std)

     if normalization == "image_per_channel":
-        mean, std = cv2.meanStdDev(img)
-        mean = mean[:, 0]
-        std = std[:, 0]
-
-        if img.shape[-1] > MAX_OPENCV_WORKING_CHANNELS:
-            mean = np.full_like(img, mean)
-            std = np.full_like(img, std)
+        mean, std = _compute_per_channel_stats_opencv(img, spatial_axes)
+        img_f = img.astype(np.float32, copy=False)
+        return _normalize_mean_std_opencv(img_f, mean, std)

-        normalized_img = cv2.divide(cv2.subtract(img, mean), std, dtype=cv2.CV_32F)
-        return np.clip(normalized_img, -20, 20, out=normalized_img)
-
-    if normalization == "min_max" or (img.shape[-1] == 1 and normalization == "min_max_per_channel"):
+    if normalization == "min_max":
         return cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

     if normalization == "min_max_per_channel":
-        img_min = img.min(axis=spatial_axes)
-        img_max = img.max(axis=spatial_axes)
-
-        if img.shape[-1] > MAX_OPENCV_WORKING_CHANNELS:
-            img_min = np.full_like(img, img_min)
-            img_max = np.full_like(img, img_max)
-
-        return np.clip(
-            cv2.divide(cv2.subtract(img, img_min), (img_max - img_min + eps), dtype=cv2.CV_32F),
-            -20,
-            20,
-            out=img,
-        )
+        return _normalize_min_max_per_channel_opencv(img, spatial_axes)

     raise ValueError(f"Unknown normalization method: {normalization}")

@@ -656,33 +707,52 @@ def normalize_per_image_lut(
     num_channels = get_num_channels(img)

     if normalization == "image" or (img.shape[-1] == 1 and normalization == "image_per_channel"):
-        mean = img.mean()
-        std = img.std() + eps
-        lut = (np.arange(0, max_value + 1, dtype=np.float32) - mean) / std
-        return cv2.LUT(img, lut).clip(-20, 20).astype(np.float32)
+        if img.ndim > 3:
+            # For 4D/5D arrays (video/volume), OpenCV returns global mean/std directly
+            mean, std = cv2.meanStdDev(img)
+            mean = mean[0, 0]
+            std = std[0, 0] + eps
+        else:
+            # For 3D images, use numpy for accurate global statistics
+            mean = img.mean()
+            std = img.std() + eps
+
+        lut = ((np.arange(0, max_value + 1, dtype=np.float32) - mean) / std).astype(np.float32)
+        return cv2.LUT(img, lut).clip(-20, 20)

     if normalization == "image_per_channel":
-        pixel_mean = img.mean(axis=spatial_axes)
-        pixel_std = img.std(axis=spatial_axes) + eps
-        luts = [
-            (np.arange(0, max_value + 1, dtype=np.float32) - pixel_mean[c]) / pixel_std[c] for c in range(num_channels)
-        ]
-        return np.stack([cv2.LUT(img[..., i], luts[i]).clip(-20, 20) for i in range(num_channels)], axis=-1)
+        pixel_mean = img.mean(axis=spatial_axes).astype(np.float32)
+        pixel_std = img.std(axis=spatial_axes).astype(np.float32) + np.float32(eps)
+
+        # Create all LUTs at once using vectorized operations
+        arange_vals = np.arange(0, max_value + 1, dtype=np.float32)
+        # LUTs shape will be (256, num_channels)
+        luts = (arange_vals[:, np.newaxis] - pixel_mean) / pixel_std
+
+        result = np.empty_like(img, dtype=np.float32)
+        for i in range(num_channels):
+            result[..., i] = cv2.LUT(img[..., i], luts[:, i])
+        return result.clip(-20, 20)

     if normalization == "min_max" or (img.shape[-1] == 1 and normalization == "min_max_per_channel"):
         img_min = img.min()
         img_max = img.max()
-        lut = (np.arange(0, max_value + 1, dtype=np.float32) - img_min) / (img_max - img_min + eps)
-        return cv2.LUT(img, lut).clip(-20, 20).astype(np.float32)
+        lut = ((np.arange(0, max_value + 1, dtype=np.float32) - img_min) / (img_max - img_min + eps)).astype(np.float32)
+        return cv2.LUT(img, lut).clip(-20, 20)

     if normalization == "min_max_per_channel":
         img_min = img.min(axis=spatial_axes)
         img_max = img.max(axis=spatial_axes)
-        luts = [
-            (np.arange(0, max_value + 1, dtype=np.float32) - img_min[c]) / (img_max[c] - img_min[c] + eps)
-            for c in range(num_channels)
-        ]
-        return np.stack([cv2.LUT(img[..., i], luts[i]) for i in range(num_channels)], axis=-1).astype(np.float32)
+
+        # Create all LUTs at once using vectorized operations
+        arange_vals = np.arange(0, max_value + 1, dtype=np.float32)
+        # LUTs shape will be (256, num_channels)
+        luts = ((arange_vals[:, np.newaxis] - img_min) / (img_max - img_min + eps)).astype(np.float32)
+
+        result = np.empty_like(img, dtype=np.float32)
+        for i in range(num_channels):
+            result[..., i] = cv2.LUT(img[..., i], luts[:, i])
+        return result.clip(-20, 20)

     raise ValueError(f"Unknown normalization method: {normalization}")

@@ -705,8 +775,8 @@ def normalize_per_image(img: np.ndarray, normalization: NormalizationType) -> np
         Normalized image as float32 array with values clipped to [-20, 20] range.

     Notes:
-        - For uint8 images (except "image_per_channel"), uses LUT method for maximum speed
-        - For other dtypes, uses OpenCV implementation for good performance
+        - For uint8 images (except "min_max"), uses LUT method for maximum speed
+        - For other dtypes, uses OpenCV or NumPy implementation for good performance
         - Automatically determines spatial axes based on input dimensions
     """
     # Determine spatial axes based on input dimensions
@@ -719,17 +789,27 @@ def normalize_per_image(img: np.ndarray, normalization: NormalizationType) -> np
     else:
         raise ValueError(f"Unsupported image dimensions: {img.ndim}. Expected 3, 4, or 5 dimensions.")

-    if img.dtype == np.uint8 and (
-        (normalization != "image_per_channel" and img.ndim == 3)
-        or (normalization == "min_max_per_channel" and img.ndim > 3)
-        or (normalization == "image_per_channel" and img.ndim > 3)
-    ):
+    # Optimized routing based on benchmarks
+
+    # Route uint8 images
+    if img.dtype == np.uint8:
+        # Use LUT for everything except min_max (where OpenCV is 3x faster)
+        if normalization == "min_max":
+            return normalize_per_image_opencv(img, normalization, spatial_axes)
+        # LUT is fastest for "image", "image_per_channel", and "min_max_per_channel"
        return normalize_per_image_lut(img, normalization, spatial_axes)

-    # For ndim > 3, use numpy implementation as OpenCV doesn't handle batch dimensions well
+    # Route float32 images
+    if img.dtype == np.float32:
+        if normalization == "image":
+            # NumPy is 1.5x faster for "image" normalization
+            return normalize_per_image_numpy(img, normalization, spatial_axes)
+        # OpenCV is fastest or equal for all other normalizations
+        return normalize_per_image_opencv(img, normalization, spatial_axes)
+
+    # Default fallback: OpenCV for single images, NumPy for videos/volumes
     if img.ndim > 3:
         return normalize_per_image_numpy(img, normalization, spatial_axes)
-
     return normalize_per_image_opencv(img, normalization, spatial_axes)
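For reference, the new dtype-based routing in `normalize_per_image` can be exercised with a small sketch (hypothetical shapes; the function lives in `albucore/functions.py` as shown above):

```python
import numpy as np

from albucore.functions import normalize_per_image

u8 = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
f32 = np.random.rand(256, 256, 3).astype(np.float32)

out_u8 = normalize_per_image(u8, "image_per_channel")  # uint8 -> LUT path
out_f32 = normalize_per_image(f32, "image")            # float32 + "image" -> NumPy path
print(out_u8.dtype, out_f32.dtype)                      # float32 float32
```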
pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ requires = [ "setuptools>=45", "wheel" ]

 [project]
 name = "albucore"
-version = "0.0.30"
+version = "0.0.31"

 description = "High-performance image processing functions for deep learning and computer vision."
 readme = "README.md"
